From 8c02326132141581b5ae4003450f9d73e84f0495 Mon Sep 17 00:00:00 2001
From: scampion <scampion@durex.irisa.fr>
Date: Tue, 11 Jan 2011 16:26:18 +0100
Subject: [PATCH 1/7] Self-Organizing Map algorithm added with Calinski cluster
 quality measure

---
 examples/cluster/som_digits.py          |  76 +++++++++++
 scikits/learn/cluster/som_.py           | 171 ++++++++++++++++++++++++
 scikits/learn/cluster/tests/test_som.py |  43 ++++++
 3 files changed, 290 insertions(+)
 create mode 100644 examples/cluster/som_digits.py
 create mode 100644 scikits/learn/cluster/som_.py
 create mode 100644 scikits/learn/cluster/tests/test_som.py

diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py
new file mode 100644
index 0000000000000..54f162b76a3c1
--- /dev/null
+++ b/examples/cluster/som_digits.py
@@ -0,0 +1,76 @@
+"""
+===========================================================
+A demo of Self-Organising Map and KMeans on the handwritten 
+digits data
+===========================================================
+"""
+from __future__ import division
+print __doc__
+
+from time import time
+import numpy as np
+
+from scikits.learn.cluster import KMeans
+from scikits.learn.cluster import SOM
+from scikits.learn.cluster import calinski_index
+
+from scikits.learn.datasets import load_digits
+from scikits.learn.pca import PCA
+from scikits.learn.preprocessing import scale
+
+def display(labels,digits,nbclusters):
+    r = {0:[],1:[],2:[],3:[],4:[],5:[],6:[],7:[],8:[],9:[]} 
+    for i,v in enumerate(labels):
+        r[digits.target[i]].append(v)
+
+    for k,v in r.items(): 
+        s = set(v)
+        print 'target %i | nb cluster %i |' % (k,len(s)),s
+
+np.random.seed(42)
+
+digits = load_digits()
+data = scale(digits.data)
+
+n_samples, n_features = data.shape
+n_digits = len(np.unique(digits.target))
+
+print "n_digits: %d" % n_digits
+print "n_features: %d" % n_features
+print "n_samples: %d" % n_samples
+print
+
+print "Self-Organizing Map "
+t0 = time()
+grid_width = 4
+som = SOM(w=grid_width,n_init=n_samples*5,learning_rate=1)
+som.fit(data)
+print "done in %0.3fs" % (time() - t0)
+print
+
+display(som.labels_,digits,grid_width**2)
+C = calinski_index(data,som.labels_,som.neurons_)
+print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C)))
+print
+
+print '*'*80
+
+print "KMeans "
+t0 = time()
+#km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data)
+km = KMeans(init='k-means++', k=grid_width**2, n_init=10)
+km.fit(data)
+print "done in %0.3fs" % (time() - t0)
+print 
+
+display(km.labels_,digits,n_digits)
+C = calinski_index(data,km.labels_,km.cluster_centers_)
+print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C)))
+
+
+
+
+
+
+
+
diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py
new file mode 100644
index 0000000000000..90a43dc10d70b
--- /dev/null
+++ b/scikits/learn/cluster/som_.py
@@ -0,0 +1,171 @@
+"""
+ Self-organizing map 
+"""
+# Authors: Sebastien Campion <sebastien.campion@inria.fr>
+# License: BSD
+from __future__ import division
+import warnings
+from ..base import BaseEstimator
+import numpy as np
+import math 
+import Image 
+
+################################################################################
+def save_rgb(neurons,ofile,thumb_size=32):
+    '''Function to save map using 3 dim, as RGB'''    
+    assert neurons.shape[-1] == 3 
+    tsize = (thumb_size,thumb_size) 
+    size  = tuple([v * thumb_size for v in neurons.shape[0:2] ])
+    im  = Image.new('RGB',size)
+    for x in range(neurons.shape[0]):
+        for y in range(neurons.shape[1]):
+            color = tuple([int(c) for c in neurons[x][y]])
+            t = Image.new('RGB',tsize,color)
+            im.paste(t,(x*thumb_size,y*thumb_size))            
+    im.save(ofile)
+
+################################################################################
+class Kohonen2DMap():
+    def __init__(self,size,dim,neurons=None):
+        #self.log = logging.getLogger('kohonen.map')
+        self.dim = dim 
+        self.size = size 
+        self.neurons = neurons
+        if neurons == None  :
+            self.neurons = np.random.rand(size,size,dim)#/10
+        self.iteration = 0 
+        
+    def bmu(self,data):
+        assert data.shape[0] == self.neurons.shape[-1] 
+        data = np.resize(data,self.neurons.shape) 
+        dists = np.sum((data-self.neurons)** 2,axis=-1)
+        min = dists.argmin()
+        #w = np.unravel_index(min,dists.shape)
+        return divmod(min,self.size)
+        
+    def learn(self,datas,nbiter,learning_rate=1,callback=None):
+        '''Given an sample of datas, we randomly choose one of them for each 
+        iteration.
+        A good ratio, nb datas = 2 or 3 x nbiter'''
+        self.iteration = 0   
+        indices = np.random.random_integers(0,len(datas)-1,nbiter)
+        for i in indices: 
+            l = nbiter/self.size
+            lr = learning_rate * math.exp(-self.iteration/l)
+            self._learn_vector(datas[i], nbiter, lr)
+            self.iteration += 1 
+            if callback != None:
+                callback(self,self.iteration)
+
+    def _learn_vector(self, data, nbiter, lr):
+        w = self.bmu(data)
+        radius = self.radius_of_the_neighbordhood(nbiter)
+        for n in self.neurons_in_radius(w,radius):
+            nx,ny = n
+            wt = self.neurons[nx][ny]
+            dr = self.dist(w,n,radius)
+            self.neurons[nx][ny] = wt + dr*lr*(data-wt)
+            #self.log.debug(('nod',n,'l_rate',lr,'d_radius',dr))
+        #self.log.debug(('bmu',w,'iter',self.iteration,'radius',radius))
+    
+    def dist(self,w,n,radius):
+        wx,wy = w
+        nx,ny = n
+        d = (wx-nx)**2 + (wy-ny)**2
+        #offcial paper implementation : return math.exp(-d/2*radius**2)
+        return math.exp(-d/radius)
+    
+    def neurons_in_radius(self,w,radius):
+        wi,wj = w 
+        r = []
+        for i in range(self.neurons.shape[0]):
+            for j in range(self.neurons.shape[1]):
+                if math.sqrt((i-wi)**2 + (j-wj)**2) < radius:
+                    r.append((i,j))
+        return r
+        
+    def radius_of_the_neighbordhood(self,nbiter):
+        l = nbiter/self.size
+        return self.size * math.exp(-self.iteration/l)
+    
+
+################################################################################
+
+class SOM(BaseEstimator):
+    """ Self-Organizing Map
+
+    Parameters
+    ----------
+
+    data : ndarray
+        A M by N array of M observations in N dimensions or a length
+        M array of M one-dimensional observations.
+
+    w : int
+        Width and height of the square mape as well as the number of
+        centroids to generate. If init initialization string is
+        'matrix', or if a ndarray is given instead, it is
+        interpreted as initial cluster to use instead.
+
+    n_iter : int
+        Number of iterations of the som algrithm to run
+
+    learning_rate : float
+        Learning rate
+
+    init : {'random', 'matrix'}
+        Method for initialization, defaults to 'random':
+
+        'random': randomly points choosed
+
+        'matrix': interpret the w parameter as a w by M array
+         of initial centroids.
+
+    Methods
+    -------
+
+    fit(X):
+        Compute SOM
+
+    Attributes
+    ----------
+
+    neurons_: array, [(x,y), n_features]
+        Coordinates of neurons and value
+
+    labels_:
+        Labels of each point
+
+    Notes
+    ------
+
+    """
+
+    def __init__(self, w=16, init='random', n_init=64,learning_rate=1):        
+        self.w = w
+        self.init = init
+        self.n_init = n_init
+        self.learning_rate = learning_rate
+        self.callback = None
+
+    def fit(self, X, **params):
+        """ Compute som"""
+        X = np.asanyarray(X)
+        self._set_params(**params)
+
+        neurons = None 
+        dim = X.shape[-1]
+        
+        if self.init == 'matrix':
+            assert len(self.w.shape) == 3 
+            neurons = self.w 
+            self.w = neurons.shape[0]
+
+        
+        km = Kohonen2DMap(self.w,dim,neurons)
+        km.learn(X,self.n_init,self.learning_rate,callback=self.callback)
+        self.neurons_ = km.neurons
+        self.labels_ = [km.bmu(x) for x in X]
+        
+        return self
+ 
diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py
new file mode 100644
index 0000000000000..2c697f170a727
--- /dev/null
+++ b/scikits/learn/cluster/tests/test_som.py
@@ -0,0 +1,43 @@
+"""
+Testing for SOM 
+
+"""
+import Image
+import numpy as np
+from numpy.testing import assert_equal
+
+from ..som_ import SOM,save_rgb
+from .common import generate_clustered_data
+
+n_clusters = 4
+X = generate_clustered_data(n_clusters=n_clusters, std=.1)
+
+def test_som():
+    np.random.seed(1)
+    som = SOM().fit(X,w=4,n_init=32,learning_rate=0.4) 
+    labels = som.labels_
+
+    assert_equal(np.unique(labels).shape[0],4)
+    assert_equal(np.unique(labels[:20]).shape[0], 1)
+    assert_equal(np.unique(labels[20:40]).shape[0], 1)
+    assert_equal(np.unique(labels[40:60]).shape[0], 1)
+    assert_equal(np.unique(labels[60:]).shape[0], 1)
+
+def test_color_map():    
+    train = np.array([[0,0,0],       #black
+                      [255,255,255], #white
+                      [255,0,0],     #red
+                      [0,255,0],     #green
+                      [0,0,255],     #blue
+                      [255,255,0],   #yellow
+                      [0,255,255],   #cyan
+                      [255,0,255]    #magenta
+                      ])
+    w = np.random.rand(16,16,3)*255
+    som = SOM(w,n_init=1024,init='matrix',learning_rate=1)
+    save_rgb(w,'init.jpg')
+    som = SOM(w,n_init=1024,init='matrix',learning_rate=1)
+    som.fit(train)
+    save_rgb(som.neurons_,'color_map.jpg')
+
+

From 5e79af60f6b7d57cf648a10bc7d2fd46e068652d Mon Sep 17 00:00:00 2001
From: scampion <scampion@durex.irisa.fr>
Date: Wed, 12 Jan 2011 17:58:10 +0100
Subject: [PATCH 2/7] merge SOM and Kohonen class, improve variable name, add
 an example using color and calinski function to measure clustering quality

---
 examples/cluster/plot_som_colormap.py | 53 +++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 examples/cluster/plot_som_colormap.py

diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py
new file mode 100644
index 0000000000000..ce7090039c2c3
--- /dev/null
+++ b/examples/cluster/plot_som_colormap.py
@@ -0,0 +1,53 @@
+"""
+===========================================================
+A demo of SelfOrganisingMap with colored neurons 
+===========================================================
+
+"""
+print __doc__
+
+from time import time
+import numpy as np
+from scikits.learn.cluster import SelfOrganisingMap
+import pylab as pl
+import Image 
+
+def map2image(neurons,thumb_size=32):
+    '''Function to save som using 3 dim, as RGB'''    
+    assert neurons.shape[-1] == 3 
+    tsize = (thumb_size,thumb_size) 
+    size  = tuple([v * thumb_size for v in neurons.shape[0:2] ])
+    im  = Image.new('RGB',size)
+    for x in range(neurons.shape[0]):
+        for y in range(neurons.shape[1]):
+            color = tuple([int(c) for c in neurons[x][y]])
+            t = Image.new('RGB',tsize,color)
+            im.paste(t,(x*thumb_size,y*thumb_size))            
+    return im 
+
+train = np.array([[0,0,0],       #black
+                  [255,255,255], #white
+                  [255,0,0],     #red
+                  [0,255,0],     #green
+                  [0,0,255],     #blue
+                  [255,255,0],   #yellow
+                  [0,255,255],   #cyan
+                  [255,0,255]    #magenta
+                  ])
+
+
+
+init = np.random.rand(16,16,3)*255
+
+pl.subplot(1, 2, 1)
+pl.imshow(map2image(init))
+pl.title('Initial map')
+
+som = SelfOrganisingMap(init,n_iterations=1024,
+                        init='matrix',learning_rate=1)
+som.fit(train)
+
+pl.subplot(1, 2, 2)
+pl.imshow(map2image(som.neurons_))
+pl.title('Organized Map') 
+pl.show()

From d5dc95c75f9c6c180753cae6ef865e3fce107cf4 Mon Sep 17 00:00:00 2001
From: scampion <scampion@durex.irisa.fr>
Date: Thu, 13 Jan 2011 09:47:44 +0100
Subject: [PATCH 3/7] remove PIL module, s/SelfOrganising/SelfOrganizing/, merge
 SOM and Kohonen class, improve variable name, add an example using color and
 calinski function to measure clustering quality

---
 examples/cluster/plot_som_colormap.py   |  37 ++---
 examples/cluster/som_digits.py          |  20 +--
 scikits/learn/cluster/__init__.py       |  12 ++
 scikits/learn/cluster/som_.py           | 189 ++++++++++--------------
 scikits/learn/cluster/tests/test_som.py |  25 +---
 5 files changed, 117 insertions(+), 166 deletions(-)

diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py
index ce7090039c2c3..2397afe726b60 100644
--- a/examples/cluster/plot_som_colormap.py
+++ b/examples/cluster/plot_som_colormap.py
@@ -5,25 +5,18 @@
 
 """
 print __doc__
-
-from time import time
-import numpy as np
-from scikits.learn.cluster import SelfOrganisingMap
 import pylab as pl
-import Image 
+from matplotlib.colors import ListedColormap, NoNorm, rgb2hex
+import numpy as np
+from scikits.learn.cluster import SelfOrganizingMap
 
-def map2image(neurons,thumb_size=32):
-    '''Function to save som using 3 dim, as RGB'''    
+def plot(neurons):
     assert neurons.shape[-1] == 3 
-    tsize = (thumb_size,thumb_size) 
-    size  = tuple([v * thumb_size for v in neurons.shape[0:2] ])
-    im  = Image.new('RGB',size)
-    for x in range(neurons.shape[0]):
-        for y in range(neurons.shape[1]):
-            color = tuple([int(c) for c in neurons[x][y]])
-            t = Image.new('RGB',tsize,color)
-            im.paste(t,(x*thumb_size,y*thumb_size))            
-    return im 
+    h,w,d = neurons.shape
+    hexmap = np.apply_along_axis(rgb2hex,1,
+                                 neurons.reshape(-1,3)/256)
+    index  = np.arange(h*w).reshape(h,w)    
+    pl.pcolor(index,cmap=ListedColormap(hexmap),norm=NoNorm())
 
 train = np.array([[0,0,0],       #black
                   [255,255,255], #white
@@ -39,15 +32,17 @@ def map2image(neurons,thumb_size=32):
 
 init = np.random.rand(16,16,3)*255
 
-pl.subplot(1, 2, 1)
-pl.imshow(map2image(init))
+pl.subplot(1, 2, 1,aspect='equal')
+plot(init)
 pl.title('Initial map')
 
-som = SelfOrganisingMap(init,n_iterations=1024,
+som = SelfOrganizingMap(init,n_iterations=1024,
                         init='matrix',learning_rate=1)
 som.fit(train)
 
-pl.subplot(1, 2, 2)
-pl.imshow(map2image(som.neurons_))
+pl.subplot(1, 2, 2,aspect='equal')
+plot(som.neurons_)
 pl.title('Organized Map') 
+F = pl.gcf()
+F.set_size_inches( (40,20) )
 pl.show()
diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py
index 54f162b76a3c1..ff9a9c544ce56 100644
--- a/examples/cluster/som_digits.py
+++ b/examples/cluster/som_digits.py
@@ -1,8 +1,7 @@
 """
-===========================================================
-A demo of Self-Organising Map and KMeans on the handwritten 
-digits data
-===========================================================
+=======================================================================
+A demo of Self-Organising Map and KMeans on the handwritten digits data
+=======================================================================
 """
 from __future__ import division
 print __doc__
@@ -11,11 +10,10 @@
 import numpy as np
 
 from scikits.learn.cluster import KMeans
-from scikits.learn.cluster import SOM
+from scikits.learn.cluster import SelfOrganizingMap
 from scikits.learn.cluster import calinski_index
 
 from scikits.learn.datasets import load_digits
-from scikits.learn.pca import PCA
 from scikits.learn.preprocessing import scale
 
 def display(labels,digits,nbclusters):
@@ -43,7 +41,8 @@ def display(labels,digits,nbclusters):
 print "Self-Organizing Map "
 t0 = time()
 grid_width = 4
-som = SOM(w=grid_width,n_init=n_samples*5,learning_rate=1)
+som = SelfOrganizingMap(size=grid_width,n_iterations=n_samples*5,
+                        learning_rate=1)
 som.fit(data)
 print "done in %0.3fs" % (time() - t0)
 print
@@ -53,8 +52,6 @@ def display(labels,digits,nbclusters):
 print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C)))
 print
 
-print '*'*80
-
 print "KMeans "
 t0 = time()
 #km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data)
@@ -69,8 +66,3 @@ def display(labels,digits,nbclusters):
 
 
 
-
-
-
-
-
diff --git a/scikits/learn/cluster/__init__.py b/scikits/learn/cluster/__init__.py
index cedab2e2f8e0d..1f2b9f90a0cc8 100644
--- a/scikits/learn/cluster/__init__.py
+++ b/scikits/learn/cluster/__init__.py
@@ -6,4 +6,16 @@
 from .mean_shift_ import mean_shift, MeanShift, estimate_bandwidth
 from .affinity_propagation_ import affinity_propagation, AffinityPropagation
 from .k_means_ import k_means, KMeans
+from .som_ import SelfOrganizingMap
 
+import numpy as np
+
+def calinski_index(X,labels,centroids):
+    mean = np.mean(X,axis=0) 
+    B = np.sum([ (c - mean)**2 for c in centroids])
+    W = np.sum([ (x-centroids[labels[i]])**2 
+                 for i,x in enumerate(X)])
+    c = len(centroids)
+    n = len(X)
+    return (B /(c-1))/(W/ (n-c))
+    
diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py
index 90a43dc10d70b..b5fd08a3ff6de 100644
--- a/scikits/learn/cluster/som_.py
+++ b/scikits/learn/cluster/som_.py
@@ -1,113 +1,34 @@
 """
  Self-organizing map 
+ 
+ Reference : (to check)
+ Kohonen, T.; , "The self-organizing map,"
+ Proceedings of the IEEE , vol.78, no.9, pp.1464-1480, Sep 1990 
 """
 # Authors: Sebastien Campion <sebastien.campion@inria.fr>
 # License: BSD
 from __future__ import division
-import warnings
 from ..base import BaseEstimator
 import numpy as np
 import math 
-import Image 
-
-################################################################################
-def save_rgb(neurons,ofile,thumb_size=32):
-    '''Function to save map using 3 dim, as RGB'''    
-    assert neurons.shape[-1] == 3 
-    tsize = (thumb_size,thumb_size) 
-    size  = tuple([v * thumb_size for v in neurons.shape[0:2] ])
-    im  = Image.new('RGB',size)
-    for x in range(neurons.shape[0]):
-        for y in range(neurons.shape[1]):
-            color = tuple([int(c) for c in neurons[x][y]])
-            t = Image.new('RGB',tsize,color)
-            im.paste(t,(x*thumb_size,y*thumb_size))            
-    im.save(ofile)
-
-################################################################################
-class Kohonen2DMap():
-    def __init__(self,size,dim,neurons=None):
-        #self.log = logging.getLogger('kohonen.map')
-        self.dim = dim 
-        self.size = size 
-        self.neurons = neurons
-        if neurons == None  :
-            self.neurons = np.random.rand(size,size,dim)#/10
-        self.iteration = 0 
-        
-    def bmu(self,data):
-        assert data.shape[0] == self.neurons.shape[-1] 
-        data = np.resize(data,self.neurons.shape) 
-        dists = np.sum((data-self.neurons)** 2,axis=-1)
-        min = dists.argmin()
-        #w = np.unravel_index(min,dists.shape)
-        return divmod(min,self.size)
-        
-    def learn(self,datas,nbiter,learning_rate=1,callback=None):
-        '''Given an sample of datas, we randomly choose one of them for each 
-        iteration.
-        A good ratio, nb datas = 2 or 3 x nbiter'''
-        self.iteration = 0   
-        indices = np.random.random_integers(0,len(datas)-1,nbiter)
-        for i in indices: 
-            l = nbiter/self.size
-            lr = learning_rate * math.exp(-self.iteration/l)
-            self._learn_vector(datas[i], nbiter, lr)
-            self.iteration += 1 
-            if callback != None:
-                callback(self,self.iteration)
 
-    def _learn_vector(self, data, nbiter, lr):
-        w = self.bmu(data)
-        radius = self.radius_of_the_neighbordhood(nbiter)
-        for n in self.neurons_in_radius(w,radius):
-            nx,ny = n
-            wt = self.neurons[nx][ny]
-            dr = self.dist(w,n,radius)
-            self.neurons[nx][ny] = wt + dr*lr*(data-wt)
-            #self.log.debug(('nod',n,'l_rate',lr,'d_radius',dr))
-        #self.log.debug(('bmu',w,'iter',self.iteration,'radius',radius))
-    
-    def dist(self,w,n,radius):
-        wx,wy = w
-        nx,ny = n
-        d = (wx-nx)**2 + (wy-ny)**2
-        #offcial paper implementation : return math.exp(-d/2*radius**2)
-        return math.exp(-d/radius)
-    
-    def neurons_in_radius(self,w,radius):
-        wi,wj = w 
-        r = []
-        for i in range(self.neurons.shape[0]):
-            for j in range(self.neurons.shape[1]):
-                if math.sqrt((i-wi)**2 + (j-wj)**2) < radius:
-                    r.append((i,j))
-        return r
-        
-    def radius_of_the_neighbordhood(self,nbiter):
-        l = nbiter/self.size
-        return self.size * math.exp(-self.iteration/l)
-    
-
-################################################################################
-
-class SOM(BaseEstimator):
+class SelfOrganizingMap(BaseEstimator):
     """ Self-Organizing Map
 
     Parameters
     ----------
 
-    data : ndarray
+    X : ndarray
         A M by N array of M observations in N dimensions or a length
         M array of M one-dimensional observations.
 
-    w : int
-        Width and height of the square mape as well as the number of
+    size : int
+        Width and height of the square map as well as the number of
         centroids to generate. If init initialization string is
         'matrix', or if a ndarray is given instead, it is
         interpreted as initial cluster to use instead.
 
-    n_iter : int
+    n_iterations : int
         Number of iterations of the som algrithm to run
 
     learning_rate : float
@@ -140,32 +61,82 @@ class SOM(BaseEstimator):
     ------
 
     """
-
-    def __init__(self, w=16, init='random', n_init=64,learning_rate=1):        
-        self.w = w
+    def __init__(self,size=16,init='random',n_iterations=64,learning_rate=1,
+                 callback=None ):   
+        self.size = size
         self.init = init
-        self.n_init = n_init
+        self.n_iterations = n_iterations
         self.learning_rate = learning_rate
-        self.callback = None
+        self.callback = callback
 
     def fit(self, X, **params):
-        """ Compute som"""
+        """Given an sample of X, we randomly choose one of them for each 
+        iteration.
+        A good ratio, nb X = 2 or 3 x nbiter"""
         X = np.asanyarray(X)
         self._set_params(**params)
-
-        neurons = None 
-        dim = X.shape[-1]
-        
-        if self.init == 'matrix':
-            assert len(self.w.shape) == 3 
-            neurons = self.w 
-            self.w = neurons.shape[0]
-
-        
-        km = Kohonen2DMap(self.w,dim,neurons)
-        km.learn(X,self.n_init,self.learning_rate,callback=self.callback)
-        self.neurons_ = km.neurons
-        self.labels_ = [km.bmu(x) for x in X]
-        
+        self.dim = X.shape[-1]
+        self.neurons_ = None 
+
+        #init neurons_
+        if self.init == 'random': 
+            self.neurons_ = np.random.rand(self.size,self.size,self.dim)
+        elif self.init == 'matrix': 
+            assert len(self.size.shape) == 3 
+            self.neurons_ = self.size 
+            self.size = self.neurons_.shape[0]
+    
+        #iteration loop 
+        self.iteration = 0   
+        indices = np.random.random_integers(0,len(X)-1,self.n_iterations)
+        for i in indices: 
+            l = self.n_iterations/self.size
+            lr = self.learning_rate * math.exp(-self.iteration/l)
+            self._learn_vector(X[i],lr)
+            self.iteration += 1 
+            if self.callback != None:
+                self.callback(self,self.iteration)
+                
+        #assign labels
+        self.labels_ = [self.bmu(x) for x in X]
         return self
+
+    def _learn_vector(self, vector, lr):
+        winner = self.bmu(vector)
+        radius = self.radius_of_the_neighbordhood()
+        for n in self.neurons_in_radius(winner,radius):
+            nx,ny = n
+            wt = self.neurons_[nx][ny]
+            dr = self.dist(winner,n,radius)
+            self.neurons_[nx][ny] = wt + dr*lr*(vector - wt)
  
+    def bmu(self,vector):
+        """
+        best matching unit
+        """
+        assert vector.shape[0] == self.neurons_.shape[-1] 
+        vector = np.resize(vector,self.neurons_.shape) 
+        dists = np.sum((vector-self.neurons_)** 2,axis=-1)
+        min = dists.argmin()
+        #w = np.unravel_index(min,dists.shape)
+        return divmod(min,self.size)
+    
+    def dist(self,w,n,radius):
+        wx,wy = w
+        nx,ny = n
+        d = (wx-nx)**2 + (wy-ny)**2
+        #offcial paper implementation : return math.exp(-d/2*radius**2)
+        return math.exp(-d/radius)
+    
+    def neurons_in_radius(self,winner,radius):
+        wi,wj = winner 
+        r = []
+        for i in range(self.neurons_.shape[0]):
+            for j in range(self.neurons_.shape[1]):
+                if math.sqrt((i-wi)**2 + (j-wj)**2) < radius:
+                    r.append((i,j))
+        return r
+        
+    def radius_of_the_neighbordhood(self):
+        l = self.n_iterations/self.size
+        return self.size * math.exp(-self.iteration/l)
diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py
index 2c697f170a727..e99bd05dee8d2 100644
--- a/scikits/learn/cluster/tests/test_som.py
+++ b/scikits/learn/cluster/tests/test_som.py
@@ -2,11 +2,10 @@
 Testing for SOM 
 
 """
-import Image
 import numpy as np
 from numpy.testing import assert_equal
 
-from ..som_ import SOM,save_rgb
+from ..som_ import SelfOrganizingMap
 from .common import generate_clustered_data
 
 n_clusters = 4
@@ -14,7 +13,8 @@
 
 def test_som():
     np.random.seed(1)
-    som = SOM().fit(X,w=4,n_init=32,learning_rate=0.4) 
+    som = SelfOrganizingMap(size=2,n_iterations=10,learning_rate=1) 
+    som.fit(X)
     labels = som.labels_
 
     assert_equal(np.unique(labels).shape[0],4)
@@ -22,22 +22,3 @@ def test_som():
     assert_equal(np.unique(labels[20:40]).shape[0], 1)
     assert_equal(np.unique(labels[40:60]).shape[0], 1)
     assert_equal(np.unique(labels[60:]).shape[0], 1)
-
-def test_color_map():    
-    train = np.array([[0,0,0],       #black
-                      [255,255,255], #white
-                      [255,0,0],     #red
-                      [0,255,0],     #green
-                      [0,0,255],     #blue
-                      [255,255,0],   #yellow
-                      [0,255,255],   #cyan
-                      [255,0,255]    #magenta
-                      ])
-    w = np.random.rand(16,16,3)*255
-    som = SOM(w,n_init=1024,init='matrix',learning_rate=1)
-    save_rgb(w,'init.jpg')
-    som = SOM(w,n_init=1024,init='matrix',learning_rate=1)
-    som.fit(train)
-    save_rgb(som.neurons_,'color_map.jpg')
-
-

From 4e313371d1b3031148c8bdc0c123b75193a86e09 Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort <alexandre.gramfort@inria.fr>
Date: Sat, 15 Jan 2011 22:34:54 -0500
Subject: [PATCH 4/7] review SOM code (PEP8, cosmetics and inline comments)

---
 examples/cluster/plot_som_colormap.py   |  55 ++++++-----
 examples/cluster/som_digits.py          |  34 +++----
 scikits/learn/cluster/som_.py           | 122 ++++++++++++------------
 scikits/learn/cluster/tests/test_som.py |   7 +-
 4 files changed, 110 insertions(+), 108 deletions(-)

diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py
index 2397afe726b60..434a508f47c85 100644
--- a/examples/cluster/plot_som_colormap.py
+++ b/examples/cluster/plot_som_colormap.py
@@ -1,8 +1,10 @@
 """
 ===========================================================
-A demo of SelfOrganisingMap with colored neurons 
+A demo of SelfOrganisingMap with colored neurons
 ===========================================================
 
+XXX : add description of example.
+
 """
 print __doc__
 import pylab as pl
@@ -10,39 +12,36 @@
 import numpy as np
 from scikits.learn.cluster import SelfOrganizingMap
 
-def plot(neurons):
-    assert neurons.shape[-1] == 3 
-    h,w,d = neurons.shape
-    hexmap = np.apply_along_axis(rgb2hex,1,
-                                 neurons.reshape(-1,3)/256)
-    index  = np.arange(h*w).reshape(h,w)    
-    pl.pcolor(index,cmap=ListedColormap(hexmap),norm=NoNorm())
-
-train = np.array([[0,0,0],       #black
-                  [255,255,255], #white
-                  [255,0,0],     #red
-                  [0,255,0],     #green
-                  [0,0,255],     #blue
-                  [255,255,0],   #yellow
-                  [0,255,255],   #cyan
-                  [255,0,255]    #magenta
-                  ])
-
 
-
-init = np.random.rand(16,16,3)*255
-
-pl.subplot(1, 2, 1,aspect='equal')
+def plot(neurons):
+    assert neurons.shape[-1] == 3
+    h, w, d = neurons.shape
+    hexmap = np.apply_along_axis(rgb2hex, 1, neurons.reshape(-1, 3) / 256)
+    index = np.arange(h * w).reshape(h, w)
+    pl.pcolor(index, cmap=ListedColormap(hexmap), norm=NoNorm())
+
+train = np.array([[0, 0, 0],       # black
+                  [255, 255, 255], # white
+                  [255, 0, 0],     # red
+                  [0, 255, 0],     # green
+                  [0, 0, 255],     # blue
+                  [255, 255, 0],   # yellow
+                  [0, 255, 255],   # cyan
+                  [255, 0, 255]])  # magenta
+
+init = np.random.rand(16, 16, 3) * 255
+
+pl.subplot(1, 2, 1, aspect='equal')
 plot(init)
 pl.title('Initial map')
 
-som = SelfOrganizingMap(init,n_iterations=1024,
-                        init='matrix',learning_rate=1)
+som = SelfOrganizingMap(init, n_iterations=1024,
+                        init='matrix', learning_rate=1)
 som.fit(train)
 
-pl.subplot(1, 2, 2,aspect='equal')
+pl.subplot(1, 2, 2, aspect='equal')
 plot(som.neurons_)
-pl.title('Organized Map') 
+pl.title('Organized Map')
 F = pl.gcf()
-F.set_size_inches( (40,20) )
+F.set_size_inches((40, 20))
 pl.show()
diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py
index ff9a9c544ce56..c2e0a37efd199 100644
--- a/examples/cluster/som_digits.py
+++ b/examples/cluster/som_digits.py
@@ -2,6 +2,9 @@
 =======================================================================
 A demo of Self-Organising Map and KMeans on the handwritten digits data
 =======================================================================
+
+XXX : Should add text to describe what the example and what to expect
+from the output. Would it be possible to plot something?
 """
 from __future__ import division
 print __doc__
@@ -16,14 +19,16 @@
 from scikits.learn.datasets import load_digits
 from scikits.learn.preprocessing import scale
 
-def display(labels,digits,nbclusters):
-    r = {0:[],1:[],2:[],3:[],4:[],5:[],6:[],7:[],8:[],9:[]} 
-    for i,v in enumerate(labels):
+
+def display(labels, digits, n_clusters):
+    # XXX : n_clusters unused
+    r = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}
+    for i, v in enumerate(labels):
         r[digits.target[i]].append(v)
 
-    for k,v in r.items(): 
+    for k, v in r.items():
         s = set(v)
-        print 'target %i | nb cluster %i |' % (k,len(s)),s
+        print 'target %i | nb cluster %i |' % (k, len(s)), s
 
 np.random.seed(42)
 
@@ -41,15 +46,15 @@ def display(labels,digits,nbclusters):
 print "Self-Organizing Map "
 t0 = time()
 grid_width = 4
-som = SelfOrganizingMap(size=grid_width,n_iterations=n_samples*5,
+som = SelfOrganizingMap(size=grid_width, n_iterations=n_samples*5,
                         learning_rate=1)
 som.fit(data)
 print "done in %0.3fs" % (time() - t0)
 print
 
-display(som.labels_,digits,grid_width**2)
-C = calinski_index(data,som.labels_,som.neurons_)
-print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C)))
+display(som.labels_, digits, grid_width**2)
+C = calinski_index(data, som.labels_, som.neurons_)
+print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C)))
 print
 
 print "KMeans "
@@ -58,11 +63,8 @@ def display(labels,digits,nbclusters):
 km = KMeans(init='k-means++', k=grid_width**2, n_init=10)
 km.fit(data)
 print "done in %0.3fs" % (time() - t0)
-print 
-
-display(km.labels_,digits,n_digits)
-C = calinski_index(data,km.labels_,km.cluster_centers_)
-print 'calinski index %0.2f | %0.2f%%' % (C,100*(C/(1+C)))
-
-
+print
 
+display(km.labels_, digits, n_digits)
+C = calinski_index(data, km.labels_, km.cluster_centers_)
+print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C)))
diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py
index b5fd08a3ff6de..18d6d3bccb08d 100644
--- a/scikits/learn/cluster/som_.py
+++ b/scikits/learn/cluster/som_.py
@@ -1,26 +1,26 @@
 """
- Self-organizing map 
- 
+ Self-organizing map
+
  Reference : (to check)
  Kohonen, T.; , "The self-organizing map,"
- Proceedings of the IEEE , vol.78, no.9, pp.1464-1480, Sep 1990 
+ Proceedings of the IEEE , vol.78, no.9, pp.1464-1480, Sep 1990
 """
 # Authors: Sebastien Campion <sebastien.campion@inria.fr>
 # License: BSD
 from __future__ import division
-from ..base import BaseEstimator
+import math
 import numpy as np
-import math 
+from ..base import BaseEstimator
+
 
 class SelfOrganizingMap(BaseEstimator):
-    """ Self-Organizing Map
+    """Self-Organizing Map
 
     Parameters
     ----------
-
     X : ndarray
         A M by N array of M observations in N dimensions or a length
-        M array of M one-dimensional observations.
+        M array of M one-dimensional observations.
 
     size : int
         Width and height of the square map as well as the number of
@@ -44,13 +44,11 @@ class SelfOrganizingMap(BaseEstimator):
 
     Methods
     -------
-
     fit(X):
         Compute SOM
 
     Attributes
     ----------
-
     neurons_: array, [(x,y), n_features]
         Coordinates of neurons and value
 
@@ -61,8 +59,9 @@ class SelfOrganizingMap(BaseEstimator):
     ------
 
     """
-    def __init__(self,size=16,init='random',n_iterations=64,learning_rate=1,
-                 callback=None ):   
+
+    def __init__(self, size=16, init='random', n_iterations=64,
+                 learning_rate=1, callback=None):
         self.size = size
         self.init = init
         self.n_iterations = n_iterations
@@ -70,73 +69,74 @@ def __init__(self,size=16,init='random',n_iterations=64,learning_rate=1,
         self.callback = callback
 
     def fit(self, X, **params):
-        """Given an sample of X, we randomly choose one of them for each 
+        """Given an sample of X, we randomly choose one of them for each
         iteration.
         A good ratio, nb X = 2 or 3 x nbiter"""
         X = np.asanyarray(X)
         self._set_params(**params)
-        self.dim = X.shape[-1]
-        self.neurons_ = None 
-
-        #init neurons_
-        if self.init == 'random': 
-            self.neurons_ = np.random.rand(self.size,self.size,self.dim)
-        elif self.init == 'matrix': 
-            assert len(self.size.shape) == 3 
-            self.neurons_ = self.size 
+        self.dim = X.shape[-1] # XXX : could we avoid storing dim in self?
+        self.neurons_ = None
+
+        # init neurons_
+        if self.init == 'random':
+            self.neurons_ = np.random.rand(self.size, self.size, self.dim)
+        elif self.init == 'matrix':
+            # XXX : untested
+            assert len(self.size.shape) == 3
+            self.neurons_ = self.size
             self.size = self.neurons_.shape[0]
-    
-        #iteration loop 
-        self.iteration = 0   
-        indices = np.random.random_integers(0,len(X)-1,self.n_iterations)
-        for i in indices: 
-            l = self.n_iterations/self.size
-            lr = self.learning_rate * math.exp(-self.iteration/l)
-            self._learn_vector(X[i],lr)
-            self.iteration += 1 
+
+        # iteration loop
+        self.iteration = 0 # XXX : could we avoid storing iteration in self?
+        indices = np.random.random_integers(0, len(X)-1, self.n_iterations)
+        for i in indices:
+            l = self.n_iterations / self.size
+            lr = self.learning_rate * math.exp(-self.iteration / l)
+            self._learn_vector(X[i], lr)
+            self.iteration += 1
             if self.callback != None:
-                self.callback(self,self.iteration)
-                
-        #assign labels
+                self.callback(self, self.iteration)
+
+        # assign labels
         self.labels_ = [self.bmu(x) for x in X]
         return self
 
     def _learn_vector(self, vector, lr):
         winner = self.bmu(vector)
         radius = self.radius_of_the_neighbordhood()
-        for n in self.neurons_in_radius(winner,radius):
-            nx,ny = n
+        for n in self.neurons_in_radius(winner, radius):
+            nx, ny = n
             wt = self.neurons_[nx][ny]
-            dr = self.dist(winner,n,radius)
-            self.neurons_[nx][ny] = wt + dr*lr*(vector - wt)
- 
-    def bmu(self,vector):
-        """
-        best matching unit
+            dr = self.dist(winner, n, radius)
+            self.neurons_[nx][ny] = wt + dr * lr * (vector - wt)
+
+    def bmu(self, vector):
+        """Best matching unit
         """
-        assert vector.shape[0] == self.neurons_.shape[-1] 
-        vector = np.resize(vector,self.neurons_.shape) 
-        dists = np.sum((vector-self.neurons_)** 2,axis=-1)
+        assert vector.shape[0] == self.neurons_.shape[-1]
+        vector = np.resize(vector, self.neurons_.shape)
+        dists = np.sum((vector - self.neurons_)**2, axis=-1)
         min = dists.argmin()
         #w = np.unravel_index(min,dists.shape)
-        return divmod(min,self.size)
-    
-    def dist(self,w,n,radius):
-        wx,wy = w
-        nx,ny = n
-        d = (wx-nx)**2 + (wy-ny)**2
-        #offcial paper implementation : return math.exp(-d/2*radius**2)
-        return math.exp(-d/radius)
-    
-    def neurons_in_radius(self,winner,radius):
-        wi,wj = winner 
+        return divmod(min, self.size)
+
+    def dist(self, w, n, radius):
+        wx, wy = w
+        nx, ny = n
+        d = (wx - nx)**2 + (wy - ny)**2
+        # official paper implementation : return math.exp(-d/2*radius**2)
+        return math.exp(-d / radius)
+
+    def neurons_in_radius(self, winner, radius):
+        wi, wj = winner
         r = []
+        # XXX : should be vertorized with numpy
         for i in range(self.neurons_.shape[0]):
             for j in range(self.neurons_.shape[1]):
-                if math.sqrt((i-wi)**2 + (j-wj)**2) < radius:
-                    r.append((i,j))
+                if math.sqrt((i - wi)**2 + (j - wj)**2) < radius:
+                    r.append((i, j))
         return r
-        
+
     def radius_of_the_neighbordhood(self):
-        l = self.n_iterations/self.size
-        return self.size * math.exp(-self.iteration/l)
+        l = self.n_iterations / self.size
+        return self.size * math.exp(-self.iteration / l)
diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py
index e99bd05dee8d2..cc40acfb05896 100644
--- a/scikits/learn/cluster/tests/test_som.py
+++ b/scikits/learn/cluster/tests/test_som.py
@@ -1,5 +1,5 @@
 """
-Testing for SOM 
+Testing for SOM
 
 """
 import numpy as np
@@ -11,13 +11,14 @@
 n_clusters = 4
 X = generate_clustered_data(n_clusters=n_clusters, std=.1)
 
+
 def test_som():
     np.random.seed(1)
-    som = SelfOrganizingMap(size=2,n_iterations=10,learning_rate=1) 
+    som = SelfOrganizingMap(size=2, n_iterations=10, learning_rate=1)
     som.fit(X)
     labels = som.labels_
 
-    assert_equal(np.unique(labels).shape[0],4)
+    assert_equal(np.unique(labels).shape[0], 4)
     assert_equal(np.unique(labels[:20]).shape[0], 1)
     assert_equal(np.unique(labels[20:40]).shape[0], 1)
     assert_equal(np.unique(labels[40:60]).shape[0], 1)

From 7d9c7e979b8d99cf3bfe95d65a85a53f666b100f Mon Sep 17 00:00:00 2001
From: Sebastien Campion <seb@scamp.fr>
Date: Thu, 20 Jan 2011 12:57:07 +0100
Subject: [PATCH 5/7] numpy vectorized function + some changes on variable
 scope

---
 scikits/learn/cluster/som_.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py
index 18d6d3bccb08d..adb4dab8003db 100644
--- a/scikits/learn/cluster/som_.py
+++ b/scikits/learn/cluster/som_.py
@@ -74,12 +74,12 @@ def fit(self, X, **params):
         A good ratio, nb X = 2 or 3 x nbiter"""
         X = np.asanyarray(X)
         self._set_params(**params)
-        self.dim = X.shape[-1] # XXX : could we avoid storing dim in self?
         self.neurons_ = None
+        dim = X.shape[-1]
 
         # init neurons_
         if self.init == 'random':
-            self.neurons_ = np.random.rand(self.size, self.size, self.dim)
+            self.neurons_ = np.random.rand(self.size, self.size, dim)
         elif self.init == 'matrix':
             # XXX : untested
             assert len(self.size.shape) == 3
@@ -87,23 +87,23 @@ def fit(self, X, **params):
             self.size = self.neurons_.shape[0]
 
         # iteration loop
-        self.iteration = 0 # XXX : could we avoid storing iteration in self?
+        iteration = 0 
         indices = np.random.random_integers(0, len(X)-1, self.n_iterations)
         for i in indices:
             l = self.n_iterations / self.size
-            lr = self.learning_rate * math.exp(-self.iteration / l)
-            self._learn_vector(X[i], lr)
-            self.iteration += 1
+            lr = self.learning_rate * math.exp(-iteration / l)
+            self._learn_vector(X[i], lr, iteration)
+            iteration += 1
             if self.callback != None:
-                self.callback(self, self.iteration)
+                self.callback(self, iteration)
 
         # assign labels
         self.labels_ = [self.bmu(x) for x in X]
         return self
 
-    def _learn_vector(self, vector, lr):
+    def _learn_vector(self, vector, lr, iteration):
         winner = self.bmu(vector)
-        radius = self.radius_of_the_neighbordhood()
+        radius = self.radius_of_the_neighbordhood(iteration)
         for n in self.neurons_in_radius(winner, radius):
             nx, ny = n
             wt = self.neurons_[nx][ny]
@@ -129,14 +129,17 @@ def dist(self, w, n, radius):
 
     def neurons_in_radius(self, winner, radius):
         wi, wj = winner
-        r = []
-        # XXX : should be vertorized with numpy
+        x = y = np.arange(self.size)
+        xx, yy = np.meshgrid(x, y)
+        v = np.sqrt((xx - wi)**2 + (yy - wj)**2) < radius
+        return np.c_[np.nonzero(v)]
+
         for i in range(self.neurons_.shape[0]):
             for j in range(self.neurons_.shape[1]):
                 if math.sqrt((i - wi)**2 + (j - wj)**2) < radius:
                     r.append((i, j))
         return r
 
-    def radius_of_the_neighbordhood(self):
+    def radius_of_the_neighbordhood(self, iteration):
         l = self.n_iterations / self.size
-        return self.size * math.exp(-self.iteration / l)
+        return self.size * math.exp(-iteration / l)

From e4453b198a7d0f0c8b5531f1312cc2425079d8ff Mon Sep 17 00:00:00 2001
From: scampion <scampion@durex.irisa.fr>
Date: Fri, 21 Jan 2011 09:23:16 +0100
Subject: [PATCH 6/7] small fix, clean unused part of code

---
 examples/cluster/som_digits.py | 1 -
 scikits/learn/cluster/som_.py  | 6 ------
 2 files changed, 7 deletions(-)

diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py
index c2e0a37efd199..f9c82a5b9f2d6 100644
--- a/examples/cluster/som_digits.py
+++ b/examples/cluster/som_digits.py
@@ -59,7 +59,6 @@ def display(labels, digits, n_clusters):
 
 print "KMeans "
 t0 = time()
-#km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data)
 km = KMeans(init='k-means++', k=grid_width**2, n_init=10)
 km.fit(data)
 print "done in %0.3fs" % (time() - t0)
diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py
index adb4dab8003db..3b9201b80e27a 100644
--- a/scikits/learn/cluster/som_.py
+++ b/scikits/learn/cluster/som_.py
@@ -134,12 +134,6 @@ def neurons_in_radius(self, winner, radius):
         v = np.sqrt((xx - wi)**2 + (yy - wj)**2) < radius
         return np.c_[np.nonzero(v)]
 
-        for i in range(self.neurons_.shape[0]):
-            for j in range(self.neurons_.shape[1]):
-                if math.sqrt((i - wi)**2 + (j - wj)**2) < radius:
-                    r.append((i, j))
-        return r
-
     def radius_of_the_neighbordhood(self, iteration):
         l = self.n_iterations / self.size
         return self.size * math.exp(-iteration / l)

From 690f47dc352616df4e66908ca9298f3869e3c131 Mon Sep 17 00:00:00 2001
From: Sebastien Campion <seb@scamp.fr>
Date: Fri, 4 Feb 2011 12:02:59 +0100
Subject: [PATCH 7/7] Added test, renamed calinski to pseudo_F and update few
 docs

---
 examples/cluster/plot_som_colormap.py   |  3 +-
 examples/cluster/som_digits.py          | 49 ++++++++++++-------------
 scikits/learn/cluster/__init__.py       | 15 +++++++-
 scikits/learn/cluster/som_.py           |  5 +--
 scikits/learn/cluster/tests/test_som.py | 19 +++++++++-
 5 files changed, 58 insertions(+), 33 deletions(-)

diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py
index 434a508f47c85..19f326d156cea 100644
--- a/examples/cluster/plot_som_colormap.py
+++ b/examples/cluster/plot_som_colormap.py
@@ -3,7 +3,8 @@
 A demo of SelfOrganisingMap with colored neurons
 ===========================================================
 
-XXX : add description of example.
+Example of SOM clustering using 3-dimensional vectors (RGB)
+with 8 colors (black, white, red, green, blue, yellow, cyan, magenta)
 
 """
 print __doc__
diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py
index f9c82a5b9f2d6..db64fc892819d 100644
--- a/examples/cluster/som_digits.py
+++ b/examples/cluster/som_digits.py
@@ -3,8 +3,9 @@
 A demo of Self-Organising Map and KMeans on the handwritten digits data
 =======================================================================
 
-XXX : Should add text to describe what the example and what to expect
-from the output. Would it be possible to plot something?
+Compares SOM and KMeans clustering on the handwritten digits data
+using the pseudo_F index.
+ 
 """
 from __future__ import division
 print __doc__
@@ -14,35 +15,30 @@
 
 from scikits.learn.cluster import KMeans
 from scikits.learn.cluster import SelfOrganizingMap
-from scikits.learn.cluster import calinski_index
-
+from scikits.learn.cluster import pseudo_F
 from scikits.learn.datasets import load_digits
 from scikits.learn.preprocessing import scale
-
-
-def display(labels, digits, n_clusters):
-    # XXX : n_clusters unused
-    r = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}
-    for i, v in enumerate(labels):
-        r[digits.target[i]].append(v)
-
-    for k, v in r.items():
-        s = set(v)
-        print 'target %i | nb cluster %i |' % (k, len(s)), s
-
+from scikits.learn.metrics import confusion_matrix
+    
 np.random.seed(42)
 
+################################################################################
+# Load dataset 
+
 digits = load_digits()
 data = scale(digits.data)
-
 n_samples, n_features = data.shape
 n_digits = len(np.unique(digits.target))
 
-print "n_digits: %d" % n_digits
-print "n_features: %d" % n_features
-print "n_samples: %d" % n_samples
+print "Digits dataset"
+print "n_digits   : %d" % n_digits
+print "n_features : %d" % n_features
+print "n_samples  : %d" % n_samples
 print
 
+################################################################################
+# Digits dataset clustering using Self-Organizing Map
+
 print "Self-Organizing Map "
 t0 = time()
 grid_width = 4
@@ -52,11 +48,13 @@ def display(labels, digits, n_clusters):
 print "done in %0.3fs" % (time() - t0)
 print
 
-display(som.labels_, digits, grid_width**2)
-C = calinski_index(data, som.labels_, som.neurons_)
-print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C)))
+F = pseudo_F(data, som.labels_, som.neurons_)
+print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
 print
 
+################################################################################
+# Digits dataset clustering using Kmeans
+
 print "KMeans "
 t0 = time()
 km = KMeans(init='k-means++', k=grid_width**2, n_init=10)
@@ -64,6 +62,5 @@ def display(labels, digits, n_clusters):
 print "done in %0.3fs" % (time() - t0)
 print
 
-display(km.labels_, digits, n_digits)
-C = calinski_index(data, km.labels_, km.cluster_centers_)
-print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C)))
+F = pseudo_F(data, km.labels_, km.cluster_centers_)
+print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
diff --git a/scikits/learn/cluster/__init__.py b/scikits/learn/cluster/__init__.py
index 1f2b9f90a0cc8..c0647f9685654 100644
--- a/scikits/learn/cluster/__init__.py
+++ b/scikits/learn/cluster/__init__.py
@@ -10,11 +10,22 @@
 
 import numpy as np
 
-def calinski_index(X,labels,centroids):
+def pseudo_F(X, labels, centroids):
+    '''
+    The pseudo F statistic :
+
+    pseudo F = [(T - PG) / (G - 1)] / [PG / (n - G)]
+
+    The pseudo F statistic was suggested by Calinski and Harabasz (1974)
+
+    Calinski, T. and J. Harabasz. 1974. 
+    A dendrite method for cluster analysis. Commun. Stat. 3: 1-27.
+    http://dx.doi.org/10.1080/03610927408827101
+    '''
     mean = np.mean(X,axis=0) 
     B = np.sum([ (c - mean)**2 for c in centroids])
     W = np.sum([ (x-centroids[labels[i]])**2 
-                 for i,x in enumerate(X)])
+                 for i, x in enumerate(X)])
     c = len(centroids)
     n = len(X)
     return (B /(c-1))/(W/ (n-c))
diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py
index 3b9201b80e27a..772588bd8c369 100644
--- a/scikits/learn/cluster/som_.py
+++ b/scikits/learn/cluster/som_.py
@@ -39,8 +39,8 @@ class SelfOrganizingMap(BaseEstimator):
 
         'random': randomly points choosed
 
-        'matrix': interpret the w parameter as a w by M array
-         of initial centroids.
+        'matrix': interpret the size parameter as a
+         (size, size, n_features) array of initial neurons.
 
     Methods
     -------
@@ -81,7 +81,6 @@ def fit(self, X, **params):
         if self.init == 'random':
             self.neurons_ = np.random.rand(self.size, self.size, dim)
         elif self.init == 'matrix':
-            # XXX : untested
             assert len(self.size.shape) == 3
             self.neurons_ = self.size
             self.size = self.neurons_.shape[0]
diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py
index cc40acfb05896..080648f2643fb 100644
--- a/scikits/learn/cluster/tests/test_som.py
+++ b/scikits/learn/cluster/tests/test_som.py
@@ -9,7 +9,8 @@
 from .common import generate_clustered_data
 
 n_clusters = 4
-X = generate_clustered_data(n_clusters=n_clusters, std=.1)
+n_features = 2
+X = generate_clustered_data(n_clusters=n_clusters, n_features=2, std=.1)
 
 
 def test_som():
@@ -23,3 +24,19 @@ def test_som():
     assert_equal(np.unique(labels[20:40]).shape[0], 1)
     assert_equal(np.unique(labels[40:60]).shape[0], 1)
     assert_equal(np.unique(labels[60:]).shape[0], 1)
+
+def test_som_init_matrix():
+    np.random.seed(1)
+    random_ind = np.random.randint(0, X.shape[0], size=n_clusters)
+    init_map = X[random_ind].reshape(2,2,n_features)
+
+    som = SelfOrganizingMap(size=init_map, init='matrix',
+                            n_iterations=2000, learning_rate=0.1)
+
+    som.fit(X)
+    labels = som.labels_
+    assert_equal(np.unique(labels).shape[0], 4)
+    assert_equal(np.unique(labels[:20]).shape[0], 1)
+    assert_equal(np.unique(labels[20:40]).shape[0], 1)
+    assert_equal(np.unique(labels[40:60]).shape[0], 1)
+    assert_equal(np.unique(labels[60:]).shape[0], 1)