Added ksdensity (KDE) function in mlab and adapted _axes violinplot function to accept new KDE.

khchan · solvents · commit da40c9db2382 · 2014-05-24T16:47:57.000-04:00
Added basic violinplot demo in examples
diff --git a/examples/statistics/violinplot_demo.py b/examples/statistics/violinplot_demo.py
@@ -0,0 +1,40 @@
+"""
+Demo of the new violinplot functionality.
+
+Draws the same five random samples in a 2x3 grid of axes, using a
+different violin ``width`` in each panel.
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+# fake data
+fs = 10  # fontsize
+pos = range(5)
+data = [np.random.normal(size=100) for _ in pos]
+
+# One violin width per panel, in row-major order of the 2x3 grid.
+widths = [0.1, 0.3, 0.5, 0.7, 0.9, 1.1]
+
+# TODO: future customizability dicts go here
+
+# (From boxplot demo)
+# demonstrate how to customize the display of different elements:
+# boxprops = dict(linestyle='--', linewidth=3, color='darkgoldenrod')
+# flierprops = dict(marker='o', markerfacecolor='green', markersize=12,
+#                   linestyle='none')
+# medianprops = dict(linestyle='-.', linewidth=2.5, color='firebrick')
+# meanpointprops = dict(marker='D', markeredgecolor='black',
+#                       markerfacecolor='firebrick')
+# meanlineprops = dict(linestyle='--', linewidth=2.5, color='purple')
+
+fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6, 6))
+
+for number, (ax, width) in enumerate(zip(axes.flatten(), widths), start=1):
+    ax.violinplot(data, pos, width=width)
+    ax.set_title('Custom violinplot %d' % number, fontsize=fs)
+    ax.set_yticklabels([])
+
+fig.suptitle("Violin Plotting Examples")
+fig.subplots_adjust(hspace=0.4)
+plt.show()
diff --git a/lib/matplotlib/axes/_axes.py b/lib/matplotlib/axes/_axes.py
@@ -6725,24 +6725,23 @@ def matshow(self, Z, **kwargs):
                                                  integer=True))
         return im
 
-
-    def violinplot(self, x, positions=None, width=0.5):
+    def violinplot(self, dataset, positions=None, width=0.5):
         """
         Make a violin plot.
 
         Call signature::
 
-          violinplot(x, positions=None)
+          violinplot(dataset, positions=None)
 
-        Make a violin plot for each column of *x* or each
-        vector in sequence *x*.  Each filled area extends to represent the
+        Make a violin plot for each column of *dataset* or each vector in 
+        sequence *dataset*.  Each filled area extends to represent the
         entire data range, with three lines at the mean, the minimum, and
         the maximum.
 
         Parameters
         ----------
 
-          x : Array or a sequence of vectors.
+          dataset : Array or a sequence of vectors.
             The input data.
 
           positions : array-like, default = [1, 2, ..., n]
@@ -6777,26 +6776,28 @@ def violinplot(self, x, positions=None, width=0.5):
         caps = []
 
         if positions == None:
-            positions = range(1, len(x) + 1)
-        elif len(positions) != len(x):
+            positions = range(1, len(dataset) + 1)
+        elif len(positions) != len(dataset):
             raise ValueError(datashape_message.format("positions"))
 
-        # TODO: Use kde estimation function on x
-        # These numbers are contrived
-        coords = np.arange(0.0, np.pi, np.pi/100.)
-        datasets = map(lambda i: np.sin(coords) ** i, range(1,len(x) + 1))
-        
-        for d,x in zip(datasets,positions):
-            # Since each data point p is plotted from x-p to x+p,
+        for d,p in zip(dataset,positions):            
+            # Calculate the kernel density
+            kde = mlab.ksdensity(d)
+            m = kde['xmin']
+            M = kde['xmax']
+            mean = kde['mean']
+            median = kde['median']
+            v = kde['result']
+            coords = np.arange(m,M,(M-m)/100.)
+
+            # Since each density value v is plotted from p-v to p+v,
             # we need to scale it by an additional 0.5 factor so that we get
             # correct width in the end.
-            d = 0.5 * widths * d/d.max()
-            m = d.min() # This should actually be the min for the dataset
-            M = d.max() # likewise
-            # bodies += [self.fill_betweenx(np.arange(m,M,(M-m)/100.),
+            v = 0.5 * width * v/v.max()
+
             bodies += [self.fill_betweenx(coords,
-                                          -d+x,
-                                          d+x,
+                                          -v+p,
+                                          v+p,
                                           facecolor='y',
                                           alpha=0.3)]
 
diff --git a/lib/matplotlib/mlab.py b/lib/matplotlib/mlab.py
@@ -3656,6 +3656,151 @@ def stineman_interp(xi,x,y,yp=None):
                                   1/(dy1+dy2),))
     return yi
 
+def ksdensity(dataset, bw_method=None):
+    """
+    Evaluate a Gaussian kernel-density estimate of *dataset*.
+
+    The density is evaluated on a grid of roughly 100 evenly spaced
+    points running from the minimum of *dataset* up to (but excluding)
+    its maximum.
+
+    Call signature::
+
+      kde = ksdensity(dataset, bw_method='scott')
+
+    Parameters
+    ----------
+    dataset : array_like
+        Datapoints to estimate from, given as a 1-D array.  The input is
+        promoted with `numpy.atleast_2d` to shape (# of dims, # of data),
+        but the evaluation grid built here is one-dimensional, so
+        multivariate input is normally rejected with a ValueError.
+    bw_method : str or scalar, optional
+        The method used to calculate the estimator bandwidth.  This can be
+        'scott', 'silverman', or a scalar constant.  If a scalar, it is
+        used directly as the bandwidth factor by which the data covariance
+        is scaled.  If None (default), 'scott' is used.
+
+    Returns
+    -------
+    A dictionary describing the computed KDE, with the following keys:
+
+        xmin : number
+            The min of the input dataset
+        xmax : number
+            The max of the input dataset
+        mean : number
+            The mean of the values in `result`
+        median : number
+            The median of the values in `result`
+        result : (# of points,)-array
+            The estimated PDF evaluated on the grid
+            ``np.arange(xmin, xmax, (xmax - xmin) / 100.)``
+
+    Raises
+    ------
+    ValueError : if `dataset` has fewer than two elements, if `bw_method`
+                 is not recognized, or if the dimensionality of the
+                 evaluation points differs from that of `dataset`.
+
+    Notes
+    -----
+    With the data covariance C scaled by the squared bandwidth factor,
+    the estimate at a grid point x is the sum over data points x_i of
+    exp(-(x - x_i)^T C^-1 (x - x_i) / 2), divided by
+    n * sqrt(det(2 * pi * C)).
+
+    """
+
+    # Adapted, with minor modification, from scipy:
+    # https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
+
+    dataset = np.atleast_2d(dataset)
+
+    # Validate before reducing: min()/max() on an empty array fail with an
+    # unrelated "zero-size array" error instead of the message below.
+    if not dataset.size > 1:
+        raise ValueError("`dataset` input should have multiple elements.")
+
+    xmin = dataset.min()
+    xmax = dataset.max()
+
+    d, n = dataset.shape
+
+    # ----------------------------------------------
+    # Set Bandwidth, defaulted to Scott's Factor
+    # ----------------------------------------------
+    if bw_method is None or bw_method == 'scott':
+        factor = np.power(n, -1./(d+4))
+    elif bw_method == 'silverman':
+        factor = np.power(n*(d+2.0)/4.0, -1./(d+4))
+    elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
+        factor = bw_method
+    else:
+        msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
+        raise ValueError(msg)
+
+    # ---------------------------------------------------------------
+    # Computes covariance matrix for each Gaussian kernel with factor
+    # ---------------------------------------------------------------
+    # Covariance of the raw data and its inverse; both are rescaled by the
+    # squared bandwidth factor below.
+    data_covariance = np.atleast_2d(np.cov(dataset, rowvar=1, bias=False))
+    data_inv_cov = np.linalg.inv(data_covariance)
+
+    covariance = data_covariance * factor**2
+    inv_cov = data_inv_cov / factor**2
+    norm_factor = np.sqrt(np.linalg.det(2*np.pi*covariance)) * n
+
+    # ----------------------------------------------
+    # Evaluate the estimated pdf on a set of points.
+    # ----------------------------------------------
+    # NOTE: Axes.violinplot rebuilds this same grid from xmin/xmax with an
+    # identical np.arange call, so the expression must stay in sync.
+    points = np.atleast_2d(np.arange(xmin, xmax, (xmax-xmin)/100.))
+
+    d1, m1 = points.shape
+    if d1 != d:
+        if d1 == 1 and m1 == d:
+            # points was passed in as a row vector
+            points = np.reshape(points, (d, 1))
+            m1 = 1
+        else:
+            msg = "points have dimension %s, dataset has dimension %s" % (d1, d)
+            raise ValueError(msg)
+
+    result = np.zeros((m1,), dtype=float)
+
+    # Accumulate the unnormalized kernel sum, looping over whichever of
+    # the data or the grid is shorter.
+    if m1 >= n:
+        # there are more points than data, so loop over data
+        for i in range(n):
+            diff = dataset[:, i, np.newaxis] - points
+            tdiff = np.dot(inv_cov, diff)
+            energy = np.sum(diff*tdiff, axis=0) / 2.0
+            result = result + np.exp(-energy)
+    else:
+        # loop over points
+        for i in range(m1):
+            diff = dataset - points[:, i, np.newaxis]
+            tdiff = np.dot(inv_cov, diff)
+            energy = np.sum(diff * tdiff, axis=0) / 2.0
+            result[i] = np.sum(np.exp(-energy), axis=0)
+
+    # Apply the Gaussian normalization constant and the 1/n average.
+    result = result / norm_factor
+
+    # NOTE(review): 'mean' and 'median' summarize the density values, not
+    # the input data -- confirm this is what callers expect.
+    return {
+        'xmin' : xmin,
+        'xmax' : xmax,
+        'mean' : np.mean(result),
+        'median' : np.median(result),
+        'result' : result
+    }
+
 ##################################################
 # Code related to things in and around polygons
 ##################################################
diff --git a/lib/matplotlib/pylab.py b/lib/matplotlib/pylab.py
@@ -23,6 +23,7 @@
   broken_barh - a set of horizontal bars with gaps
   box       - set the axes frame on/off state
   boxplot   - make a box and whisker plot
+  violinplot - make a violin plot
   cla       - clear current axes
   clabel    - label a contour plot
   clf       - clear a figure window
@@ -162,8 +163,8 @@
 
 _Statistics
 
-  amax       - the maximum along dimension m
-  amin       - the minimum along dimension m
+  amax      - the maximum along dimension m
+  amin      - the minimum along dimension m
   corrcoef  - correlation coefficient
   cov       - covariance matrix
   mean      - the mean along dimension m
@@ -172,7 +173,8 @@
   prod      - the product along dimension m
   ptp       - the max-min along dimension m
   std       - the standard deviation along dimension m
-  asum       - the sum along dimension m
+  asum      - the sum along dimension m
+  ksdensity - the kernel density estimate
 
 _Time series analysis