Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit da40c9d

Browse files
khchansolvents
authored and committed
Added ksdensity (KDE) function in mlab and adapted _axes violinplot function to accept new KDE.
Added basic violinplot demo in examples
1 parent d9f71fb commit da40c9d

File tree

4 files changed

+223
-24
lines changed

4 files changed

+223
-24
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""
2+
Demo of the new violinplot functionality
3+
"""
4+
5+
import random
6+
import numpy as np
7+
import matplotlib.pyplot as plt
8+
9+
# Synthetic input: five samples of 100 normally distributed values each.
fs = 10  # fontsize shared by every subplot title
pos = range(5)
data = [np.random.normal(size=100) for _ in pos]

# TODO: future customizability dicts go here

# (From boxplot demo)
# demonstrate how to customize the display different elements:
# boxprops = dict(linestyle='--', linewidth=3, color='darkgoldenrod')
# flierprops = dict(marker='o', markerfacecolor='green', markersize=12,
#                   linestyle='none')
# medianprops = dict(linestyle='-.', linewidth=2.5, color='firebrick')
# meanpointprops = dict(marker='D', markeredgecolor='black',
#                       markerfacecolor='firebrick')
# meanlineprops = dict(linestyle='--', linewidth=2.5, color='purple')

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6, 6))

# One violin width per subplot, assigned in row-major order so that the
# top-left panel gets the narrowest violins and the bottom-right the widest.
violin_widths = (0.1, 0.3, 0.5, 0.7, 0.9, 1.1)

for number, (ax, violin_width) in enumerate(zip(axes.flat, violin_widths), 1):
    ax.violinplot(data, pos, width=violin_width)
    ax.set_title('Custom violinplot %d' % number, fontsize=fs)
    # The y tick labels are hidden on every panel.
    ax.set_yticklabels([])

fig.suptitle("Violin Plotting Examples")
fig.subplots_adjust(hspace=0.4)
plt.show()

lib/matplotlib/axes/_axes.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6725,24 +6725,23 @@ def matshow(self, Z, **kwargs):
67256725
integer=True))
67266726
return im
67276727

6728-
6729-
def violinplot(self, x, positions=None, width=0.5):
6728+
def violinplot(self, dataset, positions=None, width=0.5):
67306729
"""
67316730
Make a violin plot.
67326731
67336732
Call signature::
67346733
6735-
violinplot(x, positions=None)
6734+
violinplot(dataset, positions=None)
67366735
6737-
Make a violin plot for each column of *x* or each
6738-
vector in sequence *x*. Each filled area extends to represent the
6736+
Make a violin plot for each column of *dataset* or each vector in
6737+
sequence *dataset*. Each filled area extends to represent the
67396738
entire data range, with three lines at the mean, the minimum, and
67406739
the maximum.
67416740
67426741
Parameters
67436742
----------
67446743
6745-
x : Array or a sequence of vectors.
6744+
dataset : Array or a sequence of vectors.
67466745
The input data.
67476746
67486747
positions : array-like, default = [1, 2, ..., n]
@@ -6777,26 +6776,28 @@ def violinplot(self, x, positions=None, width=0.5):
67776776
caps = []
67786777

67796778
if positions == None:
6780-
positions = range(1, len(x) + 1)
6781-
elif len(positions) != len(x):
6779+
positions = range(1, len(dataset) + 1)
6780+
elif len(positions) != len(dataset):
67826781
raise ValueError(datashape_message.format("positions"))
67836782

6784-
# TODO: Use kde estimation function on x
6785-
# These numbers are contrived
6786-
coords = np.arange(0.0, np.pi, np.pi/100.)
6787-
datasets = map(lambda i: np.sin(coords) ** i, range(1,len(x) + 1))
6788-
6789-
for d,x in zip(datasets,positions):
6790-
# Since each data point p is plotted from x-p to x+p,
6783+
for d,p in zip(dataset,positions):
6784+
# Calculate the kernel density
6785+
kde = mlab.ksdensity(d)
6786+
m = kde['xmin']
6787+
M = kde['xmax']
6788+
mean = kde['mean']
6789+
median = kde['median']
6790+
v = kde['result']
6791+
coords = np.arange(m,M,(M-m)/100.)
6792+
6793+
# Since each data point p is plotted from v-p to v+p,
67916794
# we need to scale it by an additional 0.5 factor so that we get
67926795
# correct width in the end.
6793-
d = 0.5 * widths * d/d.max()
6794-
m = d.min() # This should actually be the min for the dataset
6795-
M = d.max() # likewise
6796-
# bodies += [self.fill_betweenx(np.arange(m,M,(M-m)/100.),
6796+
v = 0.5 * width * v/v.max()
6797+
67976798
bodies += [self.fill_betweenx(coords,
6798-
-d+x,
6799-
d+x,
6799+
-v+p,
6800+
v+p,
68006801
facecolor='y',
68016802
alpha=0.3)]
68026803

lib/matplotlib/mlab.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3656,6 +3656,151 @@ def stineman_interp(xi,x,y,yp=None):
36563656
1/(dy1+dy2),))
36573657
return yi
36583658

3659+
def ksdensity(dataset, bw_method=None):
    """
    Evaluate a Gaussian kernel-density estimate (KDE) of *dataset*.

    Call signature::

        kde = ksdensity(dataset, 'scott')

    The density is evaluated on an evenly spaced grid that starts at the
    minimum of *dataset* and advances in steps of one hundredth of the
    data range (the maximum itself is excluded from the grid).

    Parameters
    ----------
    dataset : array_like
        Datapoints to estimate from.  In case of univariate data this is a
        1-D array, otherwise a 2-D array with shape (# of dims, # of data).
        NOTE: the evaluation grid built below is one-dimensional, so
        multivariate input is normally rejected with a ValueError.
    bw_method : str or scalar, optional
        The method used to calculate the estimator bandwidth.  This can be
        'scott', 'silverman', or a scalar constant.  If a scalar, it is
        used directly as the bandwidth factor that scales the data
        covariance.  If None (default), 'scott' is used.

    Returns
    -------
    A dictionary describing the computed KDE, with the following keys:

        xmin : number
            The min of the input dataset
        xmax : number
            The max of the input dataset
        mean : number
            The mean of the result (i.e. of the evaluated density values)
        median : number
            The median of the result (i.e. of the evaluated density values)
        result : (# of points,)-array
            The array of the evaluated PDF estimation

    Raises
    ------
    ValueError : if *dataset* has fewer than two elements, if all of its
        values are identical (the covariance would be singular), if
        *bw_method* is not recognised, or if the dimensionality of the
        evaluation points differs from the dimensionality of the dataset.

    """

    # This implementation with minor modification was too good to pass up.
    # from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py

    dataset = np.atleast_2d(dataset)

    # Validate before reducing, so that empty input gets this message
    # instead of the one raised by min()/max() on a zero-size array.
    if not dataset.size > 1:
        raise ValueError("`dataset` input should have multiple elements.")

    xmin = dataset.min()
    xmax = dataset.max()

    # Constant data has zero variance: the covariance matrix cannot be
    # inverted and the evaluation grid would have a zero step.
    if xmin == xmax:
        raise ValueError("`dataset` input should contain at least two "
                         "distinct values.")

    d, n = dataset.shape

    # ----------------------------------------------
    # Set Bandwidth, defaulted to Scott's Factor
    # ----------------------------------------------
    if bw_method is None or bw_method == 'scott':
        factor = np.power(n, -1. / (d + 4))
    elif bw_method == 'silverman':
        factor = np.power(n * (d + 2.0) / 4.0, -1. / (d + 4))
    elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
        factor = bw_method
    else:
        msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
        raise ValueError(msg)

    # ---------------------------------------------------------------
    # Computes covariance matrix for each Gaussian kernel with factor
    # ---------------------------------------------------------------
    data_covariance = np.atleast_2d(np.cov(dataset, rowvar=True, bias=False))
    data_inv_cov = np.linalg.inv(data_covariance)

    covariance = data_covariance * factor ** 2
    inv_cov = data_inv_cov / factor ** 2
    norm_factor = np.sqrt(np.linalg.det(2 * np.pi * covariance)) * n

    # ----------------------------------------------
    # Evaluate the estimated pdf on a set of points.
    # ----------------------------------------------
    # The grid is built with the same arange expression callers use to
    # derive the matching coordinates, so both always have equal length.
    points = np.atleast_2d(np.arange(xmin, xmax, (xmax - xmin) / 100.))

    d1, m1 = points.shape
    if d1 != d:
        if d1 == 1 and m1 == d:
            # points was passed in as a row vector
            points = np.reshape(points, (d, 1))
            m1 = 1
        else:
            msg = "points have dimension %s, dataset has dimension %s" % (d1, d)
            raise ValueError(msg)

    # Builtin float: the `np.float` alias no longer exists in NumPy.
    result = np.zeros((m1,), dtype=float)

    if m1 >= n:
        # there are more points than data, so loop over data
        for i in range(n):
            diff = dataset[:, i, np.newaxis] - points
            tdiff = np.dot(inv_cov, diff)
            energy = np.sum(diff * tdiff, axis=0) / 2.0
            result = result + np.exp(-energy)
    else:
        # there are more data than points, so loop over points
        for i in range(m1):
            diff = dataset - points[:, i, np.newaxis]
            tdiff = np.dot(inv_cov, diff)
            energy = np.sum(diff * tdiff, axis=0) / 2.0
            result[i] = np.sum(np.exp(-energy), axis=0)

    result = result / norm_factor

    return {
        'xmin': xmin,
        'xmax': xmax,
        'mean': np.mean(result),
        'median': np.median(result),
        'result': result,
    }
3803+
36593804
##################################################
36603805
# Code related to things in and around polygons
36613806
##################################################

lib/matplotlib/pylab.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
broken_barh - a set of horizontal bars with gaps
2424
box - set the axes frame on/off state
2525
boxplot - make a box and whisker plot
26+
violinplot - make a violin plot
2627
cla - clear current axes
2728
clabel - label a contour plot
2829
clf - clear a figure window
@@ -162,8 +163,8 @@
162163
163164
_Statistics
164165
165-
amax - the maximum along dimension m
166-
amin - the minimum along dimension m
166+
amax - the maximum along dimension m
167+
amin - the minimum along dimension m
167168
corrcoef - correlation coefficient
168169
cov - covariance matrix
169170
mean - the mean along dimension m
@@ -172,7 +173,8 @@
172173
prod - the product along dimension m
173174
ptp - the max-min along dimension m
174175
std - the standard deviation along dimension m
175-
asum - the sum along dimension m
176+
asum - the sum along dimension m
177+
ksdensity - the kernel density estimate
176178
177179
_Time series analysis
178180

0 commit comments

Comments
 (0)