Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit f2da1f0

Browse files
committed
DOC: normalizing histograms
1 parent 8be243c commit f2da1f0

File tree

3 files changed

+257
-62
lines changed

3 files changed

+257
-62
lines changed

galleries/examples/statistics/hist.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
axs[0].hist(dist1, bins=n_bins)
3737
axs[1].hist(dist2, bins=n_bins)
3838

39+
plt.show()
40+
3941

4042
# %%
4143
# Updating histogram colors
@@ -99,8 +101,6 @@
99101
# We can also define custom numbers of bins for each axis
100102
axs[2].hist2d(dist1, dist2, bins=(80, 10), norm=colors.LogNorm())
101103

102-
plt.show()
103-
104104
# %%
105105
#
106106
# .. admonition:: References

galleries/examples/statistics/histogram_features.py

Lines changed: 0 additions & 60 deletions
This file was deleted.
Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
"""
2+
.. redirect-from:: /gallery/statistics/histogram_features
3+
4+
===================================
5+
Histogram bins, density, and weight
6+
===================================
7+
8+
The `.Axes.hist` method can flexibly create histograms in a few different ways,
9+
which is flexible and helpful, but can also lead to confusion. In particular,
10+
you can:
11+
12+
- bin the data as you want, either with an automatically chosen number of
13+
bins, or with fixed bin edges,
14+
- normalize the histogram so that its integral is one,
15+
- and assign weights to the data points, so that each data point affects the
16+
count in its bin differently.
17+
18+
The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results,
19+
therefore users should consult the numpy documentation for a definitive guide.
20+
21+
Histograms are created by defining bin edges, and taking a dataset of values
22+
and sorting them into the bins, and counting or summing how much data is in
23+
each bin. In this simple example, 9 numbers between 1 and 4 are sorted into 3
24+
bins:
25+
"""
26+
27+
import matplotlib.pyplot as plt
28+
import numpy as np
29+
30+
rng = np.random.default_rng(19680801)
31+
32+
xdata = np.array([1.2, 2.3, 3.3, 3.1, 1.7, 3.4, 2.1, 1.25, 1.3])
33+
xbins = np.array([1, 2, 3, 4])
34+
35+
# changing the style of the histogram bars just to make it
36+
# very clear where the boundaries of the bins are:
37+
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}
38+
39+
fig, ax = plt.subplots()
40+
ax.hist(xdata, bins=xbins, **style)
41+
42+
# plot the xdata locations on the x axis:
43+
ax.plot(xdata, 0*xdata, 'd')
44+
ax.set_ylabel('Number per bin')
45+
ax.set_xlabel('x bins (dx=1.0)')
46+
47+
# %%
48+
# Modifying bins
49+
# ==============
50+
#
51+
# Changing the bin size changes the shape of this sparse histogram, so its a
52+
# good idea to choose bins with some care with respect to your data. Here we
53+
# make the bins half as wide.
54+
55+
xbins = np.arange(1, 4.5, 0.5)
56+
57+
fig, ax = plt.subplots()
58+
ax.hist(xdata, bins=xbins, **style)
59+
ax.plot(xdata, 0*xdata, 'd')
60+
ax.set_ylabel('Number per bin')
61+
ax.set_xlabel('x bins (dx=0.5)')
62+
63+
# %%
64+
# We can also let numpy (via Matplotlib) choose the bins automatically, or
65+
# specify a number of bins to choose automatically:
66+
67+
fig, ax = plt.subplot_mosaic([['auto', 'n4']],
68+
sharex=True, sharey=True, layout='constrained')
69+
70+
ax['auto'].hist(xdata, **style)
71+
ax['auto'].plot(xdata, 0*xdata, 'd')
72+
ax['auto'].set_ylabel('Number per bin')
73+
ax['auto'].set_xlabel('x bins (auto)')
74+
75+
ax['n4'].hist(xdata, bins=4, **style)
76+
ax['n4'].plot(xdata, 0*xdata, 'd')
77+
ax['n4'].set_xlabel('x bins ("bins=4")')
78+
79+
# %%
80+
# Normalizing histograms: density and weight
81+
# ==========================================
82+
#
83+
# Counts-per-bin is the default length of each bar in the histogram. However,
84+
# we can also normalize the bar lengths as a probability density function using
85+
# the ``density`` parameter:
86+
87+
fig, ax = plt.subplots()
88+
ax.hist(xdata, bins=xbins, density=True, **style)
89+
ax.set_ylabel('Probability density [$V^{-1}$])')
90+
ax.set_xlabel('x bins (dx=0.5 $V$)')
91+
92+
# %%
93+
# This normalization can be a little hard to interpret when just exploring the
94+
# data. The value attached to each bar is divided by the total number of data
95+
# points *and* the width of the bin, and thus the values _integrate_ to one
96+
# when integrating across the full range of data.
97+
# e.g. ::
98+
#
99+
# density = counts / (sum(counts) * np.diff(bins))
100+
# np.sum(density * np.diff(bins)) == 1
101+
#
102+
# This normalization is how `probability density functions
103+
# <https://en.wikipedia.org/wiki/Probability_density_function>`_ are defined in
104+
# statistics. If :math:`X` is a random variable on :math:`x`, then :math:`f_X`
105+
# is is the probability density function if :math:`P[a<X<b] = \int_a^b f_X dx`.
106+
# If the units of x are Volts, then the units of :math:`f_X` are :math:`V^{-1}`
107+
# or probability per change in voltage.
108+
#
109+
# The usefulness of this normalization is a little more clear when we draw from
110+
# a known distribution and try to compare with theory. So, choose 1000 points
111+
# from a `normal distribution
112+
# <https://en.wikipedia.org/wiki/Normal_distribution>`_, and also calculate the
113+
# known probability density function:
114+
115+
xdata = rng.normal(size=1000)
116+
xpdf = np.arange(-4, 4, 0.1)
117+
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
118+
119+
# %%
120+
# If we don't use ``density=True``, we need to scale the expected probability
121+
# distribution function by both the length of the data and the width of the
122+
# bins:
123+
124+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
125+
dx = 0.1
126+
xbins = np.arange(-4, 4, dx)
127+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
128+
129+
# scale and plot the expected pdf:
130+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
131+
ax['False'].set_ylabel('Count per bin')
132+
ax['False'].set_xlabel('x bins [V]')
133+
ax['False'].legend()
134+
135+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
136+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
137+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
138+
ax['True'].set_xlabel('x bins [$V$]')
139+
ax['True'].legend()
140+
141+
# %%
142+
# One advantage of using the density is therefore that the shape and amplitude
143+
# of the histogram does not depend on the size of the bins. Consider an
144+
# extreme case where the bins do not have the same width. In this example, the
145+
# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
146+
# normalizing by density, we preserve the shape of the distribution, whereas if
147+
# we do not, then the wider bins have much higher counts than the thinner bins:
148+
149+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
150+
dx = 0.1
151+
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
152+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
153+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
154+
ax['False'].set_ylabel('Count per bin')
155+
ax['False'].set_xlabel('x bins [V]')
156+
ax['False'].legend()
157+
158+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
159+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
160+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
161+
ax['True'].set_xlabel('x bins [$V$]')
162+
ax['True'].legend()
163+
164+
# %%
165+
# Similarly, if we want to compare histograms with different bin widths, we may
166+
# want to use ``density=True``:
167+
168+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
169+
170+
# expected PDF
171+
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
172+
173+
for nn, dx in enumerate([0.1, 0.4, 1.2]):
174+
xbins = np.arange(-4, 4, dx)
175+
# expected histogram:
176+
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
177+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
178+
179+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
180+
181+
# Labels:
182+
ax['False'].set_xlabel('x bins [$V$]')
183+
ax['False'].set_ylabel('Count per bin')
184+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
185+
ax['True'].set_xlabel('x bins [$V$]')
186+
ax['True'].legend(fontsize='small', title='bin width:')
187+
188+
# %%
189+
# Sometimes people want to normalize so that the sum of counts is one. This is
190+
# analogous to a `probability mass function
191+
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
192+
# variable where the sum of probabilities for all the values equals one. Using
193+
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
194+
# Note that the amplitude of this normalized histogram still depends on
195+
# width and/or number of the bins:
196+
197+
fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
198+
199+
for nn, dx in enumerate([0.1, 0.4, 1.2]):
200+
xbins = np.arange(-4, 4, dx)
201+
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
202+
histtype='step', label=f'{dx}')
203+
ax.set_xlabel('x bins [$V$]')
204+
ax.set_ylabel('Bin count / N')
205+
ax.legend(fontsize='small', title='bin width:')
206+
207+
# %%
208+
# The value of normalizing histograms is comparing two distributions that have
209+
# different sized populations. Here we compare the distribution of ``xdata``
210+
# with a population of 1000, and ``xdata2`` with 100 members.
211+
212+
xdata2 = rng.normal(size=100)
213+
214+
fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
215+
layout='constrained', figsize=(8, 4))
216+
217+
xbins = np.arange(-4, 4, 0.25)
218+
219+
ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
220+
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
221+
ax['no_norm'].set_ylabel('Counts')
222+
ax['no_norm'].set_xlabel('x bins [$V$]')
223+
ax['no_norm'].set_title('No normalization')
224+
225+
ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
226+
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
227+
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
228+
ax['density'].set_title('Density=True')
229+
ax['density'].set_xlabel('x bins [$V$]')
230+
231+
ax['weight'].hist(xdata, bins=xbins, histtype='step',
232+
weights=1 / len(xdata) * np.ones(len(xdata)),
233+
label='N=1000')
234+
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
235+
weights=1 / len(xdata2) * np.ones(len(xdata2)),
236+
label='N=100')
237+
ax['weight'].set_xlabel('x bins [$V$]')
238+
ax['weight'].set_ylabel('Counts / N')
239+
ax['weight'].legend(fontsize='small')
240+
ax['weight'].set_title('Weight = 1/N')
241+
242+
plt.show()
243+
244+
# %%
245+
#
246+
# .. admonition:: References
247+
#
248+
# The use of the following functions, methods, classes and modules is shown
249+
# in this example:
250+
#
251+
# - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist`
252+
# - `matplotlib.axes.Axes.set_title`
253+
# - `matplotlib.axes.Axes.set_xlabel`
254+
# - `matplotlib.axes.Axes.set_ylabel`
255+
# - `matplotlib.axes.Axes.legend`

0 commit comments

Comments
 (0)