Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit de7b768

Browse files
committed
doc: follow up for normalizing histogram
1 parent 60d2f95 commit de7b768

File tree

2 files changed

+162
-110
lines changed

2 files changed

+162
-110
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,28 @@
11
"""
22
.. redirect-from:: /gallery/statistics/histogram_features
33
4-
===================================
5-
Histogram bins, density, and weight
6-
===================================
4+
.. _histogram_normalization:
5+
6+
==================================
7+
bins, density, and weights in hist
8+
==================================
79
810
The `.Axes.hist` method can flexibly create histograms in a few different ways,
911
which is helpful, but can also lead to confusion. In particular,
10-
you can:
12+
the method has the following parameters:
1113
12-
- bin the data as you want, either with an automatically chosen number of
14+
- ``bins`` bin the data as you want, either with an automatically chosen number of
1315
bins, or with fixed bin edges,
14-
- normalize the histogram so that its integral is one,
15-
- and assign weights to the data points, so that each data point affects the
16-
count in its bin differently.
16+
- ``density`` normalize the histogram so that its integral is one,
17+
- ``weights`` assign weights to each data point so that each point affects the
18+
count differently.
1719
1820
The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results,
1921
therefore users should consult the numpy documentation for a definitive guide.
2022
2123
Histograms are created by defining bin edges, and taking a dataset of values
2224
and sorting them into the bins, and counting or summing how much data is in
23-
each bin. In this simple example, 9 numbers between 1 and 4 are sorted into 3
24-
bins:
25+
each bin. In this example, 9 numbers between 1 and 4 are sorted into 3 bins:
2526
"""
2627

2728
import matplotlib.pyplot as plt
@@ -36,63 +37,86 @@
3637
# very clear where the boundaries of the bins are:
3738
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}
3839

39-
fig, ax = plt.subplots()
40+
fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))
41+
42+
# count the number of values in xdata between each value in xbins
4043
ax.hist(xdata, bins=xbins, **style)
4144

42-
# plot the xdata locations on the x axis:
43-
ax.plot(xdata, 0*xdata, 'd')
44-
ax.set_ylabel('Number per bin')
45-
ax.set_xlabel('x bins (dx=1.0)')
45+
# plot the xdata events
46+
ax.eventplot(xdata, color='C1', alpha=.5)
47+
48+
ax.set(ylabel='Number per bin', xlabel='x bins (dx=1.0)', title='histogram')
4649

4750
# %%
48-
# Modifying bins
49-
# ==============
51+
# bins
52+
# ====
5053
#
5154
# Changing the bin size changes the shape of this sparse histogram, so it's a
52-
# good idea to choose bins with some care with respect to your data. Here we
53-
# make the bins half as wide.
55+
# good idea to choose bins with some care with respect to your data. The `.Axes.hist`
56+
# *bins* parameter accepts either the number of bins or a list of bin edges.
57+
#
58+
#
59+
# Fixed bin edges
60+
# ---------------
61+
#
62+
# Here the bins are set to the list of edges [1, 1.5, 2, 2.5, 3, 3.5, 4].
63+
# These bins are half as wide as those in the previous example.
5464

5565
xbins = np.arange(1, 4.5, 0.5)
5666

57-
fig, ax = plt.subplots()
67+
fig, ax = plt.subplots(layout='constrained', figsize=(8, 3))
68+
5869
ax.hist(xdata, bins=xbins, **style)
59-
ax.plot(xdata, 0*xdata, 'd')
60-
ax.set_ylabel('Number per bin')
61-
ax.set_xlabel('x bins (dx=0.5)')
70+
71+
ax.eventplot(xdata, lineoffsets=.5, color='C1', alpha=.5)
72+
73+
ax.set(ylabel='count', xlabel='x bins (dx=0.5)',
74+
title='fixed bin edges: bins=np.arange(1, 4.5, .5)',)
6275

6376
# %%
77+
#
78+
# Number of bins
79+
# --------------
80+
#
6481
# We can also let numpy (via Matplotlib) choose the bins automatically, or
6582
# specify a number of bins to choose automatically:
6683

67-
fig, ax = plt.subplot_mosaic([['auto', 'n4']],
68-
sharex=True, sharey=True, layout='constrained')
84+
fig, ax = plt.subplot_mosaic([['auto'], ['n4']],
85+
sharex=True, sharey=True,
86+
layout='constrained', figsize=(8, 6))
6987

7088
ax['auto'].hist(xdata, **style)
71-
ax['auto'].plot(xdata, 0*xdata, 'd')
72-
ax['auto'].set_ylabel('Number per bin')
73-
ax['auto'].set_xlabel('x bins (auto)')
89+
ax['auto'].eventplot(xdata, lineoffsets=.5, color='C1', alpha=.5)
90+
91+
ax['auto'].set(ylabel='count', xlabel='x bins',
92+
title='dynamically computed bin edges: bins="auto"')
7493

7594
ax['n4'].hist(xdata, bins=4, **style)
76-
ax['n4'].plot(xdata, 0*xdata, 'd')
77-
ax['n4'].set_xlabel('x bins ("bins=4")')
95+
ax['n4'].eventplot(xdata, lineoffsets=.5, color='C1', alpha=.5)
96+
97+
ax['n4'].set(ylabel='count', xlabel='x bins',
98+
title='fixed number of bins: bins=4',)
7899

79100
# %%
80-
# Normalizing histograms: density and weight
81-
# ==========================================
101+
# density
102+
# =======
82103
#
83104
# Counts-per-bin is the default length of each bar in the histogram. However,
84105
# we can also normalize the bar lengths as a probability density function using
85106
# the ``density`` parameter:
86107

87-
fig, ax = plt.subplots()
108+
fig, ax = plt.subplots(layout='constrained', figsize=(8, 3))
109+
88110
ax.hist(xdata, bins=xbins, density=True, **style)
89-
ax.set_ylabel('Probability density [$V^{-1}$])')
90-
ax.set_xlabel('x bins (dx=0.5 $V$)')
111+
112+
ax.set(ylabel='Probability density [$V^{-1}$]',
113+
xlabel='x bins (dx=0.5 $V$)',
114+
title='normalizing histogram using density')
91115

92116
# %%
93117
# This normalization can be a little hard to interpret when just exploring the
94118
# data. The value attached to each bar is divided by the total number of data
95-
# points *and* the width of the bin, and thus the values _integrate_ to one
119+
# points *and* the width of the bin, and thus the values *integrate* to one
96120
# when integrating across the full range of data.
97121
# e.g. ::
98122
#
@@ -121,123 +145,148 @@
121145
# distribution function by both the length of the data and the width of the
122146
# bins:
123147

124-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
125148
dx = 0.1
126149
xbins = np.arange(-4, 4, dx)
127-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
128150

151+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained',
152+
figsize=(8, 4))
153+
154+
155+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
129156
# scale and plot the expected pdf:
130-
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
131-
ax['False'].set_ylabel('Count per bin')
132-
ax['False'].set_xlabel('x bins [V]')
133-
ax['False'].legend()
157+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$', alpha=.5)
158+
134159

135160
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
136-
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
137-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
138-
ax['True'].set_xlabel('x bins [$V$]')
161+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$', alpha=.5)
162+
163+
164+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
165+
title="normalization using scaling, density=False")
166+
ax['False'].legend()
167+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
168+
title="density=True")
139169
ax['True'].legend()
140170

141171
# %%
142-
# One advantage of using the density is therefore that the shape and amplitude
143-
# of the histogram does not depend on the size of the bins. Consider an
144-
# extreme case where the bins do not have the same width. In this example, the
145-
# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
146-
# normalizing by density, we preserve the shape of the distribution, whereas if
147-
# we do not, then the wider bins have much higher counts than the thinner bins:
148-
149-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
172+
# weights
173+
# =======
174+
#
175+
# Sometimes people want to normalize so that the sum of counts is one. This is
176+
# analogous to a `probability mass function
177+
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
178+
# variable where the sum of probabilities for all the values equals one.
179+
#
180+
# Using ``hist``, we can get this normalization if we set the *weights* to 1/N.
181+
# Note that the amplitude of this normalized histogram still depends on
182+
# width and/or number of bins:
183+
184+
fig, ax = plt.subplots(layout='constrained', figsize=(8, 3))
185+
186+
for nn, dx in enumerate([0.1, 0.4, 1.2]):
187+
xbins = np.arange(-4, 4, dx)
188+
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
189+
histtype='step', label=f'{dx}')
190+
191+
ax.set(ylabel='Bin count / N', xlabel='x bins [$V$]',
192+
title="histogram normalization using weights")
193+
ax.legend(fontsize='small', title='bin width:')
194+
195+
# %%
196+
# Case studies
197+
# ============
198+
#
199+
# Histogram normalization is used to compare histograms of different populations or
200+
# generated in different ways.
201+
#
202+
# Irregularly spaced bins
203+
# -----------------------
204+
# One advantage of using the density is that the shape and amplitude of the histogram
205+
# do not depend on the size of the bins. Consider an extreme case where the bins do
206+
# not have the same width. In this example, the bins below ``x=-1.25`` are six times
207+
# wider than the rest of the bins. By normalizing by density, we preserve the shape of
208+
# the distribution, whereas if we do not, then the wider bins have much higher counts
209+
# than the thinner bins:
210+
150211
dx = 0.1
151212
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
213+
214+
fig, ax = plt.subplot_mosaic([['False', 'True']],
215+
layout='constrained', figsize=(8, 3))
216+
217+
152218
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
153-
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
154-
ax['False'].set_ylabel('Count per bin')
155-
ax['False'].set_xlabel('x bins [V]')
156-
ax['False'].legend()
219+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$',
220+
alpha=.5)
157221

158222
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
159-
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
160-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
161-
ax['True'].set_xlabel('x bins [$V$]')
223+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$', alpha=.5)
224+
225+
226+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
227+
title="irregularly spaced bins, density=False")
228+
ax['False'].legend()
229+
230+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
231+
title="irregularly spaced bins, density=True",)
162232
ax['True'].legend()
163233

164234
# %%
235+
# Different bin widths
236+
# --------------------
237+
#
165238
# Similarly, if we want to compare histograms with different bin widths, we may
166239
# want to use ``density=True``:
167240

168-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
241+
fig, ax = plt.subplot_mosaic([['False', 'True']],
242+
layout='constrained', figsize=(8, 3))
169243

170244
# expected PDF
171245
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
172246

173247
for nn, dx in enumerate([0.1, 0.4, 1.2]):
174248
xbins = np.arange(-4, 4, dx)
175249
# expected histogram:
176-
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
250+
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}', alpha=.5)
177251
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
178252

179253
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
180254

181-
# Labels:
182-
ax['False'].set_xlabel('x bins [$V$]')
183-
ax['False'].set_ylabel('Count per bin')
184-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
185-
ax['True'].set_xlabel('x bins [$V$]')
255+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [$V$]',
256+
title="density=False")
257+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
258+
title='density=True')
186259
ax['True'].legend(fontsize='small', title='bin width:')
187-
188-
# %%
189-
# Sometimes people want to normalize so that the sum of counts is one. This is
190-
# analogous to a `probability mass function
191-
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
192-
# variable where the sum of probabilities for all the values equals one. Using
193-
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
194-
# Note that the amplitude of this normalized histogram still depends on
195-
# width and/or number of the bins:
196-
197-
fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
198-
199-
for nn, dx in enumerate([0.1, 0.4, 1.2]):
200-
xbins = np.arange(-4, 4, dx)
201-
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
202-
histtype='step', label=f'{dx}')
203-
ax.set_xlabel('x bins [$V$]')
204-
ax.set_ylabel('Bin count / N')
205-
ax.legend(fontsize='small', title='bin width:')
206-
207260
# %%
261+
# Populations of different sizes
262+
# ------------------------------
263+
#
208264
# The value of normalizing histograms is comparing two distributions that have
209-
# different sized populations. Here we compare the distribution of ``xdata``
265+
# different sized populations. Here we compare the distribution of ``xdata``
210266
# with a population of 1000, and ``xdata2`` with 100 members.
211267

212268
xdata2 = rng.normal(size=100)
213269

214270
fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
215-
layout='constrained', figsize=(8, 4))
271+
layout='constrained', figsize=(9, 3))
216272

217273
xbins = np.arange(-4, 4, 0.25)
218274

219-
ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
220-
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
221-
ax['no_norm'].set_ylabel('Counts')
222-
ax['no_norm'].set_xlabel('x bins [$V$]')
223-
ax['no_norm'].set_title('No normalization')
224-
225-
ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
226-
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
227-
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
228-
ax['density'].set_title('Density=True')
229-
ax['density'].set_xlabel('x bins [$V$]')
230-
231-
ax['weight'].hist(xdata, bins=xbins, histtype='step',
232-
weights=1 / len(xdata) * np.ones(len(xdata)),
233-
label='N=1000')
234-
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
235-
weights=1 / len(xdata2) * np.ones(len(xdata2)),
236-
label='N=100')
237-
ax['weight'].set_xlabel('x bins [$V$]')
238-
ax['weight'].set_ylabel('Counts / N')
275+
for xd in [xdata, xdata2]:
276+
ax['no_norm'].hist(xd, bins=xbins, histtype='step')
277+
ax['density'].hist(xd, bins=xbins, histtype='step', density=True)
278+
N = len(xd)
279+
ax['weight'].hist(xd, bins=xbins, histtype='step', weights=1 / N * np.ones(N),
280+
label=f'N={N}')
281+
282+
283+
ax['no_norm'].set(ylabel='Counts', xlabel='x bins [$V$]',
284+
title='No normalization')
285+
ax['density'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
286+
title='Density=True')
287+
ax['weight'].set(ylabel='Counts / N', xlabel='x bins [$V$]',
288+
title='Weight = 1/N')
239289
ax['weight'].legend(fontsize='small')
240-
ax['weight'].set_title('Weight = 1/N')
241290

242291
plt.show()
243292

@@ -253,3 +302,4 @@
253302
# - `matplotlib.axes.Axes.set_xlabel`
254303
# - `matplotlib.axes.Axes.set_ylabel`
255304
# - `matplotlib.axes.Axes.legend`
305+
#

galleries/tutorials/index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ a :ref:`FAQ <faq-index>` in our :ref:`user guide <users-guide-index>`.
9595
/tutorials/images
9696
/tutorials/lifecycle
9797
/tutorials/artists
98+
/tutorials/histogram_normalization
9899

99100
.. only:: html
100101

@@ -134,6 +135,7 @@ Intermediate
134135
- :ref:`arranging_axes`
135136
- :ref:`autoscale`
136137
- :ref:`imshow_extent`
138+
- :ref:`histogram_normalization`
137139

138140
Advanced
139141
^^^^^^^^

0 commit comments

Comments
 (0)