Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 9fdb0cd

Browse files
committed
doc: follow up for normalizing histogram
1 parent f2da1f0 commit 9fdb0cd

File tree

2 files changed

+143
-74
lines changed

2 files changed

+143
-74
lines changed

galleries/examples/statistics/histogram_normalization.py renamed to galleries/tutorials/histogram_normalization.py

Lines changed: 141 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""
22
.. redirect-from:: /gallery/statistics/histogram_features
33
4+
.. _histogram_normalization:
5+
46
===================================
57
Histogram bins, density, and weight
68
===================================
@@ -34,60 +36,84 @@
3436

3537
# changing the style of the histogram bars just to make it
3638
# very clear where the boundaries of the bins are:
37-
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}
39+
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3, 'alpha': .5}
40+
41+
fig, ax = plt.subplots(figsize=(6, 3))
3842

39-
fig, ax = plt.subplots()
43+
# count the number of values in xdata between each value in xbins
4044
ax.hist(xdata, bins=xbins, **style)
4145

4246
# plot the xdata locations on the x axis:
43-
ax.plot(xdata, 0*xdata, 'd')
44-
ax.set_ylabel('Number per bin')
45-
ax.set_xlabel('x bins (dx=1.0)')
47+
ax.stem(xdata, [.5]*len(xdata), 'd')
48+
49+
ax.set(ylabel='Number per bin', xlabel='x bins (dx=1.0)',
50+
title='histogram',)
4651

4752
# %%
4853
# Modifying bins
4954
# ==============
5055
#
5156
# Changing the bin size changes the shape of this sparse histogram, so it's a
52-
# good idea to choose bins with some care with respect to your data. Here we
53-
# make the bins half as wide.
57+
# good idea to choose bins with some care with respect to your data. The `.Axes.hist`
58+
# *bins* parameter accepts either the number of bins or a list of bin edges.
59+
#
60+
#
61+
# Set *bins* using fixed edges
62+
# -----------------------------
63+
#
64+
# Here the bins are set to the list of edges [1, 1.5, 2, 2.5, 3, 3.5, 4].
65+
# These bins are half as wide as those in the previous example.
5466

5567
xbins = np.arange(1, 4.5, 0.5)
5668

57-
fig, ax = plt.subplots()
69+
fig, ax = plt.subplots(figsize=(6, 3))
70+
5871
ax.hist(xdata, bins=xbins, **style)
59-
ax.plot(xdata, 0*xdata, 'd')
60-
ax.set_ylabel('Number per bin')
61-
ax.set_xlabel('x bins (dx=0.5)')
72+
73+
ax.stem(xdata, [.5]*len(xdata), 'd')
74+
75+
ax.set(ylabel='Number per bin', xlabel='x bins (dx=0.5)',
76+
title='fixed bin edges',)
6277

6378
# %%
79+
#
80+
# Set *bins* using number of bins
81+
# -------------------------------
82+
#
6483
# We can also let numpy (via Matplotlib) choose the bins automatically, or
6584
# specify a number of bins to choose automatically:
6685

67-
fig, ax = plt.subplot_mosaic([['auto', 'n4']],
68-
sharex=True, sharey=True, layout='constrained')
86+
fig, ax = plt.subplot_mosaic([['auto'], ['n4']],
87+
sharex=True, sharey=True,
88+
layout='constrained', figsize=(6, 6))
6989

7090
ax['auto'].hist(xdata, **style)
71-
ax['auto'].plot(xdata, 0*xdata, 'd')
72-
ax['auto'].set_ylabel('Number per bin')
73-
ax['auto'].set_xlabel('x bins (auto)')
91+
ax['auto'].stem(xdata, [.5]*len(xdata), 'd')
92+
93+
ax['auto'].set(ylabel='Number per bin', xlabel='x bins (auto)',
94+
title='dynamically computed bin edges')
7495

7596
ax['n4'].hist(xdata, bins=4, **style)
76-
ax['n4'].plot(xdata, 0*xdata, 'd')
77-
ax['n4'].set_xlabel('x bins ("bins=4")')
97+
ax['n4'].stem(xdata, [.5]*len(xdata), 'd')
98+
99+
ax['n4'].set(ylabel='Number per bin', xlabel='x bins ("bins=4")',
100+
title='fixed number of bins',)
78101

79102
# %%
80-
# Normalizing histograms: density and weight
81-
# ==========================================
103+
# Normalize histograms using density
104+
# ==================================
82105
#
83106
# Counts-per-bin is the default length of each bar in the histogram. However,
84107
# we can also normalize the bar lengths as a probability density function using
85108
# the ``density`` parameter:
86109

87-
fig, ax = plt.subplots()
110+
fig, ax = plt.subplots(figsize=(6, 3))
111+
88112
ax.hist(xdata, bins=xbins, density=True, **style)
89-
ax.set_ylabel('Probability density [$V^{-1}$])')
90-
ax.set_xlabel('x bins (dx=0.5 $V$)')
113+
114+
ax.set(ylabel='Probability density [$V^{-1}$]',
115+
xlabel='x bins (dx=0.5 $V$)',
116+
title='normalizing histogram using density')
91117

92118
# %%
93119
# This normalization can be a little hard to interpret when just exploring the
@@ -117,55 +143,83 @@
117143
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
118144

119145
# %%
146+
# *density* parameter
147+
# -------------------
148+
#
120149
# If we don't use ``density=True``, we need to scale the expected probability
121150
# distribution function by both the length of the data and the width of the
122151
# bins:
123152

124-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
125153
dx = 0.1
126154
xbins = np.arange(-4, 4, dx)
127-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
128155

156+
fig, ax = plt.subplot_mosaic([['False'], ['True']], layout='constrained',
157+
figsize=(6, 6))
158+
159+
160+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step',
161+
label='Counts', alpha=style['alpha'])
129162
# scale and plot the expected pdf:
130163
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
131-
ax['False'].set_ylabel('Count per bin')
132-
ax['False'].set_xlabel('x bins [V]')
133-
ax['False'].legend()
134164

135-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
165+
166+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step',
167+
label='density', alpha=style['alpha'])
136168
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
137-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
138-
ax['True'].set_xlabel('x bins [$V$]')
169+
170+
171+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
172+
title="normalization using scaling, density=False")
173+
ax['False'].legend()
174+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
175+
title="density=True")
139176
ax['True'].legend()
140177

141178
# %%
142-
# One advantage of using the density is therefore that the shape and amplitude
143-
# of the histogram does not depend on the size of the bins. Consider an
144-
# extreme case where the bins do not have the same width. In this example, the
145-
# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
179+
# Preserving distribution shape
180+
# -----------------------------
181+
# One advantage of using the density is that the shape and amplitude of the histogram
182+
# do not depend on the size of the bins.
183+
#
184+
# Irregularly spaced bins
185+
# ^^^^^^^^^^^^^^^^^^^^^^^
186+
# Consider an extreme case where the bins do not have the same width. In this example,
187+
# the bins below ``x=-1.25`` are six times wider than the rest of the bins. By
146188
# normalizing by density, we preserve the shape of the distribution, whereas if
147189
# we do not, then the wider bins have much higher counts than the thinner bins:
148190

149-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
150191
dx = 0.1
151192
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
152-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
193+
194+
fig, ax = plt.subplot_mosaic([['False'], ['True']],
195+
layout='constrained', figsize=(6, 6))
196+
197+
198+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step',
199+
label='Counts', alpha=style['alpha'])
153200
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
154-
ax['False'].set_ylabel('Count per bin')
155-
ax['False'].set_xlabel('x bins [V]')
156-
ax['False'].legend()
157201

158-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
202+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step',
203+
label='density', alpha=style['alpha'])
159204
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
160-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
161-
ax['True'].set_xlabel('x bins [$V$]')
205+
206+
207+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
208+
title="irregularly spaced bins, density=False")
209+
ax['False'].legend()
210+
211+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
212+
title="irregularly spaced bins, density=True",)
162213
ax['True'].legend()
163214

164215
# %%
216+
# Histograms with different bin widths
217+
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
165218
# Similarly, if we want to compare histograms with different bin widths, we may
166219
# want to use ``density=True``:
167220

168-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
221+
fig, ax = plt.subplot_mosaic([['False'], ['True']],
222+
layout='constrained', figsize=(6, 6))
169223

170224
# expected PDF
171225
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
@@ -174,70 +228,83 @@
174228
xbins = np.arange(-4, 4, dx)
175229
# expected histogram:
176230
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
177-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
231+
ax['False'].hist(xdata, bins=xbins, density=False,
232+
histtype='step', alpha=style['alpha'])
178233

179-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
234+
ax['True'].hist(xdata, bins=xbins, density=True,
235+
histtype='step', label=dx, alpha=style['alpha'])
180236

181237
# Labels:
182-
ax['False'].set_xlabel('x bins [$V$]')
183-
ax['False'].set_ylabel('Count per bin')
184-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
185-
ax['True'].set_xlabel('x bins [$V$]')
238+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [$V$]',
239+
title="density=False")
240+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
241+
title='density=True')
186242
ax['True'].legend(fontsize='small', title='bin width:')
187243

188244
# %%
245+
# Normalize histograms using weights
246+
# ==================================
247+
#
189248
# Sometimes people want to normalize so that the sum of counts is one. This is
190249
# analogous to a `probability mass function
191250
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
192-
# variable where the sum of probabilities for all the values equals one. Using
193-
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
251+
# variable where the sum of probabilities for all the values equals one.
252+
#
253+
# *weights* parameter
254+
# -------------------
255+
# Using ``hist``, we can get this normalization if we set the *weights* to 1/N.
194256
# Note that the amplitude of this normalized histogram still depends on
195-
# width and/or number of the bins:
257+
# the width and/or number of bins:
196258

197259
fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
198260

199261
for nn, dx in enumerate([0.1, 0.4, 1.2]):
200262
xbins = np.arange(-4, 4, dx)
201263
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
202-
histtype='step', label=f'{dx}')
203-
ax.set_xlabel('x bins [$V$]')
204-
ax.set_ylabel('Bin count / N')
264+
histtype='step', label=f'{dx}', alpha=style['alpha'])
265+
266+
ax.set(ylabel='Bin count / N', xlabel='x bins [$V$]',
267+
title="histogram normalization using weights")
205268
ax.legend(fontsize='small', title='bin width:')
206269

207270
# %%
271+
# Populations of different sizes
272+
# ------------------------------
208273
# The value of normalizing histograms is comparing two distributions that have
209-
# different sized populations. Here we compare the distribution of ``xdata``
274+
# different sized populations. Here we compare the distribution of ``xdata``
210275
# with a population of 1000, and ``xdata2`` with 100 members.
211276

212277
xdata2 = rng.normal(size=100)
213278

214-
fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
215-
layout='constrained', figsize=(8, 4))
279+
fig, ax = plt.subplot_mosaic([['no_norm'], ['density'], ['weight']],
280+
layout='constrained', figsize=(6, 9))
216281

217282
xbins = np.arange(-4, 4, 0.25)
218283

219-
ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
220-
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
221-
ax['no_norm'].set_ylabel('Counts')
222-
ax['no_norm'].set_xlabel('x bins [$V$]')
223-
ax['no_norm'].set_title('No normalization')
284+
ax['no_norm'].hist(xdata, bins=xbins, histtype='step', alpha=style['alpha'])
285+
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step', alpha=style['alpha'])
286+
224287

225-
ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
226-
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
227-
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
228-
ax['density'].set_title('Density=True')
229-
ax['density'].set_xlabel('x bins [$V$]')
288+
ax['density'].hist(xdata, bins=xbins, histtype='step',
289+
density=True, alpha=style['alpha'])
290+
ax['density'].hist(xdata2, bins=xbins, histtype='step',
291+
density=True, alpha=style['alpha'])
230292

231293
ax['weight'].hist(xdata, bins=xbins, histtype='step',
232294
weights=1 / len(xdata) * np.ones(len(xdata)),
233-
label='N=1000')
295+
label='N=1000', alpha=style['alpha'])
234296
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
235297
weights=1 / len(xdata2) * np.ones(len(xdata2)),
236-
label='N=100')
237-
ax['weight'].set_xlabel('x bins [$V$]')
238-
ax['weight'].set_ylabel('Counts / N')
298+
label='N=100', alpha=style['alpha'])
299+
300+
301+
ax['no_norm'].set(ylabel='Counts', xlabel='x bins [$V$]',
302+
title='No normalization')
303+
ax['density'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
304+
title='Density=True')
305+
ax['weight'].set(ylabel='Counts / N', xlabel='x bins [$V$]',
306+
title='Weight = 1/N')
239307
ax['weight'].legend(fontsize='small')
240-
ax['weight'].set_title('Weight = 1/N')
241308

242309
plt.show()
243310

galleries/tutorials/index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ a :ref:`FAQ <faq-index>` in our :ref:`user guide <users-guide-index>`.
9595
/tutorials/images
9696
/tutorials/lifecycle
9797
/tutorials/artists
98+
/tutorials/histogram_normalization
9899

99100
.. only:: html
100101

@@ -134,6 +135,7 @@ Intermediate
134135
- :ref:`arranging_axes`
135136
- :ref:`autoscale`
136137
- :ref:`imshow_extent`
138+
- :ref:`histogram_normalization`
137139

138140
Advanced
139141
^^^^^^^^

0 commit comments

Comments
 (0)