Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 981021d

Browse files
committed
DOC: normalizing histograms
1 parent d8e272f commit 981021d

File tree

3 files changed

+209
-62
lines changed

3 files changed

+209
-62
lines changed

galleries/examples/statistics/hist.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
axs[0].hist(dist1, bins=n_bins)
3737
axs[1].hist(dist2, bins=n_bins)
3838

39+
plt.show()
40+
3941

4042
# %%
4143
# Updating histogram colors
@@ -99,8 +101,6 @@
99101
# We can also define custom numbers of bins for each axis
100102
axs[2].hist2d(dist1, dist2, bins=(80, 10), norm=colors.LogNorm())
101103

102-
plt.show()
103-
104104
# %%
105105
#
106106
# .. admonition:: References

galleries/examples/statistics/histogram_features.py

Lines changed: 0 additions & 60 deletions
This file was deleted.
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
"""
2+
.. redirect-from:: /gallery/statistics/histogram_features
3+
4+
===================================
5+
Histogram bins, density, and weight
6+
===================================
7+
8+
The `.Axes.hist` method can create histograms in a few different ways,
9+
which is flexible and helpful, but can also lead to confusion. In particular,
10+
you can
11+
- bin the data as you want, either with an automatically chosen number of
12+
bins, or with fixed bin edges,
13+
- normalize the histogram so that its integral is one,
14+
- and assign weights to the data points, so that each data point affects the
15+
count in its bin differently.
16+
17+
The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results,
18+
therefore users should consult the numpy documentation for a definitive guide.
19+
20+
Histograms are created by defining bin edges, and taking a dataset of values
21+
and sorting them into the bins, and counting or summing how much data is in
22+
each bin. In this simple example, 9 numbers between 1 and 4 are sorted into 3
23+
bins:
24+
"""
25+
26+
import matplotlib.pyplot as plt
27+
import numpy as np
28+
29+
rng = np.random.default_rng(19680801)
30+
31+
xdata = np.array([1.2, 2.3, 3.3, 3.1, 1.7, 3.4, 2.1, 1.25, 1.3])
32+
xbins = np.array([1, 2, 3, 4])
33+
34+
# changing the style of the histogram bars just to make it
35+
# very clear where the boundaries of the bins are:
36+
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}
37+
38+
fig, ax = plt.subplots()
39+
ax.hist(xdata, bins=xbins, **style)
40+
41+
# plot the xdata locations on the x axis:
42+
ax.plot(xdata, 0*xdata, 'd')
43+
ax.set_ylabel('Number per bin')
44+
ax.set_xlabel('x bins (dx=1.0)')
45+
46+
# %%
47+
# Modifying bins
48+
# ==============
49+
#
50+
# Changing the bin size changes the shape of this sparse histogram, so it's a
51+
# good idea to choose bins with some care with respect to your data. Here we
52+
# make the bins half as wide.
53+
54+
xbins = np.arange(1, 4.5, 0.5)
55+
56+
fig, ax = plt.subplots()
57+
ax.hist(xdata, bins=xbins, **style)
58+
ax.plot(xdata, 0*xdata, 'd')
59+
ax.set_ylabel('Number per bin')
60+
ax.set_xlabel('x bins (dx=0.5)')
61+
62+
# %%
63+
# We can also let numpy (via Matplotlib) choose the bins automatically, or
64+
# specify a number of bins to choose automatically:
65+
66+
fig, ax = plt.subplot_mosaic([['auto', 'n4']],
67+
sharex=True, sharey=True, layout='constrained')
68+
69+
ax['auto'].hist(xdata, **style)
70+
ax['auto'].plot(xdata, 0*xdata, 'd')
71+
ax['auto'].set_ylabel('Number per bin')
72+
ax['auto'].set_xlabel('x bins (auto)')
73+
74+
ax['n4'].hist(xdata, bins=4, **style)
75+
ax['n4'].plot(xdata, 0*xdata, 'd')
76+
ax['n4'].set_xlabel('x bins ("bins=4")')
77+
78+
# %%
79+
# Normalizing histograms: density and weight
80+
# ==========================================
81+
#
82+
# Counts-per-bin is the default length of each bar in the histogram. However,
83+
# we can also normalize the bar lengths as a probability density function using
84+
# the ``density`` parameter:
85+
86+
fig, ax = plt.subplots()
87+
ax.hist(xdata, bins=xbins, density=True, **style)
88+
89+
90+
# %%
91+
# This normalization can be a little hard to interpret when just exploring the
92+
# data. The value attached to each bar is divided by the total number of data
93+
# points *and* the width of the bin, and the values *integrate* to one when
94+
# integrating across the full range of data.
95+
#
96+
# The usefulness of this normalization is a little more clear when we draw from
97+
# a known distribution and try to compare with theory. So, choose 1000 points
98+
# from a normal distribution, and also calculate the known probability density
99+
# function:
100+
101+
xdata = rng.normal(size=1000)
102+
xpdf = np.arange(-4, 4, 0.1)
103+
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
104+
105+
# %%
106+
# To make the point very obvious, consider bins that do not have the same
107+
# spacing. By normalizing by density, we preserve the shape of the
108+
# distribution, whereas if we do not, then the wider bins have much higher
109+
# values than the thin bins:
110+
111+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
112+
dx = 0.1
113+
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
114+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
115+
ax['False'].set_ylabel('Count per bin')
116+
ax['False'].set_xlabel('x bins (below -1.25 bins are wider)')
117+
118+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step')
119+
ax['True'].plot(xpdf, pdf)
120+
ax['True'].set_ylabel('Probability per x')
121+
ax['True'].set_xlabel('x bins (below -1.25 bins are wider)')
122+
123+
124+
# %%
125+
# This also makes it easier to compare histograms with different bin widths.
126+
# Note that in order to get the theoretical distribution, we must multiply the
127+
# distribution by the number of data points and the bin width:
128+
129+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
130+
131+
ax['True'].plot(xpdf, pdf, '--', label='PDF', color='k')
132+
for nn, dx in enumerate([0.1, 0.4, 1.2]):
133+
xbins = np.arange(-4, 4, dx)
134+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
135+
ax['False'].set_xlabel('x bins')
136+
ax['False'].set_ylabel('Count per bin')
137+
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
138+
139+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
140+
ax['True'].set_ylabel('Probability per x')
141+
ax['True'].set_xlabel('x bins')
142+
ax['True'].legend(fontsize='small')
143+
144+
# %%
145+
# Sometimes people want to normalize so that the sum of counts is one. This is
146+
# *not* done with the *density* kwarg, but instead we can set the *weights* to
147+
# 1/N. Note, however, that the amplitude of the histogram still depends on
148+
# the width of the bins:
149+
150+
fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
151+
152+
for nn, dx in enumerate([0.1, 0.4, 1.2]):
153+
xbins = np.arange(-4, 4, dx)
154+
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
155+
histtype='step', label=f'{dx}')
156+
ax.set_xlabel('x bins')
157+
ax.set_ylabel('Bin count / N')
158+
ax.legend(fontsize='small')
159+
160+
# %%
161+
# Normalizing is most useful when you want to compare two distributions
162+
# that have different sized populations:
163+
164+
xdata2 = rng.normal(size=100)
165+
166+
fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
167+
layout='constrained', figsize=(8, 4))
168+
169+
xbins = np.arange(-4, 4, 0.25)
170+
171+
ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
172+
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
173+
ax['no_norm'].set_ylabel('Counts')
174+
ax['no_norm'].set_xlabel('x bins')
175+
ax['no_norm'].set_title('No normalization')
176+
177+
ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
178+
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
179+
ax['density'].set_ylabel('Probability per x')
180+
ax['density'].set_title('Density=True')
181+
ax['density'].set_xlabel('x bins')
182+
183+
ax['weight'].hist(xdata, bins=xbins, histtype='step',
184+
weights=1 / len(xdata) * np.ones(len(xdata)),
185+
label='N=1000')
186+
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
187+
weights=1 / len(xdata2) * np.ones(len(xdata2)),
188+
label='N=100')
189+
ax['weight'].set_xlabel('x bins')
190+
ax['weight'].set_ylabel('Counts / N')
191+
ax['weight'].legend(fontsize='small')
192+
ax['weight'].set_title('Weight = 1/N')
193+
194+
plt.show()
195+
196+
# %%
197+
#
198+
# .. admonition:: References
199+
#
200+
# The use of the following functions, methods, classes and modules is shown
201+
# in this example:
202+
#
203+
# - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist`
204+
# - `matplotlib.axes.Axes.set_title`
205+
# - `matplotlib.axes.Axes.set_xlabel`
206+
# - `matplotlib.axes.Axes.set_ylabel`
207+
# - `matplotlib.axes.Axes.legend`

0 commit comments

Comments
 (0)