DOC: normalizing histograms

jklymak · jklymak · commit 981021d10816 · 2023-12-02T14:38:39.000-08:00
diff --git a/galleries/examples/statistics/hist.py b/galleries/examples/statistics/hist.py
@@ -36,6 +36,8 @@
 axs[0].hist(dist1, bins=n_bins)
 axs[1].hist(dist2, bins=n_bins)
 
+plt.show()
+
 
 # %%
 # Updating histogram colors
@@ -99,8 +101,6 @@
 # We can also define custom numbers of bins for each axis
 axs[2].hist2d(dist1, dist2, bins=(80, 10), norm=colors.LogNorm())
 
-plt.show()
-
 # %%
 #
 # .. admonition:: References
diff --git a/galleries/examples/statistics/histogram_features.py b/galleries/examples/statistics/histogram_features.py
diff --git a/galleries/examples/statistics/histogram_normalization.py b/galleries/examples/statistics/histogram_normalization.py
@@ -0,0 +1,207 @@
+"""
+.. redirect-from:: /gallery/statistics/histogram_features
+
+===================================
+Histogram bins, density, and weight
+===================================
+
+The `.Axes.hist` method can create histograms in a few different ways, which is
+flexible and helpful, but can also lead to confusion.  In particular, you can:
+
+- bin the data as you want, either with an automatically chosen number of
+  bins, or with fixed bin edges,
+- normalize the histogram so that its integral is one,
+- and assign weights to the data points, so that each data point affects the
+  count in its bin differently.
+
+The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results,
+so users should consult the NumPy documentation for a definitive guide.
+
+Histograms are created by defining bin edges, and taking a dataset of values
+and sorting them into the bins, and counting or summing how much data is in
+each bin.  In this simple example, 9 numbers between 1 and 4 are sorted into 3
+bins:
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+rng = np.random.default_rng(19680801)
+
+xdata = np.array([1.2, 2.3, 3.3, 3.1, 1.7, 3.4, 2.1, 1.25, 1.3])
+xbins = np.array([1, 2, 3, 4])
+
+# changing the style of the histogram bars just to make it
+# very clear where the boundaries of the bins are:
+style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}
+
+fig, ax = plt.subplots()
+ax.hist(xdata, bins=xbins, **style)
+
+# plot the xdata locations on the x axis:
+ax.plot(xdata, 0*xdata, 'd')
+ax.set_ylabel('Number per bin')
+ax.set_xlabel('x bins (dx=1.0)')
+
+# %%
+# Modifying bins
+# ==============
+#
+# Changing the bin size changes the shape of this sparse histogram, so it's a
+# good idea to choose bins with some care with respect to your data.  Here we
+# make the bins half as wide.
+
+xbins = np.arange(1, 4.5, 0.5)
+
+fig, ax = plt.subplots()
+ax.hist(xdata, bins=xbins, **style)
+ax.plot(xdata, 0*xdata, 'd')
+ax.set_ylabel('Number per bin')
+ax.set_xlabel('x bins (dx=0.5)')
+
+# %%
+# We can also let numpy (via Matplotlib) choose the bins automatically, or
+# specify a number of bins to choose automatically:
+
+fig, ax = plt.subplot_mosaic([['auto', 'n4']],
+                             sharex=True, sharey=True, layout='constrained')
+
+ax['auto'].hist(xdata, **style)
+ax['auto'].plot(xdata, 0*xdata, 'd')
+ax['auto'].set_ylabel('Number per bin')
+ax['auto'].set_xlabel('x bins (auto)')
+
+ax['n4'].hist(xdata, bins=4, **style)
+ax['n4'].plot(xdata, 0*xdata, 'd')
+ax['n4'].set_xlabel('x bins ("bins=4")')
+
+# %%
+# Normalizing histograms: density and weight
+# ==========================================
+#
+# Counts-per-bin is the default length of each bar in the histogram.  However,
+# we can also normalize the bar lengths as a probability density function using
+# the ``density`` parameter:
+
+fig, ax = plt.subplots()
+ax.hist(xdata, bins=xbins, density=True, **style)
+
+
+# %%
+# This normalization can be a little hard to interpret when just exploring the
+# data.  The value attached to each bar is divided by the total number of data
+# points *and* the width of the bin, and the values *integrate* to one when
+# integrating across the full range of data.
+#
+# The usefulness of this normalization is a little more clear when we draw from
+# a known distribution and try to compare with theory.  So, choose 1000 points
+# from a normal distribution, and also calculate the known probability density
+# function:
+
+xdata = rng.normal(size=1000)
+xpdf = np.arange(-4, 4, 0.1)
+pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
+
+# %%
+# To make the point very obvious, consider bins that do not have the same
+# spacing.  By normalizing by density, we preserve the shape of the
+# distribution, whereas if we do not, then the wider bins have much higher
+# values than the thin bins:
+
+fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
+dx = 0.1
+xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
+ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
+ax['False'].set_ylabel('Count per bin')
+ax['False'].set_xlabel('x bins (below -1.25 bins are wider)')
+
+ax['True'].hist(xdata, bins=xbins, density=True, histtype='step')
+ax['True'].plot(xpdf, pdf)
+ax['True'].set_ylabel('Probability per x')
+ax['True'].set_xlabel('x bins (below -1.25 bins are wider)')
+
+
+# %%
+# This also makes it easier to compare histograms with different bin widths.
+# Note that in order to get the theoretical distribution, we must multiply the
+# distribution by the number of data points and the bin width:
+
+fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
+
+ax['True'].plot(xpdf, pdf, '--', label='PDF', color='k')
+for nn, dx in enumerate([0.1, 0.4, 1.2]):
+    xbins = np.arange(-4, 4, dx)
+    ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
+    ax['False'].set_xlabel('x bins')
+    ax['False'].set_ylabel('Count per bin')
+    ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
+
+    ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
+    ax['True'].set_ylabel('Probability per x')
+    ax['True'].set_xlabel('x bins')
+ax['True'].legend(fontsize='small')
+
+# %%
+# Sometimes people want to normalize so that the sum of counts is one.  This is
+# *not* done with the *density* kwarg, but instead we can set the *weights* to
+# 1/N.  Note, however, that the amplitude of the histogram still depends on
+# the width of the bins:
+
+fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
+
+for nn, dx in enumerate([0.1, 0.4, 1.2]):
+    xbins = np.arange(-4, 4, dx)
+    ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
+                   histtype='step', label=f'{dx}')
+ax.set_xlabel('x bins')
+ax.set_ylabel('Bin count / N')
+ax.legend(fontsize='small')
+
+# %%
+# The value of normalizing becomes clear when you want to compare two
+# distributions that have different-sized populations:
+
+xdata2 = rng.normal(size=100)
+
+fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
+                             layout='constrained', figsize=(8, 4))
+
+xbins = np.arange(-4, 4, 0.25)
+
+ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
+ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
+ax['no_norm'].set_ylabel('Counts')
+ax['no_norm'].set_xlabel('x bins')
+ax['no_norm'].set_title('No normalization')
+
+ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
+ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
+ax['density'].set_ylabel('Probability per x')
+ax['density'].set_title('Density=True')
+ax['density'].set_xlabel('x bins')
+
+ax['weight'].hist(xdata, bins=xbins, histtype='step',
+                  weights=1 / len(xdata) * np.ones(len(xdata)),
+                  label='N=1000')
+ax['weight'].hist(xdata2, bins=xbins, histtype='step',
+                  weights=1 / len(xdata2) * np.ones(len(xdata2)),
+                  label='N=100')
+ax['weight'].set_xlabel('x bins')
+ax['weight'].set_ylabel('Counts / N')
+ax['weight'].legend(fontsize='small')
+ax['weight'].set_title('Weight = 1/N')
+
+plt.show()
+
+# %%
+#
+# .. admonition:: References
+#
+#    The use of the following functions, methods, classes and modules is shown
+#    in this example:
+#
+#    - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist`
+#    - `matplotlib.axes.Axes.set_title`
+#    - `matplotlib.axes.Axes.set_xlabel`
+#    - `matplotlib.axes.Axes.set_ylabel`
+#    - `matplotlib.axes.Axes.legend`