"""
.. redirect-from:: /gallery/statistics/histogram_features

.. _histogram_normalization:

===================================
Histogram bins, density, and weight
===================================

# changing the style of the histogram bars just to make it
# very clear where the boundaries of the bins are:
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3, 'alpha': .5}

fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))

# count the number of values in xdata between each value in xbins
ax.hist(xdata, bins=xbins, **style)

# plot the xdata events along the x axis:
ax.eventplot(xdata, color='C1', alpha=.5)

ax.set(xlabel='x bins (dx=1.0)', ylabel='Number per bin', title='histogram')

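# %%
# The bar heights above are plain counts; the same numbers can be computed
# directly with `numpy.histogram`, which is what ``hist`` uses internally:

counts, edges = np.histogram(xdata, bins=xbins)
print(counts)
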
# %%
# Choose bins
# ===========
#
# Changing the bin size changes the shape of this sparse histogram, so it's a
# good idea to choose bins with some care with respect to your data. The
# `.Axes.hist` *bins* parameter accepts either the number of bins or a list of
# bin edges.
#
# Set *bins* using fixed edges
# ----------------------------
#
# Here the bins are set to the list of edges [1, 1.5, 2, 2.5, 3, 3.5, 4],
# which makes them half as wide as in the previous example.

xbins = np.arange(1, 4.5, 0.5)

fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))

ax.hist(xdata, bins=xbins, **style)

ax.eventplot(xdata, color='C1', alpha=.5)

ax.set(ylabel='count', xlabel='x bins (dx=0.5)',
       title='fixed bin edges: bins=np.arange(1, 4.5, .5)')

# %%
# Set *bins* using number of bins
# -------------------------------
#
# We can also let numpy (via Matplotlib) choose the bin edges automatically,
# or specify just the number of bins to use:

fig, ax = plt.subplot_mosaic([['auto'], ['n4']],
                             sharex=True, sharey=True,
                             layout='constrained', figsize=(8, 4))

ax['auto'].hist(xdata, bins='auto', **style)
ax['auto'].eventplot(xdata, color='C1', alpha=.5)

ax['auto'].set(ylabel='count', xlabel='x bins',
               title='dynamically computed bin edges: bins="auto"')

ax['n4'].hist(xdata, bins=4, **style)
ax['n4'].eventplot(xdata, color='C1', alpha=.5)

ax['n4'].set(ylabel='count', xlabel='x bins',
             title='fixed number of bins: bins=4')

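# %%
# If you want to know which edges an automatic strategy will pick before
# plotting, `numpy.histogram_bin_edges` computes them directly:

print(np.histogram_bin_edges(xdata, bins='auto'))
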
# %%
# Normalize histogram
# ===================
#
# Counts-per-bin is the default length of each bar in the histogram. However,
# we can also normalize the bar lengths as a probability density function using
# the ``density`` parameter:

fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))

ax.hist(xdata, bins=xbins, density=True, **style)

ax.set(ylabel='Probability density [$V^{-1}$]',
       xlabel='x bins (dx=0.5 $V$)',
       title='normalizing histogram using density')

# %%
# This normalization can be a little hard to interpret when just exploring the
# data. The value attached to each bar is divided by the total number of data
# points *and* the width of the bin, and thus the values *integrate* to one
# when integrating across the full range of data.
# e.g. ::
#
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)

# %%
# *density* parameter
# -------------------
#
# If we don't use ``density=True``, we need to scale the expected probability
# distribution function by both the length of the data and the width of the
# bins:

dx = 0.1
xbins = np.arange(-4, 4, dx)

fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained',
                             figsize=(8, 4))

ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
# scale and plot the expected pdf:
ax['False'].plot(xpdf, pdf * len(xdata) * dx,
                 label=r'$N\,f_X(x)\,\delta x$', alpha=.5)

ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
ax['True'].plot(xpdf, pdf, label='$f_X(x)$', alpha=.5)

ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
                title="normalization using scaling, density=False")
ax['False'].legend()
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
               title="density=True")
ax['True'].legend()

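# %%
# The two panels are related by a constant factor: ``density`` is
# ``counts / (counts.sum() * bin_width)``, which we can verify numerically:

counts, edges = np.histogram(xdata, bins=xbins)
density, _ = np.histogram(xdata, bins=xbins, density=True)
print(np.allclose(counts, density * counts.sum() * np.diff(edges)))
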
# %%
# Preserving distribution shape
# -----------------------------
#
# One advantage of using density is that the shape and amplitude of the
# histogram do not depend on the size of the bins.
#
# Irregularly spaced bins
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# Consider an extreme case where the bins do not have the same width. In this
# example, the bins below ``x=-1.25`` are six times wider than the rest of the
# bins. By normalizing by density, we preserve the shape of the distribution,
# whereas if we do not, then the wider bins have much higher counts than the
# thinner bins:

dx = 0.1
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])

fig, ax = plt.subplot_mosaic([['False', 'True']],
                             layout='constrained', figsize=(8, 4))

ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
ax['False'].plot(xpdf, pdf * len(xdata) * dx,
                 label=r'$N\,f_X(x)\,\delta x_0$', alpha=.5)

ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
ax['True'].plot(xpdf, pdf, label='$f_X(x)$', alpha=.5)

ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
                title="irregularly spaced bins, density=False")
ax['False'].legend()

ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
               title="irregularly spaced bins, density=True")
ax['True'].legend()

# %%
# Histograms with different bin widths
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Similarly, if we want to compare histograms with different bin widths, we may
# want to use ``density=True``:

fig, ax = plt.subplot_mosaic([['False', 'True']],
                             layout='constrained', figsize=(8, 4))

# expected PDF
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')

for nn, dx in enumerate([0.1, 0.4, 1.2]):
    xbins = np.arange(-4, 4, dx)
    # expected histogram:
    ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}', alpha=.5)
    ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')

    ax['True'].hist(xdata, bins=xbins, density=True,
                    histtype='step', label=dx, alpha=style['alpha'])

# Labels:
ax['False'].set(ylabel='Count per bin', xlabel='x bins [$V$]',
                title="density=False")
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
               title='density=True')
ax['True'].legend(fontsize='small', title='bin width:')

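# %%
# With ``density=True`` the area under each of these histograms is one,
# whichever bin width is used, which is what makes them comparable:

for dx in [0.1, 0.4, 1.2]:
    density, edges = np.histogram(xdata, bins=np.arange(-4, 4, dx),
                                  density=True)
    print(dx, np.sum(density * np.diff(edges)))
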
# %%
# Assign weights
# ==============
#
# Sometimes people want to normalize so that the sum of counts is one. This is
# analogous to a `probability mass function
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
# variable where the sum of probabilities for all the values equals one.
#
# *weights* parameter
# -------------------
#
# Using ``hist``, we can get this normalization if we set the *weights* to 1/N.
# Note that the amplitude of this normalized histogram still depends on
# width and/or number of bins:

fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))

for nn, dx in enumerate([0.1, 0.4, 1.2]):
    xbins = np.arange(-4, 4, dx)
    ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
            histtype='step', label=f'{dx}')

ax.set(ylabel='Bin count / N', xlabel='x bins [$V$]',
       title="histogram normalization using weights")
ax.legend(fontsize='small', title='bin width:')

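# %%
# We can confirm the normalization: the weighted counts sum to one, apart from
# any samples that fall outside the outermost bin edges:

weights = np.ones(len(xdata)) / len(xdata)
counts, _ = np.histogram(xdata, bins=np.arange(-4, 4, 0.5), weights=weights)
print(counts.sum())
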
# %%
# Populations of different sizes
# ------------------------------
#
# The value of normalizing histograms becomes clear when comparing two
# distributions that have different sized populations. Here we compare the
# distribution of ``xdata``, with a population of 1000, and ``xdata2``, with
# 100 members.

xdata2 = rng.normal(size=100)

fig, ax = plt.subplot_mosaic([['no_norm'], ['density'], ['weight']],
                             layout='constrained', figsize=(8, 6))

xbins = np.arange(-4, 4, 0.25)

for xd in [xdata, xdata2]:
    ax['no_norm'].hist(xd, bins=xbins, histtype='step')
    ax['density'].hist(xd, bins=xbins, histtype='step', density=True)
    ax['weight'].hist(xd, bins=xbins, histtype='step',
                      weights=1 / len(xd) * np.ones(len(xd)),
                      label=f'N={len(xd)}')

ax['no_norm'].set(ylabel='Counts', xlabel='x bins [$V$]',
                  title='No normalization')
ax['density'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
                  title='Density=True')
ax['weight'].set(ylabel='Counts / N', xlabel='x bins [$V$]',
                 title='Weight = 1/N')
ax['weight'].legend(fontsize='small')

plt.show()

# - `matplotlib.axes.Axes.set_xlabel`
# - `matplotlib.axes.Axes.set_ylabel`
# - `matplotlib.axes.Axes.legend`
#