|
| 1 | +""" |
| 2 | +.. _histogram_normalization: |
| 3 | +
|
| 4 | +Histogram normalization |
| 5 | +======================= |
| 6 | +
|
| 7 | +Histogram normalization rescales data into probabilities and therefore is a popular |
| 8 | +technique for comparing populations of different sizes or histograms computed using |
| 9 | +different bin edges. For more information on using `.Axes.hist` see |
| 10 | +:ref:`histogram_features`. |
| 11 | +
|
| 12 | +Irregularly spaced bins |
| 13 | +----------------------- |
| 14 | +In this example, the bins below ``x=-1.25`` are six times wider than the rest of the |
| 15 | +bins :: |
| 16 | +
|
| 17 | + dx = 0.1 |
| 18 | + xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)]) |
| 19 | +
|
| 20 | +By normalizing by density, we preserve the shape of the distribution, whereas if we do |
| 21 | +not, then the wider bins have much higher counts than the thinner bins: |
| 22 | +""" |
| 23 | + |
| 24 | +import matplotlib.pyplot as plt |
| 25 | +import numpy as np |
| 26 | + |
| 27 | +rng = np.random.default_rng(19680801) |
| 28 | + |
| 29 | +xdata = rng.normal(size=1000) |
| 30 | +xpdf = np.arange(-4, 4, 0.1) |
| 31 | +pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2) |
| 32 | + |
| 33 | +dx = 0.1 |
| 34 | +xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)]) |
| 35 | + |
| 36 | +fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') |
| 37 | + |
| 38 | +fig.suptitle("Histogram with irregularly spaced bins") |
| 39 | + |
| 40 | + |
| 41 | +ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts') |
| 42 | +ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$', |
| 43 | + alpha=.5) |
| 44 | + |
| 45 | +ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density') |
| 46 | +ax['True'].plot(xpdf, pdf, label='$f_X(x)$', alpha=.5) |
| 47 | + |
| 48 | + |
| 49 | +ax['False'].set(xlabel='x [V]', ylabel='Count per bin', title="density=False") |
| 50 | + |
| 51 | +# mark the bin edges with minor ticks to highlight the irregular spacing |
| 52 | +ax['False'].set_xticks(xbins, minor=True) |
| 53 | +ax['False'].legend() |
| 54 | + |
| 55 | +ax['True'].set(xlabel='x [$V$]', ylabel='Probability density [$V^{-1}$]', |
| 56 | + title="density=True") |
| 57 | +ax['False'].set_xticks(xbins, minor=True) |
| 58 | +ax['True'].legend() |
| 59 | + |
| 60 | + |
| 61 | +# %% |
| 62 | +# Different bin widths |
| 63 | +# -------------------- |
| 64 | +# |
| 65 | +# Here we use normalization to compare histograms with bin widths of 0.1, 0.4, and 1.2: |
| 66 | + |
| 67 | +fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') |
| 68 | + |
| 69 | +fig.suptitle("Comparing histograms with different bin widths") |
| 70 | +# expected PDF |
| 71 | +ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k') |
| 72 | + |
| 73 | +for nn, dx in enumerate([0.1, 0.4, 1.2]): |
| 74 | + xbins = np.arange(-4, 4, dx) |
| 75 | + # expected histogram: |
| 76 | + ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}', alpha=.5) |
| 77 | + ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label=dx) |
| 78 | + |
| 79 | + ax['True'].hist(xdata, bins=xbins, density=True, histtype='step') |
| 80 | + |
| 81 | +ax['False'].set(xlabel='x [$V$]', ylabel='Count per bin', |
| 82 | + title="density=False") |
| 83 | +ax['True'].set(xlabel='x [$V$]', ylabel='Probability density [$V^{-1}$]', |
| 84 | + title='density=True') |
| 85 | +ax['False'].legend(fontsize='small', title='bin width:') |
| 86 | +# %% |
| 87 | +# Populations of different sizes |
| 88 | +# ------------------------------ |
| 89 | +# |
| 90 | +# Here we compare the distribution of ``xdata`` with a population of 1000, and |
| 91 | +# ``xdata2`` with 100 members. We demonstrate using *density* to generate the |
| 92 | +# probability density function (`pdf`_) and *weights* to generate an analog to the |
| 93 | +# probability mass function (`pmf`_). |
| 94 | +# |
| 95 | +# .. _pdf: https://en.wikipedia.org/wiki/Probability_density_function |
| 96 | +# .. _pmf: https://en.wikipedia.org/wiki/Probability_mass_function |
| 97 | + |
| 98 | +xdata2 = rng.normal(size=100) |
| 99 | + |
| 100 | +fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']], layout='constrained') |
| 101 | + |
| 102 | +fig.suptitle("Comparing histograms of populations of different sizes") |
| 103 | + |
| 104 | +xbins = np.arange(-4, 4, 0.25) |
| 105 | + |
| 106 | +for xd in [xdata, xdata2]: |
| 107 | + ax['no_norm'].hist(xd, bins=xbins, histtype='step') |
| 108 | + ax['density'].hist(xd, bins=xbins, histtype='step', density=True) |
| 109 | + ax['weight'].hist(xd, bins=xbins, histtype='step', weights=np.ones(len(xd))/len(xd), |
| 110 | + label=f'N={len(xd)}') |
| 111 | + |
| 112 | + |
| 113 | +ax['no_norm'].set(xlabel='x [$V$]', ylabel='Counts', title='No normalization') |
| 114 | +ax['density'].set(xlabel='x [$V$]', |
| 115 | + ylabel='Probability density [$V^{-1}$]', title='Density=True') |
| 116 | +ax['weight'].set(xlabel='x bins [$V$]', ylabel='Counts / N', title='Weight = 1/N') |
| 117 | + |
| 118 | +ax['weight'].legend(fontsize='small') |
| 119 | + |
| 120 | +plt.show() |
| 121 | + |
| 122 | +# %% |
| 123 | +# |
| 124 | +# .. tags:: plot type: histogram |
| 125 | +# |
| 126 | +# .. admonition:: References |
| 127 | +# |
| 128 | +# The use of the following functions, methods, classes and modules is shown |
| 129 | +# in this example: |
| 130 | +# |
| 131 | +# - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist` |
| 132 | +# - `matplotlib.axes.Axes.set` |
| 133 | +# - `matplotlib.axes.Axes.legend` |
| 134 | +# |
0 commit comments