"""
.. redirect-from:: /gallery/statistics/histogram_features

===================================
Histogram bins, density, and weight
===================================

The `.Axes.hist` method can create histograms in a few different ways, which
is flexible and helpful, but can also lead to confusion. In particular, you
can:

- bin the data as you want, either with an automatically chosen number of
  bins, or with fixed bin edges,
- normalize the histogram so that its integral is one,
- and assign weights to the data points, so that each data point affects the
  count in its bin differently.

The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results;
users should therefore consult the numpy documentation for a definitive
guide.

Histograms are created by defining bin edges, sorting a dataset of values
into those bins, and counting or summing how much data is in each bin. In
this simple example, 9 numbers between 1 and 4 are sorted into 3 bins:
"""

import matplotlib.pyplot as plt
import numpy as np

rng = np.random.default_rng(19680801)

xdata = np.array([1.2, 2.3, 3.3, 3.1, 1.7, 3.4, 2.1, 1.25, 1.3])
xbins = np.array([1, 2, 3, 4])

# changing the style of the histogram bars just to make it
# very clear where the boundaries of the bins are:
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}

fig, ax = plt.subplots()
ax.hist(xdata, bins=xbins, **style)

# plot the xdata locations on the x axis:
ax.plot(xdata, 0*xdata, 'd')
ax.set_ylabel('Number per bin')
ax.set_xlabel('x bins (dx=1.0)')
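
# %%
# Because ``hist`` defers to `numpy.histogram`, the bar heights above can be
# reproduced without plotting at all (a quick check, not part of the original
# figure):

counts, edges = np.histogram(xdata, bins=xbins)
print(counts)  # -> [4 2 3]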

# %%
# Modifying bins
# ==============
#
# Changing the bin size changes the shape of this sparse histogram, so it's a
# good idea to choose bins with some care with respect to your data. Here we
# make the bins half as wide.

xbins = np.arange(1, 4.5, 0.5)

fig, ax = plt.subplots()
ax.hist(xdata, bins=xbins, **style)
ax.plot(xdata, 0*xdata, 'd')
ax.set_ylabel('Number per bin')
ax.set_xlabel('x bins (dx=0.5)')

# %%
# We can also let numpy (via Matplotlib) choose the bins automatically, or
# specify the number of bins and let numpy choose the edges:

fig, ax = plt.subplot_mosaic([['auto', 'n4']],
                             sharex=True, sharey=True, layout='constrained')

ax['auto'].hist(xdata, **style)
ax['auto'].plot(xdata, 0*xdata, 'd')
ax['auto'].set_ylabel('Number per bin')
ax['auto'].set_xlabel('x bins (auto)')

ax['n4'].hist(xdata, bins=4, **style)
ax['n4'].plot(xdata, 0*xdata, 'd')
ax['n4'].set_xlabel('x bins ("bins=4")')
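
# %%
# With an integer ``bins`` argument, numpy places that many equal-width bins
# across the data range. `numpy.histogram_bin_edges` shows the edges chosen
# for ``bins=4`` (a quick check, not part of the original figure):

print(np.histogram_bin_edges(xdata, bins=4))  # 5 edges from 1.2 to 3.4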

# %%
# Normalizing histograms: density and weight
# ==========================================
#
# Counts-per-bin is the default length of each bar in the histogram. However,
# we can also normalize the bar lengths as a probability density function
# using the ``density`` parameter:

fig, ax = plt.subplots()
ax.hist(xdata, bins=xbins, density=True, **style)
ax.set_ylabel('Probability density [$V^{-1}$]')
ax.set_xlabel('x bins (dx=0.5 $V$)')

# %%
# This normalization can be a little hard to interpret when just exploring the
# data. The value attached to each bar is divided by the total number of data
# points *and* the width of the bin, and thus the values *integrate* to one
# when integrated across the full range of data, e.g. ::
#
#     density = counts / (sum(counts) * np.diff(bins))
#     np.sum(density * np.diff(bins)) == 1
#
# This normalization is how `probability density functions
# <https://en.wikipedia.org/wiki/Probability_density_function>`_ are defined
# in statistics. If :math:`X` is a random variable on :math:`x`, then
# :math:`f_X` is the probability density function if
# :math:`P[a<X<b] = \int_a^b f_X dx`. If the units of :math:`x` are volts,
# then the units of :math:`f_X` are :math:`V^{-1}`, or probability per change
# in voltage.
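#
# We can verify this identity numerically with `numpy.histogram` (a quick
# check, not part of the original figure):

counts, bins = np.histogram(xdata, bins=xbins)
density = counts / (np.sum(counts) * np.diff(bins))
print(np.sum(density * np.diff(bins)))  # -> 1.0

# %%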
#
# The usefulness of this normalization is a little more clear when we draw from
# a known distribution and try to compare with theory. So, choose 1000 points
# from a `normal distribution
# <https://en.wikipedia.org/wiki/Normal_distribution>`_, and also calculate the
# known probability density function:

xdata = rng.normal(size=1000)
xpdf = np.arange(-4, 4, 0.1)
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
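
# the sampled pdf should integrate to approximately one over this grid
# (a quick sanity check, not part of the original example):
print(np.sum(pdf * 0.1))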

# %%
# If we don't use ``density=True``, we need to scale the expected probability
# distribution function by both the length of the data and the width of the
# bins:

fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
dx = 0.1
xbins = np.arange(-4, 4, dx)
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')

# scale and plot the expected pdf:
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
ax['False'].set_ylabel('Count per bin')
ax['False'].set_xlabel('x bins [$V$]')
ax['False'].legend()

ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
ax['True'].set_xlabel('x bins [$V$]')
ax['True'].legend()

# %%
# One advantage of using the density is therefore that the shape and amplitude
# of the histogram do not depend on the size of the bins. Consider an extreme
# case where the bins do not have the same width. In this example, the bins
# below ``x=-1.25`` are six times wider than the rest of the bins. By
# normalizing by density, we preserve the shape of the distribution, whereas
# if we do not, then the wider bins have much higher counts than the thinner
# bins:

fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
dx = 0.1
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
ax['False'].set_ylabel('Count per bin')
ax['False'].set_xlabel('x bins [$V$]')
ax['False'].legend()

ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
ax['True'].set_xlabel('x bins [$V$]')
ax['True'].legend()

# %%
# Similarly, if we want to compare histograms with different bin widths, we
# may want to use ``density=True``:

fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')

# expected PDF:
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')

for nn, dx in enumerate([0.1, 0.4, 1.2]):
    xbins = np.arange(-4, 4, dx)
    # expected histogram:
    ax['False'].plot(xpdf, pdf * len(xdata) * dx, '--', color=f'C{nn}')
    ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')

    ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)

# Labels:
ax['False'].set_xlabel('x bins [$V$]')
ax['False'].set_ylabel('Count per bin')
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
ax['True'].set_xlabel('x bins [$V$]')
ax['True'].legend(fontsize='small', title='bin width:')

# %%
# Sometimes people want to normalize so that the sum of counts is one. This is
# analogous to a `probability mass function
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
# variable where the sum of probabilities for all the values equals one. Using
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
# Note that the amplitude of this normalized histogram still depends on the
# width and/or number of the bins:

fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))

for nn, dx in enumerate([0.1, 0.4, 1.2]):
    xbins = np.arange(-4, 4, dx)
    ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
            histtype='step', label=f'{dx}')
ax.set_xlabel('x bins [$V$]')
ax.set_ylabel('Bin count / N')
ax.legend(fontsize='small', title='bin width:')
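
# %%
# With ``weights=1/N``, the bar heights sum to one for any bin width, up to
# any samples that fall outside the bin range (a quick check with
# `numpy.histogram`, not part of the original figure):

weights = np.ones(len(xdata)) / len(xdata)
counts, _ = np.histogram(xdata, bins=np.arange(-6, 6, 1.2), weights=weights)
print(counts.sum())  # -> 1.0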

# %%
# The value of normalizing histograms is that it lets us compare two
# distributions that have different numbers of members. Here we compare the
# distribution of ``xdata``, with a population of 1000, and ``xdata2``, with
# 100 members.

xdata2 = rng.normal(size=100)

fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
                             layout='constrained', figsize=(8, 4))

xbins = np.arange(-4, 4, 0.25)

ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
ax['no_norm'].set_ylabel('Counts')
ax['no_norm'].set_xlabel('x bins [$V$]')
ax['no_norm'].set_title('No normalization')

ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
ax['density'].set_title('Density=True')
ax['density'].set_xlabel('x bins [$V$]')

ax['weight'].hist(xdata, bins=xbins, histtype='step',
                  weights=1 / len(xdata) * np.ones(len(xdata)),
                  label='N=1000')
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
                  weights=1 / len(xdata2) * np.ones(len(xdata2)),
                  label='N=100')
ax['weight'].set_xlabel('x bins [$V$]')
ax['weight'].set_ylabel('Counts / N')
ax['weight'].legend(fontsize='small')
ax['weight'].set_title('Weight = 1/N')

plt.show()

# %%
#
# .. admonition:: References
#
#    The use of the following functions, methods, classes and modules is shown
#    in this example:
#
#    - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist`
#    - `matplotlib.axes.Axes.set_title`
#    - `matplotlib.axes.Axes.set_xlabel`
#    - `matplotlib.axes.Axes.set_ylabel`
#    - `matplotlib.axes.Axes.legend`