|
1 | 1 | """
|
2 | 2 | .. redirect-from:: /gallery/statistics/histogram_features
|
3 | 3 |
|
| 4 | +.. _histogram-normalization: |
| 5 | +
|
4 | 6 | ===================================
|
5 | 7 | Histogram bins, density, and weight
|
6 | 8 | ===================================
|
|
34 | 36 |
|
35 | 37 | # changing the style of the histogram bars just to make it
|
36 | 38 | # very clear where the boundaries of the bins are:
|
37 |
| -style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3} |
| 39 | +style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3, 'alpha': .5} |
| 40 | + |
| 41 | +fig, ax = plt.subplots(figsize=(6, 3)) |
38 | 42 |
|
39 |
| -fig, ax = plt.subplots() |
| 43 | +# count the number of values in xdata between each value in xbins |
40 | 44 | ax.hist(xdata, bins=xbins, **style)
|
41 | 45 |
|
42 | 46 | # plot the xdata locations on the x axis:
|
43 |
| -ax.plot(xdata, 0*xdata, 'd') |
44 |
| -ax.set_ylabel('Number per bin') |
45 |
| -ax.set_xlabel('x bins (dx=1.0)') |
| 47 | +ax.stem(xdata, [.5]*len(xdata), 'd') |
| 48 | + |
| 49 | +ax.set(ylabel='Number per bin', xlabel='x bins (dx=1.0)', |
| 50 | + title='histogram',) |
46 | 51 |
|
47 | 52 | # %%
|
48 | 53 | # Modifying bins
|
49 | 54 | # ==============
|
50 | 55 | #
|
51 | 56 | # Changing the bin size changes the shape of this sparse histogram, so it's a
|
52 |
| -# good idea to choose bins with some care with respect to your data. Here we |
53 |
| -# make the bins half as wide. |
| 57 | +# good idea to choose bins with some care with respect to your data. The `.Axes.hist` |
| 58 | +# *bins* parameter accepts either the number of bins or a list of bin edges. |
| 59 | +# |
| 60 | +# |
| 61 | +# Set *bins* using fixed edges |
| 62 | +# ----------------------------- |
| 63 | +# |
| 64 | +# Here the bins are set to the list of edges [1, 1.5, 2, 2.5, 3, 3.5, 4]. |
| 65 | +# This is half as wide as the previous example. |
54 | 66 |
|
55 | 67 | xbins = np.arange(1, 4.5, 0.5)
|
56 | 68 |
|
57 |
| -fig, ax = plt.subplots() |
| 69 | +fig, ax = plt.subplots(figsize=(6, 3)) |
| 70 | + |
58 | 71 | ax.hist(xdata, bins=xbins, **style)
|
59 |
| -ax.plot(xdata, 0*xdata, 'd') |
60 |
| -ax.set_ylabel('Number per bin') |
61 |
| -ax.set_xlabel('x bins (dx=0.5)') |
| 72 | + |
| 73 | +ax.stem(xdata, [.5]*len(xdata), 'd') |
| 74 | + |
| 75 | +ax.set(ylabel='Number per bin', xlabel='x bins (dx=0.5)', |
| 76 | + title='fixed bin edges',) |
62 | 77 |
|
63 | 78 | # %%
|
| 79 | +# |
| 80 | +# Set *bins* using number of bins |
| 81 | +# ------------------------------- |
| 82 | +# |
64 | 83 | # We can also let numpy (via Matplotlib) choose the bins automatically, or
|
65 | 84 | # specify a number of bins to choose automatically:
|
66 | 85 |
|
67 |
| -fig, ax = plt.subplot_mosaic([['auto', 'n4']], |
68 |
| - sharex=True, sharey=True, layout='constrained') |
| 86 | +fig, ax = plt.subplot_mosaic([['auto'], ['n4']], |
| 87 | + sharex=True, sharey=True, |
| 88 | + layout='constrained', figsize=(6, 6)) |
69 | 89 |
|
70 | 90 | ax['auto'].hist(xdata, **style)
|
71 |
| -ax['auto'].plot(xdata, 0*xdata, 'd') |
72 |
| -ax['auto'].set_ylabel('Number per bin') |
73 |
| -ax['auto'].set_xlabel('x bins (auto)') |
| 91 | +ax['auto'].stem(xdata, [.5]*len(xdata), 'd') |
| 92 | + |
| 93 | +ax['auto'].set(ylabel='Number per bin', xlabel='x bins (auto)', |
| 94 | + title='dynamically computed bin edges') |
74 | 95 |
|
75 | 96 | ax['n4'].hist(xdata, bins=4, **style)
|
76 |
| -ax['n4'].plot(xdata, 0*xdata, 'd') |
77 |
| -ax['n4'].set_xlabel('x bins ("bins=4")') |
| 97 | +ax['n4'].stem(xdata, [.5]*len(xdata), 'd') |
| 98 | + |
| 99 | +ax['n4'].set(ylabel='Number per bin', xlabel='x bins ("bins=4")', |
| 100 | + title='fixed number of bins',) |
78 | 101 |
|
79 | 102 | # %%
|
80 |
| -# Normalizing histograms: density and weight |
81 |
| -# ========================================== |
| 103 | +# Normalize histograms using density |
| 104 | +# ================================== |
82 | 105 | #
|
83 | 106 | # Counts-per-bin is the default length of each bar in the histogram. However,
|
84 | 107 | # we can also normalize the bar lengths as a probability density function using
|
85 | 108 | # the ``density`` parameter:
|
86 | 109 |
|
87 |
| -fig, ax = plt.subplots() |
| 110 | +fig, ax = plt.subplots(figsize=(6, 3)) |
| 111 | + |
88 | 112 | ax.hist(xdata, bins=xbins, density=True, **style)
|
89 |
| -ax.set_ylabel('Probability density [$V^{-1}$])') |
90 |
| -ax.set_xlabel('x bins (dx=0.5 $V$)') |
| 113 | + |
| 114 | +ax.set(ylabel='Probability density [$V^{-1}$]', |
| 115 | + xlabel='x bins (dx=0.5 $V$)', |
| 116 | + title='normalizing histogram using density') |
91 | 117 |
|
92 | 118 | # %%
|
93 | 119 | # This normalization can be a little hard to interpret when just exploring the
|
|
117 | 143 | pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
|
118 | 144 |
|
119 | 145 | # %%
|
| 146 | +# *density* parameter |
| 147 | +# ------------------- |
| 148 | +# |
120 | 149 | # If we don't use ``density=True``, we need to scale the expected probability
|
121 | 150 | # distribution function by both the length of the data and the width of the
|
122 | 151 | # bins:
|
123 | 152 |
|
124 |
| -fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') |
125 | 153 | dx = 0.1
|
126 | 154 | xbins = np.arange(-4, 4, dx)
|
127 |
| -ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts') |
128 | 155 |
|
| 156 | +fig, ax = plt.subplot_mosaic([['False'], ['True']], layout='constrained', |
| 157 | + figsize=(6, 6)) |
| 158 | + |
| 159 | + |
| 160 | +ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', |
| 161 | + label='Counts', alpha=style['alpha']) |
129 | 162 | # scale and plot the expected pdf:
|
130 | 163 | ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
|
131 |
| -ax['False'].set_ylabel('Count per bin') |
132 |
| -ax['False'].set_xlabel('x bins [V]') |
133 |
| -ax['False'].legend() |
134 | 164 |
|
135 |
| -ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density') |
| 165 | + |
| 166 | +ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', |
| 167 | + label='density', alpha=style['alpha']) |
136 | 168 | ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
|
137 |
| -ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
138 |
| -ax['True'].set_xlabel('x bins [$V$]') |
| 169 | + |
| 170 | + |
| 171 | +ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]', |
| 172 | + title="normalization using scaling, density=False") |
| 173 | +ax['False'].legend() |
| 174 | +ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]', |
| 175 | + title="density=True") |
139 | 176 | ax['True'].legend()
|
140 | 177 |
|
141 | 178 | # %%
|
142 |
| -# One advantage of using the density is therefore that the shape and amplitude |
143 |
| -# of the histogram does not depend on the size of the bins. Consider an |
144 |
| -# extreme case where the bins do not have the same width. In this example, the |
145 |
| -# bins below ``x=-1.25`` are six times wider than the rest of the bins. By |
| 179 | +# Preserving distribution shape |
| 180 | +# ----------------------------- |
| 181 | +# One advantage of using the density is that the shape and amplitude of the histogram |
| 182 | +# does not depend on the size of the bins. |
| 183 | +# |
| 184 | +# Irregularly spaced bins |
| 185 | +# ^^^^^^^^^^^^^^^^^^^^^^^ |
| 186 | +# Consider an extreme case where the bins do not have the same width. In this example, |
| 187 | +# the bins below ``x=-1.25`` are six times wider than the rest of the bins. By |
146 | 188 | # normalizing by density, we preserve the shape of the distribution, whereas if
|
147 | 189 | # we do not, then the wider bins have much higher counts than the thinner bins:
|
148 | 190 |
|
149 |
| -fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') |
150 | 191 | dx = 0.1
|
151 | 192 | xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
|
152 |
| -ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts') |
| 193 | + |
| 194 | +fig, ax = plt.subplot_mosaic([['False'], ['True']], |
| 195 | + layout='constrained', figsize=(6, 6)) |
| 196 | + |
| 197 | + |
| 198 | +ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', |
| 199 | + label='Counts', alpha=style['alpha']) |
153 | 200 | ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
|
154 |
| -ax['False'].set_ylabel('Count per bin') |
155 |
| -ax['False'].set_xlabel('x bins [V]') |
156 |
| -ax['False'].legend() |
157 | 201 |
|
158 |
| -ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density') |
| 202 | +ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', |
| 203 | + label='density', alpha=style['alpha']) |
159 | 204 | ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
|
160 |
| -ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
161 |
| -ax['True'].set_xlabel('x bins [$V$]') |
| 205 | + |
| 206 | + |
| 207 | +ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]', |
| 208 | + title="irregularly spaced bins, density=False") |
| 209 | +ax['False'].legend() |
| 210 | + |
| 211 | +ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]', |
| 212 | + title="irregularly spaced bins, density=True",) |
162 | 213 | ax['True'].legend()
|
163 | 214 |
|
164 | 215 | # %%
|
| 216 | +# Histograms with different bin widths |
| 217 | +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
165 | 218 | # Similarly, if we want to compare histograms with different bin widths, we may
|
166 | 219 | # want to use ``density=True``:
|
167 | 220 |
|
168 |
| -fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') |
| 221 | +fig, ax = plt.subplot_mosaic([['False'], ['True']], |
| 222 | + layout='constrained', figsize=(6, 6)) |
169 | 223 |
|
170 | 224 | # expected PDF
|
171 | 225 | ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
|
|
174 | 228 | xbins = np.arange(-4, 4, dx)
|
175 | 229 | # expected histogram:
|
176 | 230 | ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
|
177 |
| - ax['False'].hist(xdata, bins=xbins, density=False, histtype='step') |
| 231 | + ax['False'].hist(xdata, bins=xbins, density=False, |
| 232 | + histtype='step', alpha=style['alpha']) |
178 | 233 |
|
179 |
| - ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx) |
| 234 | + ax['True'].hist(xdata, bins=xbins, density=True, |
| 235 | + histtype='step', label=dx, alpha=style['alpha']) |
180 | 236 |
|
181 | 237 | # Labels:
|
182 |
| -ax['False'].set_xlabel('x bins [$V$]') |
183 |
| -ax['False'].set_ylabel('Count per bin') |
184 |
| -ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
185 |
| -ax['True'].set_xlabel('x bins [$V$]') |
| 238 | +ax['False'].set(ylabel='Count per bin', xlabel='x bins [$V$]', |
| 239 | + title="density=False") |
| 240 | +ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]', |
| 241 | + title='density=True') |
186 | 242 | ax['True'].legend(fontsize='small', title='bin width:')
|
187 | 243 |
|
188 | 244 | # %%
|
| 245 | +# Normalize histograms using weights |
| 246 | +# ================================== |
| 247 | +# |
189 | 248 | # Sometimes people want to normalize so that the sum of counts is one. This is
|
190 | 249 | # analogous to a `probability mass function
|
191 | 250 | # <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
|
192 |
| -# variable where the sum of probabilities for all the values equals one. Using |
193 |
| -# ``hist``, we can get this normalization if we set the *weights* to 1/N. |
| 251 | +# variable where the sum of probabilities for all the values equals one. |
| 252 | +# |
| 253 | +# *weights* parameter |
| 254 | +# ------------------- |
| 255 | +# Using ``hist``, we can get this normalization if we set the *weights* to 1/N. |
194 | 256 | # Note that the amplitude of this normalized histogram still depends on
|
195 |
| -# width and/or number of the bins: |
| 257 | +# width and/or number of bins: |
196 | 258 |
|
197 | 259 | fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
|
198 | 260 |
|
199 | 261 | for nn, dx in enumerate([0.1, 0.4, 1.2]):
|
200 | 262 | xbins = np.arange(-4, 4, dx)
|
201 | 263 | ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
|
202 |
| - histtype='step', label=f'{dx}') |
203 |
| -ax.set_xlabel('x bins [$V$]') |
204 |
| -ax.set_ylabel('Bin count / N') |
| 264 | + histtype='step', label=f'{dx}', alpha=style['alpha']) |
| 265 | + |
| 266 | +ax.set(ylabel='Bin count / N', xlabel='x bins [$V$]', |
| 267 | + title="histogram normalization using weights") |
205 | 268 | ax.legend(fontsize='small', title='bin width:')
|
206 | 269 |
|
207 | 270 | # %%
|
| 271 | +# Populations of different sizes |
| 272 | +# ------------------------------ |
208 | 273 | # The value of normalizing histograms is comparing two distributions that have
|
209 |
| -# different sized populations. Here we compare the distribution of ``xdata`` |
| 274 | +# different sized populations. Here we compare the distribution of ``xdata`` |
210 | 275 | # with a population of 1000, and ``xdata2`` with 100 members.
|
211 | 276 |
|
212 | 277 | xdata2 = rng.normal(size=100)
|
213 | 278 |
|
214 |
| -fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']], |
215 |
| - layout='constrained', figsize=(8, 4)) |
| 279 | +fig, ax = plt.subplot_mosaic([['no_norm'], ['density'], ['weight']], |
| 280 | + layout='constrained', figsize=(6, 9)) |
216 | 281 |
|
217 | 282 | xbins = np.arange(-4, 4, 0.25)
|
218 | 283 |
|
219 |
| -ax['no_norm'].hist(xdata, bins=xbins, histtype='step') |
220 |
| -ax['no_norm'].hist(xdata2, bins=xbins, histtype='step') |
221 |
| -ax['no_norm'].set_ylabel('Counts') |
222 |
| -ax['no_norm'].set_xlabel('x bins [$V$]') |
223 |
| -ax['no_norm'].set_title('No normalization') |
| 284 | +ax['no_norm'].hist(xdata, bins=xbins, histtype='step', alpha=style['alpha']) |
| 285 | +ax['no_norm'].hist(xdata2, bins=xbins, histtype='step', alpha=style['alpha']) |
| 286 | + |
224 | 287 |
|
225 |
| -ax['density'].hist(xdata, bins=xbins, histtype='step', density=True) |
226 |
| -ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True) |
227 |
| -ax['density'].set_ylabel('Probability density [$V^{-1}$]') |
228 |
| -ax['density'].set_title('Density=True') |
229 |
| -ax['density'].set_xlabel('x bins [$V$]') |
| 288 | +ax['density'].hist(xdata, bins=xbins, histtype='step', |
| 289 | + density=True, alpha=style['alpha']) |
| 290 | +ax['density'].hist(xdata2, bins=xbins, histtype='step', |
| 291 | + density=True, alpha=style['alpha']) |
230 | 292 |
|
231 | 293 | ax['weight'].hist(xdata, bins=xbins, histtype='step',
|
232 | 294 | weights=1 / len(xdata) * np.ones(len(xdata)),
|
233 |
| - label='N=1000') |
| 295 | + label='N=1000', alpha=style['alpha']) |
234 | 296 | ax['weight'].hist(xdata2, bins=xbins, histtype='step',
|
235 | 297 | weights=1 / len(xdata2) * np.ones(len(xdata2)),
|
236 |
| - label='N=100') |
237 |
| -ax['weight'].set_xlabel('x bins [$V$]') |
238 |
| -ax['weight'].set_ylabel('Counts / N') |
| 298 | + label='N=100', alpha=style['alpha']) |
| 299 | + |
| 300 | + |
| 301 | +ax['no_norm'].set(ylabel='Counts', xlabel='x bins [$V$]', |
| 302 | + title='No normalization') |
| 303 | +ax['density'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]', |
| 304 | + title='Density=True') |
| 305 | +ax['weight'].set(ylabel='Counts / N', xlabel='x bins [$V$]', |
| 306 | + title='Weight = 1/N') |
239 | 307 | ax['weight'].legend(fontsize='small')
|
240 |
| -ax['weight'].set_title('Weight = 1/N') |
241 | 308 |
|
242 | 309 | plt.show()
|
243 | 310 |
|
|
0 commit comments