@@ -1,13 +1,15 @@
 """
 ==================
-Anscombe's Quartet
+Anscombe's quartet
 ==================
 
-"""
-"""
-Edward Tufte uses this example from Anscombe to show 4 datasets of x
-and y that have the same mean, standard deviation, and regression
-line, but which are qualitatively different.
+`Anscombe's quartet`_ is a group of datasets (x, y) that have the same mean,
+standard deviation, and regression line, but which are qualitatively different.
+
+It is often used to illustrate the importance of looking at a set of data
+graphically and not only relying on basic statistic properties.
+
+.. _Anscombe's quartet: https://en.wikipedia.org/wiki/Anscombe%27s_quartet
 """
 
 import matplotlib.pyplot as plt
@@ -20,30 +22,35 @@
 x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]
 y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]
 
+datasets = {
+    'I': (x, y1),
+    'II': (x, y2),
+    'III': (x, y3),
+    'IV': (x4, y4)
+}
 
-def fit(x):
-    return 3 + 0.5 * x
-
-
-fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
+fig, axs = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(6, 6),
+                        gridspec_kw={'wspace': 0.08, 'hspace': 0.08})
 axs[0, 0].set(xlim=(0, 20), ylim=(2, 14))
 axs[0, 0].set(xticks=(0, 10, 20), yticks=(4, 8, 12))
 
-xfit = np.array([np.min(x), np.max(x)])
-axs[0, 0].plot(x, y1, 'ks', xfit, fit(xfit), 'r-', lw=2)
-axs[0, 1].plot(x, y2, 'ks', xfit, fit(xfit), 'r-', lw=2)
-axs[1, 0].plot(x, y3, 'ks', xfit, fit(xfit), 'r-', lw=2)
-xfit = np.array([np.min(x4), np.max(x4)])
-axs[1, 1].plot(x4, y4, 'ks', xfit, fit(xfit), 'r-', lw=2)
-
-for ax, label in zip(axs.flat, ['I', 'II', 'III', 'IV']):
-    ax.label_outer()
-    ax.text(3, 12, label, fontsize=20)
-
-# verify the stats
-pairs = (x, y1), (x, y2), (x, y3), (x4, y4)
-for x, y in pairs:
-    print('mean=%1.2f, std=%1.2f, r=%1.2f' % (np.mean(y), np.std(y),
-                                              np.corrcoef(x, y)[0][1]))
+for ax, (label, (x, y)) in zip(axs.flat, datasets.items()):
+    ax.text(0.1, 0.9, label, fontsize=20, transform=ax.transAxes, va='top')
+    ax.tick_params(direction='in', top=True, right=True)
+    ax.plot(x, y, 'o')
+
+    # linear regression
+    p1, p0 = np.polyfit(x, y, deg=1)
+    x_lin = np.array([np.min(x), np.max(x)])
+    y_lin = p1 * x_lin + p0
+    ax.plot(x_lin, y_lin, 'r-', lw=2)
+
+    # add text box for the statistics
+    stats = (f'$\\mu$ = {np.mean(y):.2f}\n'
+             f'$\\sigma$ = {np.std(y):.2f}\n'
+             f'$r$ = {np.corrcoef(x, y)[0][1]:.2f}')
+    bbox = dict(boxstyle='round', fc='blanchedalmond', ec='orange', alpha=0.5)
+    ax.text(0.95, 0.07, stats, fontsize=9, bbox=bbox,
+            transform=ax.transAxes, horizontalalignment='right')
 
 plt.show()
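
For reference, a standalone sketch of the consistency check that the removed "verify the stats" loop performed, extended with the per-dataset np.polyfit line fit that the new plotting loop draws. The literals for x and y1..y3 are the canonical Anscombe (1973) values and are an assumption here, since those definitions sit in the lines omitted between the two hunks (the x4/y4 values shown above do match them).

import numpy as np

# Canonical Anscombe quartet values -- assumed to match the definitions on
# the omitted lines of the file; x4/y4 are identical to the ones in the diff.
x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]
y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]
x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]
y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]

datasets = {'I': (x, y1), 'II': (x, y2), 'III': (x, y3), 'IV': (x4, y4)}

for label, (xs, ys) in datasets.items():
    # Same least-squares fit the plotting loop uses for the red line.
    slope, intercept = np.polyfit(xs, ys, deg=1)
    r = np.corrcoef(xs, ys)[0, 1]
    print(f'{label}: mean={np.mean(ys):.2f}  std={np.std(ys):.2f}  '
          f'r={r:.2f}  fit: y = {intercept:.2f} + {slope:.2f} x')

Each of the four rows should print roughly mean=7.50, std=1.94 (np.std is the population standard deviation), r=0.82, and y = 3.00 + 0.50 x, i.e. the regression line the removed fit() helper hard-coded.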
|