@@ -1,13 +1,15 @@
 """
 ==================
-Anscombe's Quartet
+Anscombe's quartet
 ==================
 
-"""
-"""
-Edward Tufte uses this example from Anscombe to show 4 datasets of x
-and y that have the same mean, standard deviation, and regression
-line, but which are qualitatively different.
+`Anscombe's quartet`_ is a group of datasets (x, y) that have the same mean,
+standard deviation, and regression line, but which are qualitatively different.
+
+It is often used to illustrate the importance of looking at a set of data
+graphically and not only relying on basic statistic properties.
+
+.. _Anscombe's quartet: https://en.wikipedia.org/wiki/Anscombe%27s_quartet
 """
 
 import matplotlib.pyplot as plt
@@ -20,30 +22,35 @@
 x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]
 y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]
 
+datasets = {
+    'I': (x, y1),
+    'II': (x, y2),
+    'III': (x, y3),
+    'IV': (x4, y4)
+}
 
-def fit(x):
-    return 3 + 0.5 * x
-
-
-fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
+fig, axs = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(6, 6),
+                        gridspec_kw={'wspace': 0.08, 'hspace': 0.08})
 axs[0, 0].set(xlim=(0, 20), ylim=(2, 14))
 axs[0, 0].set(xticks=(0, 10, 20), yticks=(4, 8, 12))
 
-xfit = np.array([np.min(x), np.max(x)])
-axs[0, 0].plot(x, y1, 'ks', xfit, fit(xfit), 'r-', lw=2)
-axs[0, 1].plot(x, y2, 'ks', xfit, fit(xfit), 'r-', lw=2)
-axs[1, 0].plot(x, y3, 'ks', xfit, fit(xfit), 'r-', lw=2)
-xfit = np.array([np.min(x4), np.max(x4)])
-axs[1, 1].plot(x4, y4, 'ks', xfit, fit(xfit), 'r-', lw=2)
-
-for ax, label in zip(axs.flat, ['I', 'II', 'III', 'IV']):
-    ax.label_outer()
-    ax.text(3, 12, label, fontsize=20)
-
-# verify the stats
-pairs = (x, y1), (x, y2), (x, y3), (x4, y4)
-for x, y in pairs:
-    print('mean=%1.2f, std=%1.2f, r=%1.2f' % (np.mean(y), np.std(y),
-                                              np.corrcoef(x, y)[0][1]))
+for ax, (label, (x, y)) in zip(axs.flat, datasets.items()):
+    ax.text(0.1, 0.9, label, fontsize=20, transform=ax.transAxes, va='top')
+    ax.tick_params(direction='in', top=True, right=True)
+    ax.plot(x, y, 'o')
+
+    # linear regression
+    p1, p0 = np.polyfit(x, y, deg=1)
+    x_lin = np.array([np.min(x), np.max(x)])
+    y_lin = p1 * x_lin + p0
+    ax.plot(x_lin, y_lin, 'r-', lw=2)
+
+    # add text box for the statistics
+    stats = (f'$\\mu$ = {np.mean(y):.2f}\n'
+             f'$\\sigma$ = {np.std(y):.2f}\n'
+             f'$r$ = {np.corrcoef(x, y)[0][1]:.2f}')
+    bbox = dict(boxstyle='round', fc='blanchedalmond', ec='orange', alpha=0.5)
+    ax.text(0.95, 0.07, stats, fontsize=9, bbox=bbox,
+            transform=ax.transAxes, horizontalalignment='right')
 
 plt.show()
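
For reference, a standalone sketch of the consistency check that the removed "verify the stats" loop performed, extended with the per-dataset np.polyfit line fit that the new plotting loop draws. The literals for x and y1..y3 are the canonical Anscombe (1973) values and are an assumption here, since those definitions sit in the lines omitted between the two hunks (the x4/y4 values shown above do match them).

import numpy as np

# Canonical Anscombe quartet values -- assumed to match the definitions on
# the omitted lines of the file; x4/y4 are identical to the ones in the diff.
x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]
y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]
x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]
y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]

datasets = {'I': (x, y1), 'II': (x, y2), 'III': (x, y3), 'IV': (x4, y4)}

for label, (xs, ys) in datasets.items():
    # Same least-squares fit the plotting loop uses for the red line.
    slope, intercept = np.polyfit(xs, ys, deg=1)
    r = np.corrcoef(xs, ys)[0, 1]
    print(f'{label}: mean={np.mean(ys):.2f}  std={np.std(ys):.2f}  '
          f'r={r:.2f}  fit: y = {intercept:.2f} + {slope:.2f} x')

Each of the four rows should print roughly mean=7.50, std=1.94 (np.std is the population standard deviation), r=0.82, and y = 3.00 + 0.50 x, i.e. the regression line the removed fit() helper hard-coded.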
|