|
| 1 | +""" |
| 2 | +Thanks Josh Hemann for the example |
| 3 | +""" |
| 4 | + |
| 5 | +import numpy as np |
| 6 | +import matplotlib.pyplot as plt |
| 7 | +from matplotlib.patches import Polygon |
| 8 | + |
| 9 | + |
# Draw one sample of size N from each of five probability distributions
# with different characteristics.  The goal is to see how well an IID
# bootstrap resample preserves the distributional properties of the
# original sample; a boxplot is one visual tool for that assessment.
numDists = 5
randomDists = ['Normal(1,1)',' Lognormal(1,1)', 'Exp(1)', 'Gumbel(6,4)',
               'Triangular(2,9,11)']
N = 500
norm = np.random.normal(1, 1, N)
logn = np.random.lognormal(1, 1, N)
expo = np.random.exponential(1, N)
gumb = np.random.gumbel(6, 4, N)
tria = np.random.triangular(2, 9, 11, N)

# Generate random indices used to resample (with replacement) the original
# data arrays.  For code brevity the same indices are reused for every array.
# randint's upper bound is exclusive, so (0, N) yields indices 0 .. N-1,
# matching the inclusive range of the deprecated random_integers(0, N-1, N).
bootstrapIndices = np.random.randint(0, N, N)
normBoot = norm[bootstrapIndices]
expoBoot = expo[bootstrapIndices]
gumbBoot = gumb[bootstrapIndices]
lognBoot = logn[bootstrapIndices]
triaBoot = tria[bootstrapIndices]

# Interleave each original sample with its bootstrap resample so that the
# two end up side by side, in the same order as the names in randomDists.
data = [norm, normBoot, logn, lognBoot, expo, expoBoot, gumb, gumbBoot,
        tria, triaBoot]
| 36 | + |
# Create the figure with a single Axes.  The enlarged bottom margin leaves
# room below the plot area.
fig = plt.figure(figsize=(10, 6))
# The window title is set through the canvas manager; the older
# canvas-level set_window_title() is no longer available in current
# Matplotlib releases.
fig.canvas.manager.set_window_title('A Boxplot Example')
ax1 = fig.add_subplot(111)
plt.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.25)

# Un-notched boxes (drawn vertically, which is the default orientation)
# with '+' flier markers and whiskers at 1.5 * IQR.
bp = plt.boxplot(data, notch=False, sym='+', whis=1.5)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+')

# Add a horizontal grid to the plot, but make it very light in color
# so we can use it for reading data values without it being distracting
ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)

# Keep the grid lines behind the plot objects
ax1.set_axisbelow(True)
ax1.set_title('Comparison of IID Bootstrap Resampling Across Five Distributions')
ax1.set_xlabel('Distribution')
ax1.set_ylabel('Value')
| 57 | + |
# Now fill the boxes with the desired colors, alternating between the
# original sample (first color) and its bootstrap resample (second color).
boxColors = ['darkkhaki','royalblue']
numBoxes = numDists*2
# Collected in box order; a real list is needed because a range object
# does not support item assignment on Python 3.
medians = []
for i in range(numBoxes):
    box = bp['boxes'][i]
    # Use the first five vertices of the box outline as the polygon
    # corners.
    boxX = list(box.get_xdata()[:5])
    boxY = list(box.get_ydata()[:5])
    # Polygon needs an (N, 2) coordinate array; a bare zip() is a one-shot
    # iterator on Python 3 and cannot be used directly.
    boxCoords = np.column_stack([boxX, boxY])
    # Alternate between Dark Khaki and Royal Blue
    k = i % 2
    boxPolygon = Polygon(boxCoords, facecolor=boxColors[k])
    ax1.add_patch(boxPolygon)
    # Now draw the median lines back over what we just filled in
    med = bp['medians'][i]
    medianX = list(med.get_xdata()[:2])
    medianY = list(med.get_ydata()[:2])
    plt.plot(medianX, medianY, 'k')
    medians.append(medianY[0])
    # Finally, overplot the sample averages, with horizontal alignment
    # in the center of each box
    plt.plot([np.average(med.get_xdata())], [np.average(data[i])],
             color='w', marker='*', markeredgecolor='k')
| 87 | + |
# Set the axes ranges and axes labels.  Each distribution name is repeated
# twice so that both the original sample and its resample are labelled.
ax1.set_xlim(0.5, numBoxes+0.5)
top = 40
bottom = -5
ax1.set_ylim(bottom, top)
xtickNames = plt.setp(ax1, xticklabels=np.repeat(randomDists, 2))
plt.setp(xtickNames, rotation=45, fontsize=8)

# Due to the Y-axis scale being different across samples, it can be
# hard to compare differences in medians across the samples. Add upper
# X-axis tick labels with the sample medians to aid in comparison
# (just use two decimal places of precision)
pos = np.arange(numBoxes)+1
upperLabels = [str(np.round(s, 2)) for s in medians]
weights = ['bold', 'semibold']
for tick in range(numBoxes):
    # Even/odd boxes alternate font weight and reuse the matching box color
    k = tick % 2
    # Place each label 5% of `top` below the upper y-limit, centered
    # over its box.
    ax1.text(pos[tick], top-(top*0.05), upperLabels[tick],
             horizontalalignment='center', size='x-small', weight=weights[k],
             color=boxColors[k])
| 108 | + |
# Finally, add a basic hand-built legend in the lower-right corner of the
# figure.  Each entry is (x, y, text, text properties) in figure
# coordinates and is drawn in the order listed.
legendEntries = [
    (0.80, 0.08, str(N) + ' Random Numbers',
     dict(backgroundcolor=boxColors[0], color='black', weight='roman',
          size='x-small')),
    (0.80, 0.045, 'IID Bootstrap Resample',
     dict(backgroundcolor=boxColors[1], color='white', weight='roman',
          size='x-small')),
    (0.80, 0.015, '*',
     dict(color='white', backgroundcolor='silver', weight='roman',
          size='medium')),
    (0.815, 0.013, ' Average Value',
     dict(color='black', weight='roman', size='x-small')),
]
for legendX, legendY, legendText, legendProps in legendEntries:
    plt.figtext(legendX, legendY, legendText, **legendProps)

# Display the finished figure
plt.show()
0 commit comments