Merge pull request #24728 from anntzer/ecdf

greglucas · web-flow · commit 5b85655c84fb · 2023-03-26T10:54:29.000-06:00
Add Axes.ecdf() method.
diff --git a/doc/api/axes_api.rst b/doc/api/axes_api.rst
@@ -110,11 +110,12 @@ Statistics
    :template: autosummary.rst
    :nosignatures:
 
+   Axes.ecdf
    Axes.boxplot
    Axes.violinplot
 
-   Axes.violin
    Axes.bxp
+   Axes.violin
 
 Binned
 ------
diff --git a/doc/api/pyplot_summary.rst b/doc/api/pyplot_summary.rst
@@ -114,6 +114,7 @@ Statistics
    :template: autosummary.rst
    :nosignatures:
 
+   ecdf
    boxplot
    violinplot
 
diff --git a/doc/users/next_whats_new/ecdf.rst b/doc/users/next_whats_new/ecdf.rst
@@ -0,0 +1,13 @@
+``Axes.ecdf``
+~~~~~~~~~~~~~
+A new Axes method, `~.Axes.ecdf`, allows plotting empirical cumulative
+distribution functions without any binning.
+
+.. plot::
+   :include-source:
+
+   import matplotlib.pyplot as plt
+   import numpy as np
+
+   fig, ax = plt.subplots()
+   ax.ecdf(np.random.randn(100))
diff --git a/galleries/examples/statistics/histogram_cumulative.py b/galleries/examples/statistics/histogram_cumulative.py
@@ -1,36 +1,27 @@
 """
-==================================================
-Using histograms to plot a cumulative distribution
-==================================================
-
-This shows how to plot a cumulative, normalized histogram as a
-step function in order to visualize the empirical cumulative
-distribution function (CDF) of a sample. We also show the theoretical CDF.
-
-A couple of other options to the ``hist`` function are demonstrated. Namely, we
-use the *density* parameter to normalize the histogram and a couple of different
-options to the *cumulative* parameter. The *density* parameter takes a boolean
-value. When ``True``, the bin heights are scaled such that the total area of
-the histogram is 1. The *cumulative* keyword argument is a little more nuanced.
-Like *density*, you can pass it True or False, but you can also pass it -1 to
-reverse the distribution.
-
-Since we're showing a normalized and cumulative histogram, these curves
-are effectively the cumulative distribution functions (CDFs) of the
-samples. In engineering, empirical CDFs are sometimes called
-"non-exceedance" curves. In other words, you can look at the
-y-value for a given-x-value to get the probability of and observation
-from the sample not exceeding that x-value. For example, the value of
-225 on the x-axis corresponds to about 0.85 on the y-axis, so there's an
-85% chance that an observation in the sample does not exceed 225.
-Conversely, setting, ``cumulative`` to -1 as is done in the
-last series for this example, creates an "exceedance" curve.
-
-Selecting different bin counts and sizes can significantly affect the
-shape of a histogram. The Astropy docs have a great section on how to
-select these parameters:
-http://docs.astropy.org/en/stable/visualization/histogram.html
-
+=================================
+Plotting cumulative distributions
+=================================
+
+This example shows how to plot the empirical cumulative distribution function
+(ECDF) of a sample. We also show the theoretical CDF.
+
+In engineering, ECDFs are sometimes called "non-exceedance" curves: the y-value
+for a given x-value is the probability that an observation from the sample is
+below that x-value. For example, the value of 220 on the x-axis corresponds to
+about 0.80 on the y-axis, so there is an 80% chance that an observation in the
+sample does not exceed 220. Conversely, the empirical *complementary*
+cumulative distribution function (the ECCDF, or "exceedance" curve) shows the
+probability y that an observation from the sample is above a value x.
+
+A direct method to plot ECDFs is `.Axes.ecdf`.  Passing ``complementary=True``
+results in an ECCDF instead.
+
+Alternatively, one can use ``ax.hist(data, density=True, cumulative=True)`` to
+first bin the data, as if plotting a histogram, and then compute and plot the
+cumulative sums of the frequencies of entries in each bin.  Here, to plot the
+ECCDF, pass ``cumulative=-1``.  Note that this approach results in an
+approximation of the E(C)CDF, whereas `.Axes.ecdf` is exact.
 """
 
 import matplotlib.pyplot as plt
@@ -40,33 +31,37 @@
 
 mu = 200
 sigma = 25
-n_bins = 50
-x = np.random.normal(mu, sigma, size=100)
+n_bins = 25
+data = np.random.normal(mu, sigma, size=100)
 
-fig, ax = plt.subplots(figsize=(8, 4))
+fig = plt.figure(figsize=(9, 4), layout="constrained")
+axs = fig.subplots(1, 2, sharex=True, sharey=True)
 
-# plot the cumulative histogram
-n, bins, patches = ax.hist(x, n_bins, density=True, histtype='step',
-                           cumulative=True, label='Empirical')
-
-# Add a line showing the expected distribution.
+# Cumulative distributions.
+axs[0].ecdf(data, label="CDF")
+n, bins, patches = axs[0].hist(data, n_bins, density=True, histtype="step",
+                               cumulative=True, label="Cumulative histogram")
+x = np.linspace(data.min(), data.max())
 y = ((1 / (np.sqrt(2 * np.pi) * sigma)) *
-     np.exp(-0.5 * (1 / sigma * (bins - mu))**2))
+     np.exp(-0.5 * (1 / sigma * (x - mu))**2))
 y = y.cumsum()
 y /= y[-1]
-
-ax.plot(bins, y, 'k--', linewidth=1.5, label='Theoretical')
-
-# Overlay a reversed cumulative histogram.
-ax.hist(x, bins=bins, density=True, histtype='step', cumulative=-1,
-        label='Reversed emp.')
-
-# tidy up the figure
-ax.grid(True)
-ax.legend(loc='right')
-ax.set_title('Cumulative step histograms')
-ax.set_xlabel('Annual rainfall (mm)')
-ax.set_ylabel('Likelihood of occurrence')
+axs[0].plot(x, y, "k--", linewidth=1.5, label="Theory")
+
+# Complementary cumulative distributions.
+axs[1].ecdf(data, complementary=True, label="CCDF")
+axs[1].hist(data, bins=bins, density=True, histtype="step", cumulative=-1,
+            label="Reversed cumulative histogram")
+axs[1].plot(x, 1 - y, "k--", linewidth=1.5, label="Theory")
+
+# Label the figure.
+fig.suptitle("Cumulative distributions")
+for ax in axs:
+    ax.grid(True)
+    ax.legend()
+    ax.set_xlabel("Annual rainfall (mm)")
+    ax.set_ylabel("Probability of occurrence")
+    ax.label_outer()
 
 plt.show()
 
@@ -78,3 +73,4 @@
 #    in this example:
 #
 #    - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist`
+#    - `matplotlib.axes.Axes.ecdf` / `matplotlib.pyplot.ecdf`
diff --git a/galleries/plot_types/stats/ecdf.py b/galleries/plot_types/stats/ecdf.py
@@ -0,0 +1,21 @@
+"""
+=======
+ecdf(x)
+=======
+
+See `~matplotlib.axes.Axes.ecdf`.
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+plt.style.use('_mpl-gallery')
+
+# make data
+np.random.seed(1)
+x = 4 + np.random.normal(0, 1.5, 200)
+
+# plot:
+fig, ax = plt.subplots()
+ax.ecdf(x)
+plt.show()
diff --git a/lib/matplotlib/axes/_axes.py b/lib/matplotlib/axes/_axes.py
@@ -7112,6 +7112,108 @@ def hist2d(self, x, y, bins=10, range=None, density=False, weights=None,
 
         return h, xedges, yedges, pc
 
+    @_preprocess_data(replace_names=["x", "weights"], label_namer="x")
+    @_docstring.dedent_interpd
+    def ecdf(self, x, weights=None, *, complementary=False,
+             orientation="vertical", compress=False, **kwargs):
+        """
+        Compute and plot the empirical cumulative distribution function of *x*.
+
+        .. versionadded:: 3.8
+
+        Parameters
+        ----------
+        x : 1d array-like
+            The input data.  Infinite entries are kept (and move the relevant
+            end of the ecdf from 0/1), but NaNs and masked values are errors.
+
+        weights : 1d array-like or None, default: None
+            The weights of the entries; must have the same shape as *x*.
+            NaN data points are rejected rather than dropped (see *x*), so
+            every weight is used; the weights are normalized to sum to 1.
+            If unset, all entries have the same weight.
+
+        complementary : bool, default: False
+            Whether to plot a cumulative distribution function, which increases
+            from 0 to 1 (the default), or a complementary cumulative
+            distribution function, which decreases from 1 to 0.
+
+        orientation : {"vertical", "horizontal"}, default: "vertical"
+            Whether the entries are plotted along the x-axis ("vertical", the
+            default) or the y-axis ("horizontal").  This parameter takes the
+            same values as in `~.Axes.hist`.
+
+        compress : bool, default: False
+            Whether multiple entries with the same values are grouped together
+            (with a summed weight) before plotting.  This is mainly useful if
+            *x* contains many identical data points, to decrease the rendering
+            complexity of the plot. If *x* contains no duplicate points, this
+            has no effect and just uses some time and memory.
+
+        Other Parameters
+        ----------------
+        data : indexable object, optional
+            DATA_PARAMETER_PLACEHOLDER
+
+        **kwargs
+            Keyword arguments control the `.Line2D` properties:
+
+            %(Line2D:kwdoc)s
+
+        Returns
+        -------
+        `.Line2D`
+
+        Notes
+        -----
+        The ecdf plot can be thought of as a cumulative histogram with one bin
+        per data entry; i.e. it reports on the entire dataset without any
+        arbitrary binning.
+
+        If *x* contains NaNs or masked entries, either remove them first from
+        the array (if they should not be taken into account), or replace them
+        by -inf or +inf (if they should be sorted at the beginning or the end
+        of the array).
+        """
+        _api.check_in_list(["horizontal", "vertical"], orientation=orientation)
+        if "drawstyle" in kwargs or "ds" in kwargs:
+            raise TypeError("Cannot pass 'drawstyle' or 'ds' to ecdf()")
+        if np.ma.getmask(x).any():
+            raise ValueError("ecdf() does not support masked entries")
+        x = np.asarray(x)
+        if np.isnan(x).any():
+            raise ValueError("ecdf() does not support NaNs")
+        argsort = np.argsort(x)
+        x = x[argsort]
+        if weights is None:
+            # Ensure that we end at exactly 1, avoiding floating point errors.
+            cum_weights = (1 + np.arange(len(x))) / len(x)
+        else:
+            weights = np.take(weights, argsort)   # Reorder weights like we reordered x.
+            cum_weights = np.cumsum(weights / np.sum(weights))
+        if compress:
+            # Get the last index of each run of equal x (where the cdf is complete).
+            compress_idxs = [*(x[:-1] != x[1:]).nonzero()[0], len(x) - 1]
+            x = x[compress_idxs]
+            cum_weights = cum_weights[compress_idxs]
+        if orientation == "vertical":
+            if not complementary:
+                line, = self.plot([x[0], *x], [0, *cum_weights],
+                                  drawstyle="steps-post", **kwargs)
+            else:
+                line, = self.plot([*x, x[-1]], [1, *1 - cum_weights],
+                                  drawstyle="steps-pre", **kwargs)
+            line.sticky_edges.y[:] = [0, 1]
+        else:  # orientation == "horizontal":
+            if not complementary:
+                line, = self.plot([0, *cum_weights], [x[0], *x],
+                                  drawstyle="steps-pre", **kwargs)
+            else:
+                line, = self.plot([1, *1 - cum_weights], [*x, x[-1]],
+                                  drawstyle="steps-post", **kwargs)
+            line.sticky_edges.x[:] = [0, 1]
+        return line
+
     @_preprocess_data(replace_names=["x"])
     @_docstring.dedent_interpd
     def psd(self, x, NFFT=None, Fs=None, Fc=None, detrend=None,
diff --git a/lib/matplotlib/pyplot.py b/lib/matplotlib/pyplot.py
@@ -2515,6 +2515,17 @@ def csd(
         **({"data": data} if data is not None else {}), **kwargs)
 
 
+# Autogenerated by boilerplate.py.  Do not edit as changes will be lost.
+@_copy_docstring_and_deprecators(Axes.ecdf)
+def ecdf(
+        x, weights=None, *, complementary=False,
+        orientation='vertical', compress=False, data=None, **kwargs):
+    return gca().ecdf(
+        x, weights=weights, complementary=complementary,
+        orientation=orientation, compress=compress,
+        **({"data": data} if data is not None else {}), **kwargs)
+
+
 # Autogenerated by boilerplate.py.  Do not edit as changes will be lost.
 @_copy_docstring_and_deprecators(Axes.errorbar)
 def errorbar(
diff --git a/lib/matplotlib/tests/test_axes.py b/lib/matplotlib/tests/test_axes.py
@@ -8448,3 +8448,36 @@ def test_rc_axes_label_formatting():
     assert ax.xaxis.label.get_color() == 'red'
     assert ax.xaxis.label.get_fontsize() == 20
     assert ax.xaxis.label.get_fontweight() == 'bold'
+
+
+@check_figures_equal(extensions=["png"])
+def test_ecdf(fig_test, fig_ref):
+    data = np.array([0, -np.inf, -np.inf, np.inf, 1, 1, 2])
+    weights = range(len(data))
+    axs_test = fig_test.subplots(1, 2)
+    for ax, orientation in zip(axs_test, ["vertical", "horizontal"]):
+        l0 = ax.ecdf(data, orientation=orientation)
+        l1 = ax.ecdf("d", "w", data={"d": np.ma.array(data), "w": weights},
+                     orientation=orientation,
+                     complementary=True, compress=True, ls=":")
+        assert len(l0.get_xdata()) == (~np.isnan(data)).sum() + 1
+        assert len(l1.get_xdata()) == len({*data[~np.isnan(data)]}) + 1
+    axs_ref = fig_ref.subplots(1, 2)
+    axs_ref[0].plot([-np.inf, -np.inf, -np.inf, 0, 1, 1, 2, np.inf],
+                    np.arange(8) / 7, ds="steps-post")
+    axs_ref[0].plot([-np.inf, 0, 1, 2, np.inf, np.inf],
+                    np.array([21, 18, 18, 9, 3, 0]) / 21,
+                    ds="steps-pre", ls=":")
+    axs_ref[1].plot(np.arange(8) / 7,
+                    [-np.inf, -np.inf, -np.inf, 0, 1, 1, 2, np.inf],
+                    ds="steps-pre")
+    axs_ref[1].plot(np.array([21, 18, 18, 9, 3, 0]) / 21,
+                    [-np.inf, 0, 1, 2, np.inf, np.inf],
+                    ds="steps-post", ls=":")
+
+
+def test_ecdf_invalid():
+    with pytest.raises(ValueError):
+        plt.ecdf([1, np.nan])
+    with pytest.raises(ValueError):
+        plt.ecdf(np.ma.array([1, 2], mask=[True, False]))
diff --git a/tools/boilerplate.py b/tools/boilerplate.py
@@ -246,6 +246,7 @@ def boilerplate_gen():
         'contour',
         'contourf',
         'csd',
+        'ecdf',
         'errorbar',
         'eventplot',
         'fill',