bpo-35892: Fix mode() and add multimode() (#12089)

rhettinger · web-flow · commit fc06a192fdc4 · 2019-03-12T00:43:27.000-07:00
diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
@@ -37,16 +37,17 @@ Averages and measures of central location
 These functions calculate an average or typical value from a population
 or sample.
 
-=======================  =============================================
+=======================  ===============================================================
 :func:`mean`             Arithmetic mean ("average") of data.
 :func:`fmean`            Fast, floating point arithmetic mean.
 :func:`harmonic_mean`    Harmonic mean of data.
 :func:`median`           Median (middle value) of data.
 :func:`median_low`       Low median of data.
 :func:`median_high`      High median of data.
 :func:`median_grouped`   Median, or 50th percentile, of grouped data.
-:func:`mode`             Mode (most common value) of discrete data.
-=======================  =============================================
+:func:`mode`             Single mode (most common value) of discrete or nominal data.
+:func:`multimode`        List of modes (most common values) of discrete or nominal data.
+=======================  ===============================================================
 
 Measures of spread
 ------------------
@@ -287,12 +288,12 @@ However, for reading convenience, most of the examples show sorted sequences.
 
 .. function:: mode(data)
 
-   Return the most common data point from discrete or nominal *data*.  The mode
-   (when it exists) is the most typical value, and is a robust measure of
-   central location.
+   Return the single most common data point from discrete or nominal *data*.
+   The mode (when it exists) is the most typical value and serves as a
+   measure of central location.
 
-   If *data* is empty, or if there is not exactly one most common value,
-   :exc:`StatisticsError` is raised.
+   If there are multiple modes, returns the first one encountered in the *data*.
+   If *data* is empty, :exc:`StatisticsError` is raised.
 
    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:
@@ -310,6 +311,27 @@ However, for reading convenience, most of the examples show sorted sequences.
       >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
       'red'
 
+   .. versionchanged:: 3.8
+      Now handles multimodal datasets by returning the first mode encountered.
+      Formerly, it raised :exc:`StatisticsError` when more than one mode was
+      found.
+
+
+.. function:: multimode(data)
+
+   Return a list of the most frequently occurring values in the order they
+   were first encountered in the *data*.  Will return more than one result if
+   there are multiple modes or an empty list if the *data* is empty:
+
+   .. doctest::
+
+        >>> multimode('aabbbbccddddeeffffgg')
+        ['b', 'd', 'f']
+        >>> multimode('')
+        []
+
+   .. versionadded:: 3.8
+
 
 .. function:: pstdev(data, mu=None)
 
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
@@ -282,6 +282,9 @@ Added :func:`statistics.fmean` as a faster, floating point variant of
 :func:`statistics.mean()`.  (Contributed by Raymond Hettinger and
 Steven D'Aprano in :issue:`35904`.)
 
+Added :func:`statistics.multimode` that returns a list of the most
+common values. (Contributed by Raymond Hettinger in :issue:`35892`.)
+
 Added :class:`statistics.NormalDist`, a tool for creating
 and manipulating normal distributions of a random variable.
 (Contributed by Raymond Hettinger in :issue:`36018`.)
@@ -591,6 +594,11 @@ Changes in the Python API
 * The function :func:`platform.popen` has been removed, it was deprecated since
   Python 3.3: use :func:`os.popen` instead.
 
+* The :func:`statistics.mode` function no longer raises an exception
+  when given multimodal data.  Instead, it returns the first mode
+  encountered in the input data.  (Contributed by Raymond Hettinger
+  in :issue:`35892`.)
+
 * The :meth:`~tkinter.ttk.Treeview.selection` method of the
   :class:`tkinter.ttk.Treeview` class no longer takes arguments.  Using it with
   arguments for changing the selection was deprecated in Python 3.6.  Use
diff --git a/Lib/statistics.py b/Lib/statistics.py
@@ -17,6 +17,7 @@
 median_high         High median of data.
 median_grouped      Median, or 50th percentile, of grouped data.
 mode                Mode (most common value) of data.
+multimode           List of modes (most common values) of data.
 ==================  =============================================
 
 Calculate the arithmetic mean ("the average") of data:
@@ -79,10 +80,9 @@
 __all__ = [ 'StatisticsError', 'NormalDist',
             'pstdev', 'pvariance', 'stdev', 'variance',
             'median',  'median_low', 'median_high', 'median_grouped',
-            'mean', 'mode', 'harmonic_mean', 'fmean',
+            'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
           ]
 
-import collections
 import math
 import numbers
 import random
@@ -92,8 +92,8 @@
 from itertools import groupby
 from bisect import bisect_left, bisect_right
 from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
-
-
+from operator import itemgetter
+from collections import Counter
 
 # === Exceptions ===
 
@@ -249,20 +249,6 @@ def _convert(value, T):
             raise
 
 
-def _counts(data):
-    # Generate a table of sorted (value, frequency) pairs.
-    table = collections.Counter(iter(data)).most_common()
-    if not table:
-        return table
-    # Extract the values with the highest frequency.
-    maxfreq = table[0][1]
-    for i in range(1, len(table)):
-        if table[i][1] != maxfreq:
-            table = table[:i]
-            break
-    return table
-
-
 def _find_lteq(a, x):
     'Locate the leftmost value exactly equal to x'
     i = bisect_left(a, x)
@@ -334,9 +320,9 @@ def count(x):
             nonlocal n
             n += 1
             return x
-        total = math.fsum(map(count, data))
+        total = fsum(map(count, data))
     else:
-        total = math.fsum(data)
+        total = fsum(data)
     try:
         return total / n
     except ZeroDivisionError:
@@ -523,19 +509,38 @@ def mode(data):
     >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
     'red'
 
-    If there is not exactly one most common value, ``mode`` will raise
-    StatisticsError.
+    If there are multiple modes, return the first one encountered.
+
+        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
+        'red'
+
+    If *data* is empty, ``mode`` raises StatisticsError.
+
     """
-    # Generate a table of sorted (value, frequency) pairs.
-    table = _counts(data)
-    if len(table) == 1:
-        return table[0][0]
-    elif table:
-        raise StatisticsError(
-                'no unique mode; found %d equally common values' % len(table)
-                )
-    else:
-        raise StatisticsError('no mode for empty data')
+    data = iter(data)
+    try:
+        return Counter(data).most_common(1)[0][0]
+    except IndexError:
+        raise StatisticsError('no mode for empty data') from None
+
+
+def multimode(data):
+    """ Return a list of the most frequently occurring values.
+
+        Will return more than one result if there are multiple modes
+        or an empty list if *data* is empty.
+
+        >>> multimode('aabbbbbbbbcc')
+        ['b']
+        >>> multimode('aabbbbccddddeeffffgg')
+        ['b', 'd', 'f']
+        >>> multimode('')
+        []
+
+    """
+    counts = Counter(iter(data)).most_common()
+    maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
+    return list(map(itemgetter(0), mode_items))
 
 
 # === Measures of spread ===
@@ -836,6 +841,7 @@ def __repr__(self):
     from math import isclose
     from operator import add, sub, mul, truediv
     from itertools import repeat
+    import doctest
 
     g1 = NormalDist(10, 20)
     g2 = NormalDist(-5, 25)
@@ -893,3 +899,5 @@ def assert_close(G1, G2):
     S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                        Y.samples(n))])
     assert_close(X - Y, S)
+
+    print(doctest.testmod())
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
@@ -1769,7 +1769,7 @@ def prepare_data(self):
     def test_range_data(self):
         # Override test from UnivariateCommonMixin.
         data = range(20, 50, 3)
-        self.assertRaises(statistics.StatisticsError, self.func, data)
+        self.assertEqual(self.func(data), 20)
 
     def test_nominal_data(self):
         # Test mode with nominal data.
@@ -1790,13 +1790,14 @@ def test_bimodal_data(self):
         # Test mode with bimodal data.
         data = [1, 1, 2, 2, 2, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 9, 9]
         assert data.count(2) == data.count(6) == 4
-        # Check for an exception.
-        self.assertRaises(statistics.StatisticsError, self.func, data)
+        # mode() should return 2, the first encountered mode
+        self.assertEqual(self.func(data), 2)
 
-    def test_unique_data_failure(self):
-        # Test mode exception when data points are all unique.
+    def test_unique_data(self):
+        # Test mode when data points are all unique.
         data = list(range(10))
-        self.assertRaises(statistics.StatisticsError, self.func, data)
+        # mode() should return 0, the first encountered mode
+        self.assertEqual(self.func(data), 0)
 
     def test_none_data(self):
         # Test that mode raises TypeError if given None as data.
@@ -1809,8 +1810,18 @@ def test_counter_data(self):
         # Test that a Counter is treated like any other iterable.
         data = collections.Counter([1, 1, 1, 2])
         # Since the keys of the counter are treated as data points, not the
-        # counts, this should raise.
-        self.assertRaises(statistics.StatisticsError, self.func, data)
+        # counts, this should return the first mode encountered, 1
+        self.assertEqual(self.func(data), 1)
+
+
+class TestMultiMode(unittest.TestCase):
+
+    def test_basics(self):
+        multimode = statistics.multimode
+        self.assertEqual(multimode('aabbbbbbbbcc'), ['b'])
+        self.assertEqual(multimode('aabbbbccddddeeffffgg'), ['b', 'd', 'f'])
+        self.assertEqual(multimode(''), [])
+
 
 class TestFMean(unittest.TestCase):