From 70df945b240a8afac913e5535f4753f6c9b5bdd9 Mon Sep 17 00:00:00 2001
From: Antony Lee <anntzer.lee@gmail.com>
Date: Sat, 7 Oct 2017 22:46:34 -0700
Subject: [PATCH 1/3] Don't sort categorical keys.

---
 lib/matplotlib/category.py            | 49 ++++++++++++---------------
 lib/matplotlib/tests/test_category.py | 24 +++++--------
 2 files changed, 29 insertions(+), 44 deletions(-)

diff --git a/lib/matplotlib/category.py b/lib/matplotlib/category.py
index d043c5b154a5..ea72c277b5ee 100644
--- a/lib/matplotlib/category.py
+++ b/lib/matplotlib/category.py
@@ -1,20 +1,19 @@
-# -*- coding: utf-8 OA-*-za
-"""
-catch all for categorical functions
+"""Helpers for categorical data.
 """
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 import six
 
+import collections
+from collections import OrderedDict
+from distutils.version import LooseVersion
+import itertools
+
 import numpy as np
 
 import matplotlib.units as units
 import matplotlib.ticker as ticker
 
-# np 1.6/1.7 support
-from distutils.version import LooseVersion
-import collections
-
 
 if LooseVersion(np.__version__) >= LooseVersion('1.8.0'):
     def shim_array(data):
@@ -37,9 +36,7 @@ def shim_array(data):
 class StrCategoryConverter(units.ConversionInterface):
     @staticmethod
     def convert(value, unit, axis):
-        """Uses axis.unit_data map to encode
-        data as floats
-        """
+        """Uses axis.unit_data map to encode data as floats."""
         vmap = dict(zip(axis.unit_data.seq, axis.unit_data.locs))
 
         if isinstance(value, six.string_types):
@@ -86,8 +83,7 @@ class UnitData(object):
     spdict = {'nan': -1.0, 'inf': -2.0, '-inf': -3.0}
 
     def __init__(self, data):
-        """Create mapping between unique categorical values
-        and numerical identifier
+        """Create mapping between unique categorical values and numerical id.
 
         Parameters
         ----------
@@ -95,23 +91,20 @@ def __init__(self, data):
             sequence of values
         """
         self.seq, self.locs = [], []
-        self._set_seq_locs(data, 0)
-
-    def update(self, new_data):
-        # so as not to conflict with spdict
-        value = max(max(self.locs) + 1, 0)
-        self._set_seq_locs(new_data, value)
-
-    def _set_seq_locs(self, data, value):
-        strdata = shim_array(data)
-        new_s = [d for d in np.unique(strdata) if d not in self.seq]
-        for ns in new_s:
-            self.seq.append(ns)
-            if ns in UnitData.spdict:
-                self.locs.append(UnitData.spdict[ns])
+        self._counter = itertools.count()
+        self.update(data)
+
+    def update(self, data):
+        data = np.atleast_1d(shim_array(data))
+        sorted_unique = list(OrderedDict(zip(data, itertools.repeat(None))))
+        for s in sorted_unique:
+            if s in self.seq:
+                continue
+            self.seq.append(s)
+            if s in UnitData.spdict:
+                self.locs.append(UnitData.spdict[s])
             else:
-                self.locs.append(value)
-                value += 1
+                self.locs.append(next(self._counter))
 
 
 # Connects the convertor to matplotlib
diff --git a/lib/matplotlib/tests/test_category.py b/lib/matplotlib/tests/test_category.py
index 6e5c43d76fb9..0080d86fe04e 100644
--- a/lib/matplotlib/tests/test_category.py
+++ b/lib/matplotlib/tests/test_category.py
@@ -16,8 +16,8 @@ class TestUnitData(object):
     testdata = [("hello world", ["hello world"], [0]),
                 ("Здравствуйте мир", ["Здравствуйте мир"], [0]),
                 (['A', 'A', np.nan, 'B', -np.inf, 3.14, np.inf],
-                 ['-inf', '3.14', 'A', 'B', 'inf', 'nan'],
-                 [-3.0, 0, 1, 2, -2.0, -1.0])]
+                 ['A', 'nan', 'B', '-inf', '3.14', 'inf'],
+                 [0, -1, 1, -3, 2, -2])]
 
     ids = ["single", "unicode", "mixed"]
 
@@ -28,21 +28,13 @@ def test_unit(self, data, seq, locs):
         assert act.locs == locs
 
     def test_update_map(self):
-        data = ['a', 'd']
-        oseq = ['a', 'd']
-        olocs = [0, 1]
+        unitdata = cat.UnitData(['a', 'd'])
+        assert unitdata.seq == ['a', 'd']
+        assert unitdata.locs == [0, 1]
 
-        data_update = ['b', 'd', 'e', np.inf]
-        useq = ['a', 'd', 'b', 'e', 'inf']
-        ulocs = [0, 1, 2, 3, -2]
-
-        unitdata = cat.UnitData(data)
-        assert unitdata.seq == oseq
-        assert unitdata.locs == olocs
-
-        unitdata.update(data_update)
-        assert unitdata.seq == useq
-        assert unitdata.locs == ulocs
+        unitdata.update(['b', 'd', 'e', np.inf])
+        assert unitdata.seq == ['a', 'd', 'b', 'e', 'inf']
+        assert unitdata.locs == [0, 1, 2, 3, -2]
 
 
 class FakeAxis(object):

From 142b78b64bff1c7ef0d58fbf5daf416c8bedf0a9 Mon Sep 17 00:00:00 2001
From: Antony Lee <anntzer.lee@gmail.com>
Date: Sun, 8 Oct 2017 23:43:26 -0700
Subject: [PATCH 2/3] Rewrite category.py.

---
 lib/matplotlib/category.py            |  90 ++++++++++-------------
 lib/matplotlib/tests/test_category.py | 100 +++++++-------------------
 2 files changed, 65 insertions(+), 125 deletions(-)

diff --git a/lib/matplotlib/category.py b/lib/matplotlib/category.py
index ea72c277b5ee..f93520838208 100644
--- a/lib/matplotlib/category.py
+++ b/lib/matplotlib/category.py
@@ -4,61 +4,45 @@
                         unicode_literals)
 import six
 
-import collections
 from collections import OrderedDict
-from distutils.version import LooseVersion
 import itertools
+from numbers import Number
 
 import numpy as np
 
-import matplotlib.units as units
-import matplotlib.ticker as ticker
+from matplotlib import units, ticker
 
 
-if LooseVersion(np.__version__) >= LooseVersion('1.8.0'):
-    def shim_array(data):
-        return np.array(data, dtype=np.unicode)
-else:
-    def shim_array(data):
-        if (isinstance(data, six.string_types) or
-                not isinstance(data, collections.Iterable)):
-            data = [data]
-        try:
-            data = [str(d) for d in data]
-        except UnicodeEncodeError:
-            # this yields gibberish but unicode text doesn't
-            # render under numpy1.6 anyway
-            data = [d.encode('utf-8', 'ignore').decode('utf-8')
-                    for d in data]
-        return np.array(data, dtype=np.unicode)
+def _to_str(s):  # Text form of *s*: decode bytes, stringify anything else.
+    return s.decode("utf-8") if isinstance(s, bytes) else six.text_type(s)
 
 
 class StrCategoryConverter(units.ConversionInterface):
     @staticmethod
     def convert(value, unit, axis):
         """Uses axis.unit_data map to encode data as floats."""
-        vmap = dict(zip(axis.unit_data.seq, axis.unit_data.locs))
-
-        if isinstance(value, six.string_types):
-            return vmap[value]
-
-        vals = shim_array(value)
-
-        for lab, loc in vmap.items():
-            vals[vals == lab] = loc
-
-        return vals.astype('float')
+        mapping = axis.unit_data._mapping
+        if isinstance(value, (Number, np.number)):
+            return value
+        elif isinstance(value, (str, bytes)):
+            return mapping[_to_str(value)]
+        else:
+            return np.array([v if isinstance(v, (Number, np.number))
+                             else mapping[_to_str(v)]
+                             for v in value],
+                            float)
 
     @staticmethod
     def axisinfo(unit, axis):
-        majloc = StrCategoryLocator(axis.unit_data.locs)
-        majfmt = StrCategoryFormatter(axis.unit_data.seq)
+        # Note that mapping may get mutated by later calls to plotting methods,
+        # so the locator and formatter must dynamically recompute locs and seq.
+        majloc = StrCategoryLocator(axis.unit_data._mapping)
+        majfmt = StrCategoryFormatter(axis.unit_data._mapping)
         return units.AxisInfo(majloc=majloc, majfmt=majfmt)
 
     @staticmethod
     def default_units(data, axis):
-        # the conversion call stack is:
-        # default_units->axis_info->convert
+        # the conversion call stack is default_units->axis_info->convert
         if axis.unit_data is None:
             axis.unit_data = UnitData(data)
         else:
@@ -67,21 +51,26 @@ def default_units(data, axis):
 
 
 class StrCategoryLocator(ticker.FixedLocator):
-    def __init__(self, locs):
-        self.locs = locs
+    def __init__(self, mapping):
+        self._mapping = mapping
         self.nbins = None
 
+    @property
+    def locs(self):
+        return list(self._mapping.values())
+
 
 class StrCategoryFormatter(ticker.FixedFormatter):
-    def __init__(self, seq):
-        self.seq = seq
-        self.offset_string = ''
+    def __init__(self, mapping):
+        self._mapping = mapping
+        self.offset_string = ""
 
+    @property
+    def seq(self):
+        return list(self._mapping)
 
-class UnitData(object):
-    # debatable makes sense to special code missing values
-    spdict = {'nan': -1.0, 'inf': -2.0, '-inf': -3.0}
 
+class UnitData(object):
     def __init__(self, data):
         """Create mapping between unique categorical values and numerical id.
 
@@ -90,21 +79,18 @@ def __init__(self, data):
         data: iterable
             sequence of values
         """
-        self.seq, self.locs = [], []
+        self._mapping = {}
         self._counter = itertools.count()
         self.update(data)
 
     def update(self, data):
-        data = np.atleast_1d(shim_array(data))
-        sorted_unique = list(OrderedDict(zip(data, itertools.repeat(None))))
+        if isinstance(data, six.string_types):
+            data = [data]
+        sorted_unique = OrderedDict.fromkeys(map(_to_str, data))
         for s in sorted_unique:
-            if s in self.seq:
+            if s in self._mapping:
                 continue
-            self.seq.append(s)
-            if s in UnitData.spdict:
-                self.locs.append(UnitData.spdict[s])
-            else:
-                self.locs.append(next(self._counter))
+            self._mapping[s] = next(self._counter)
 
 
 # Connects the convertor to matplotlib
diff --git a/lib/matplotlib/tests/test_category.py b/lib/matplotlib/tests/test_category.py
index 0080d86fe04e..adbfa15db374 100644
--- a/lib/matplotlib/tests/test_category.py
+++ b/lib/matplotlib/tests/test_category.py
@@ -13,28 +13,26 @@
 
 
 class TestUnitData(object):
-    testdata = [("hello world", ["hello world"], [0]),
-                ("Здравствуйте мир", ["Здравствуйте мир"], [0]),
+    testdata = [("hello world", {"hello world": 0}),
+                ("Здравствуйте мир", {"Здравствуйте мир": 0}),
                 (['A', 'A', np.nan, 'B', -np.inf, 3.14, np.inf],
-                 ['A', 'nan', 'B', '-inf', '3.14', 'inf'],
-                 [0, -1, 1, -3, 2, -2])]
-
+                 {'A': 0, 'nan': 1, 'B': 2, '-inf': 3, '3.14': 4, 'inf': 5})]
     ids = ["single", "unicode", "mixed"]
 
-    @pytest.mark.parametrize("data, seq, locs", testdata, ids=ids)
-    def test_unit(self, data, seq, locs):
-        act = cat.UnitData(data)
-        assert act.seq == seq
-        assert act.locs == locs
+    @pytest.mark.parametrize("data, mapping", testdata, ids=ids)
+    def test_unit(self, data, mapping):
+        assert cat.UnitData(data)._mapping == mapping
 
     def test_update_map(self):
         unitdata = cat.UnitData(['a', 'd'])
-        assert unitdata.seq == ['a', 'd']
-        assert unitdata.locs == [0, 1]
+        assert unitdata._mapping == {'a': 0, 'd': 1}
+        unitdata.update(['b', 'd', 'e'])
+        assert unitdata._mapping == {'a': 0, 'd': 1, 'b': 2, 'e': 3}
+
 
-        unitdata.update(['b', 'd', 'e', np.inf])
-        assert unitdata.seq == ['a', 'd', 'b', 'e', 'inf']
-        assert unitdata.locs == [0, 1, 2, 3, -2]
+class MockUnitData:
+    def __init__(self, mapping):
+        self._mapping = mapping
 
 
 class FakeAxis(object):
@@ -42,28 +40,20 @@ def __init__(self, unit_data):
         self.unit_data = unit_data
 
 
-class MockUnitData(object):
-    def __init__(self, data):
-        seq, locs = zip(*data)
-        self.seq = list(seq)
-        self.locs = list(locs)
-
-
 class TestStrCategoryConverter(object):
     """Based on the pandas conversion and factorization tests:
 
     ref: /pandas/tseries/tests/test_converter.py
          /pandas/tests/test_algos.py:TestFactorize
     """
-    testdata = [("Здравствуйте мир", [("Здравствуйте мир", 42)], 42),
-                ("hello world", [("hello world", 42)], 42),
+    testdata = [("Здравствуйте мир", {"Здравствуйте мир": 42}, 42),
+                ("hello world", {"hello world": 42}, 42),
                 (['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
-                 [('a', 0), ('b', 1), ('c', 2)],
+                 {'a': 0, 'b': 1, 'c': 2},
                  [0, 1, 1, 0, 0, 2, 2, 2]),
-                (['A', 'A', np.nan, 'B', -np.inf, 3.14, np.inf],
-                 [('nan', -1), ('3.14', 0), ('A', 1), ('B', 2),
-                  ('-inf', 100), ('inf', 200)],
-                 [1, 1, -1, 2, 100, 0, 200])]
+                (['A', 'A', 'B', 3.14],
+                 {'A': 1, 'B': 2},
+                 [1, 1, 2, 3.14])]
     ids = ["unicode", "single", "basic", "mixed"]
 
     @pytest.fixture(autouse=True)
@@ -78,7 +68,7 @@ def test_convert(self, data, unitmap, exp):
         np.testing.assert_array_equal(act, exp)
 
     def test_axisinfo(self):
-        MUD = MockUnitData([(None, None)])
+        MUD = MockUnitData({None: None})
         axis = FakeAxis(MUD)
         ax = self.cc.axisinfo(None, axis)
         assert isinstance(ax.majloc, cat.StrCategoryLocator)
@@ -91,8 +81,8 @@ def test_default_units(self):
 
 class TestStrCategoryLocator(object):
     def test_StrCategoryLocator(self):
-        locs = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        ticks = cat.StrCategoryLocator(locs)
+        locs = list(range(10))
+        ticks = cat.StrCategoryLocator({str(x): x for x in locs})
         np.testing.assert_array_equal(ticks.tick_values(None, None), locs)
 
 
@@ -137,27 +127,18 @@ def data(self):
         self.d = ['a', 'b', 'c', 'a']
         self.dticks = [0, 1, 2]
         self.dlabels = ['a', 'b', 'c']
-        unitmap = [('a', 0), ('b', 1), ('c', 2)]
+        unitmap = {'a': 0, 'b': 1, 'c': 2}
         self.dunit_data = MockUnitData(unitmap)
 
-    @pytest.fixture
-    def missing_data(self):
-        self.dm = ['here', np.nan, 'here', 'there']
-        self.dmticks = [0, -1, 1]
-        self.dmlabels = ['here', 'nan', 'there']
-        unitmap = [('here', 0), ('nan', -1), ('there', 1)]
-        self.dmunit_data = MockUnitData(unitmap)
-
     def axis_test(self, axis, ticks, labels, unit_data):
         np.testing.assert_array_equal(axis.get_majorticklocs(), ticks)
         assert lt(axis.get_majorticklabels()) == labels
-        np.testing.assert_array_equal(axis.unit_data.locs, unit_data.locs)
-        assert axis.unit_data.seq == unit_data.seq
+        assert axis.unit_data._mapping == unit_data._mapping
 
     def test_plot_unicode(self):
         words = ['Здравствуйте', 'привет']
         locs = [0.0, 1.0]
-        unit_data = MockUnitData(zip(words, locs))
+        unit_data = MockUnitData(dict(zip(words, locs)))
 
         fig, ax = plt.subplots()
         ax.plot(words)
@@ -173,14 +154,6 @@ def test_plot_1d(self):
 
         self.axis_test(ax.yaxis, self.dticks, self.dlabels, self.dunit_data)
 
-    @pytest.mark.usefixtures("missing_data")
-    def test_plot_1d_missing(self):
-        fig, ax = plt.subplots()
-        ax.plot(self.dm)
-        fig.canvas.draw()
-
-        self.axis_test(ax.yaxis, self.dmticks, self.dmlabels, self.dmunit_data)
-
     @pytest.mark.usefixtures("data")
     @pytest.mark.parametrize("bars", bytes_data, ids=bytes_ids)
     def test_plot_bytes(self, bars):
@@ -200,28 +173,9 @@ def test_plot_numlike(self, bars):
         ax.bar(bars, counts)
         fig.canvas.draw()
 
-        unitmap = MockUnitData([('1', 0), ('11', 1), ('3', 2)])
+        unitmap = MockUnitData({'1': 0, '11': 1, '3': 2})
         self.axis_test(ax.xaxis, [0, 1, 2], ['1', '11', '3'], unitmap)
 
-    @pytest.mark.usefixtures("data", "missing_data")
-    def test_plot_2d(self):
-        fig, ax = plt.subplots()
-        ax.plot(self.dm, self.d)
-        fig.canvas.draw()
-
-        self.axis_test(ax.xaxis, self.dmticks, self.dmlabels, self.dmunit_data)
-        self.axis_test(ax.yaxis, self.dticks, self.dlabels, self.dunit_data)
-
-    @pytest.mark.usefixtures("data", "missing_data")
-    def test_scatter_2d(self):
-
-        fig, ax = plt.subplots()
-        ax.scatter(self.dm, self.d)
-        fig.canvas.draw()
-
-        self.axis_test(ax.xaxis, self.dmticks, self.dmlabels, self.dmunit_data)
-        self.axis_test(ax.yaxis, self.dticks, self.dlabels, self.dunit_data)
-
     def test_plot_update(self):
         fig, ax = plt.subplots()
 
@@ -232,6 +186,6 @@ def test_plot_update(self):
 
         labels = ['a', 'b', 'd', 'c']
         ticks = [0, 1, 2, 3]
-        unit_data = MockUnitData(list(zip(labels, ticks)))
+        unit_data = MockUnitData(dict(zip(labels, ticks)))
 
         self.axis_test(ax.yaxis, ticks, labels, unit_data)

From a607fe6564e8dd761707edd95a0f7f87f0ce5163 Mon Sep 17 00:00:00 2001
From: Antony Lee <anntzer.lee@gmail.com>
Date: Mon, 9 Oct 2017 01:11:46 -0700
Subject: [PATCH 3/3] WIP: try to make category support saner...

---
 lib/matplotlib/category.py            | 12 ++----------
 lib/matplotlib/tests/test_category.py |  4 ++--
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/lib/matplotlib/category.py b/lib/matplotlib/category.py
index f93520838208..5d651b26c5b0 100644
--- a/lib/matplotlib/category.py
+++ b/lib/matplotlib/category.py
@@ -6,7 +6,6 @@
 
 from collections import OrderedDict
 import itertools
-from numbers import Number
 
 import numpy as np
 
@@ -22,15 +21,8 @@ class StrCategoryConverter(units.ConversionInterface):
     def convert(value, unit, axis):
         """Uses axis.unit_data map to encode data as floats."""
         mapping = axis.unit_data._mapping
-        if isinstance(value, (Number, np.number)):
-            return value
-        elif isinstance(value, (str, bytes)):
-            return mapping[_to_str(value)]
-        else:
-            return np.array([v if isinstance(v, (Number, np.number))
-                             else mapping[_to_str(v)]
-                             for v in value],
-                            float)
+        return (mapping[_to_str(value)] if np.isscalar(value)
+                else np.array([mapping[_to_str(v)] for v in value], float))
 
     @staticmethod
     def axisinfo(unit, axis):
diff --git a/lib/matplotlib/tests/test_category.py b/lib/matplotlib/tests/test_category.py
index adbfa15db374..06ef63dae215 100644
--- a/lib/matplotlib/tests/test_category.py
+++ b/lib/matplotlib/tests/test_category.py
@@ -52,8 +52,8 @@ class TestStrCategoryConverter(object):
                  {'a': 0, 'b': 1, 'c': 2},
                  [0, 1, 1, 0, 0, 2, 2, 2]),
                 (['A', 'A', 'B', 3.14],
-                 {'A': 1, 'B': 2},
-                 [1, 1, 2, 3.14])]
+                 {'3.14': 0, 'A': 1, 'B': 2},
+                 [1, 1, 2, 0])]
     ids = ["unicode", "single", "basic", "mixed"]
 
     @pytest.fixture(autouse=True)