From 70df945b240a8afac913e5535f4753f6c9b5bdd9 Mon Sep 17 00:00:00 2001 From: Antony Lee Date: Sat, 7 Oct 2017 22:46:34 -0700 Subject: [PATCH 1/3] Don't sort categorical keys. --- lib/matplotlib/category.py | 49 ++++++++++++--------------- lib/matplotlib/tests/test_category.py | 24 +++++-------- 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/lib/matplotlib/category.py b/lib/matplotlib/category.py index d043c5b154a5..ea72c277b5ee 100644 --- a/lib/matplotlib/category.py +++ b/lib/matplotlib/category.py @@ -1,20 +1,19 @@ -# -*- coding: utf-8 OA-*-za -""" -catch all for categorical functions +"""Helpers for categorical data. """ from __future__ import (absolute_import, division, print_function, unicode_literals) import six +import collections +from collections import OrderedDict +from distutils.version import LooseVersion +import itertools + import numpy as np import matplotlib.units as units import matplotlib.ticker as ticker -# np 1.6/1.7 support -from distutils.version import LooseVersion -import collections - if LooseVersion(np.__version__) >= LooseVersion('1.8.0'): def shim_array(data): @@ -37,9 +36,7 @@ def shim_array(data): class StrCategoryConverter(units.ConversionInterface): @staticmethod def convert(value, unit, axis): - """Uses axis.unit_data map to encode - data as floats - """ + """Uses axis.unit_data map to encode data as floats.""" vmap = dict(zip(axis.unit_data.seq, axis.unit_data.locs)) if isinstance(value, six.string_types): @@ -86,8 +83,7 @@ class UnitData(object): spdict = {'nan': -1.0, 'inf': -2.0, '-inf': -3.0} def __init__(self, data): - """Create mapping between unique categorical values - and numerical identifier + """Create mapping between unique categorical values and numerical id. Parameters ---------- @@ -95,23 +91,20 @@ def __init__(self, data): sequence of values """ self.seq, self.locs = [], [] - self._set_seq_locs(data, 0) - - def update(self, new_data): - # so as not to conflict with spdict - value = max(max(self.locs) + 1, 0) - self._set_seq_locs(new_data, value) - - def _set_seq_locs(self, data, value): - strdata = shim_array(data) - new_s = [d for d in np.unique(strdata) if d not in self.seq] - for ns in new_s: - self.seq.append(ns) - if ns in UnitData.spdict: - self.locs.append(UnitData.spdict[ns]) + self._counter = itertools.count() + self.update(data) + + def update(self, data): + data = np.atleast_1d(shim_array(data)) + sorted_unique = list(OrderedDict(zip(data, itertools.repeat(None)))) + for s in sorted_unique: + if s in self.seq: + continue + self.seq.append(s) + if s in UnitData.spdict: + self.locs.append(UnitData.spdict[s]) else: - self.locs.append(value) - value += 1 + self.locs.append(next(self._counter)) # Connects the convertor to matplotlib diff --git a/lib/matplotlib/tests/test_category.py b/lib/matplotlib/tests/test_category.py index 6e5c43d76fb9..0080d86fe04e 100644 --- a/lib/matplotlib/tests/test_category.py +++ b/lib/matplotlib/tests/test_category.py @@ -16,8 +16,8 @@ class TestUnitData(object): testdata = [("hello world", ["hello world"], [0]), ("Здравствуйте мир", ["Здравствуйте мир"], [0]), (['A', 'A', np.nan, 'B', -np.inf, 3.14, np.inf], - ['-inf', '3.14', 'A', 'B', 'inf', 'nan'], - [-3.0, 0, 1, 2, -2.0, -1.0])] + ['A', 'nan', 'B', '-inf', '3.14', 'inf'], + [0, -1, 1, -3, 2, -2])] ids = ["single", "unicode", "mixed"] @@ -28,21 +28,13 @@ def test_unit(self, data, seq, locs): assert act.locs == locs def test_update_map(self): - data = ['a', 'd'] - oseq = ['a', 'd'] - olocs = [0, 1] + unitdata = cat.UnitData(['a', 'd']) + assert unitdata.seq == ['a', 'd'] + assert unitdata.locs == [0, 1] - data_update = ['b', 'd', 'e', np.inf] - useq = ['a', 'd', 'b', 'e', 'inf'] - ulocs = [0, 1, 2, 3, -2] - - unitdata = cat.UnitData(data) - assert unitdata.seq == oseq - assert unitdata.locs == olocs - - unitdata.update(data_update) - assert unitdata.seq == useq - assert unitdata.locs == ulocs + unitdata.update(['b', 'd', 'e', np.inf]) + assert unitdata.seq == ['a', 'd', 'b', 'e', 'inf'] + assert unitdata.locs == [0, 1, 2, 3, -2] class FakeAxis(object): From 142b78b64bff1c7ef0d58fbf5daf416c8bedf0a9 Mon Sep 17 00:00:00 2001 From: Antony Lee Date: Sun, 8 Oct 2017 23:43:26 -0700 Subject: [PATCH 2/3] Rewrite category.py. --- lib/matplotlib/category.py | 90 ++++++++++------------- lib/matplotlib/tests/test_category.py | 100 +++++++------------------- 2 files changed, 65 insertions(+), 125 deletions(-) diff --git a/lib/matplotlib/category.py b/lib/matplotlib/category.py index ea72c277b5ee..f93520838208 100644 --- a/lib/matplotlib/category.py +++ b/lib/matplotlib/category.py @@ -4,61 +4,45 @@ unicode_literals) import six -import collections from collections import OrderedDict -from distutils.version import LooseVersion import itertools +from numbers import Number import numpy as np -import matplotlib.units as units -import matplotlib.ticker as ticker +from matplotlib import units, ticker -if LooseVersion(np.__version__) >= LooseVersion('1.8.0'): - def shim_array(data): - return np.array(data, dtype=np.unicode) -else: - def shim_array(data): - if (isinstance(data, six.string_types) or - not isinstance(data, collections.Iterable)): - data = [data] - try: - data = [str(d) for d in data] - except UnicodeEncodeError: - # this yields gibberish but unicode text doesn't - # render under numpy1.6 anyway - data = [d.encode('utf-8', 'ignore').decode('utf-8') - for d in data] - return np.array(data, dtype=np.unicode) +def _to_str(s): + return s.decode("ascii") if isinstance(s, bytes) else str(s) class StrCategoryConverter(units.ConversionInterface): @staticmethod def convert(value, unit, axis): """Uses axis.unit_data map to encode data as floats.""" - vmap = dict(zip(axis.unit_data.seq, axis.unit_data.locs)) - - if isinstance(value, six.string_types): - return vmap[value] - - vals = shim_array(value) - - for lab, loc in vmap.items(): - vals[vals == lab] = loc - - return vals.astype('float') + mapping = axis.unit_data._mapping + if isinstance(value, (Number, np.number)): + return value + elif isinstance(value, (str, bytes)): + return mapping[_to_str(value)] + else: + return np.array([v if isinstance(v, (Number, np.number)) + else mapping[_to_str(v)] + for v in value], + float) @staticmethod def axisinfo(unit, axis): - majloc = StrCategoryLocator(axis.unit_data.locs) - majfmt = StrCategoryFormatter(axis.unit_data.seq) + # Note that mapping may get mutated by later calls to plotting methods, + # so the locator and formatter must dynamically recompute locs and seq. + majloc = StrCategoryLocator(axis.unit_data._mapping) + majfmt = StrCategoryFormatter(axis.unit_data._mapping) return units.AxisInfo(majloc=majloc, majfmt=majfmt) @staticmethod def default_units(data, axis): - # the conversion call stack is: - # default_units->axis_info->convert + # the conversion call stack is default_units->axis_info->convert if axis.unit_data is None: axis.unit_data = UnitData(data) else: @@ -67,21 +51,26 @@ def default_units(data, axis): class StrCategoryLocator(ticker.FixedLocator): - def __init__(self, locs): - self.locs = locs + def __init__(self, mapping): + self._mapping = mapping self.nbins = None + @property + def locs(self): + return list(self._mapping.values()) + class StrCategoryFormatter(ticker.FixedFormatter): - def __init__(self, seq): - self.seq = seq - self.offset_string = '' + def __init__(self, mapping): + self._mapping = mapping + self.offset_string = "" + @property + def seq(self): + return list(self._mapping) -class UnitData(object): - # debatable makes sense to special code missing values - spdict = {'nan': -1.0, 'inf': -2.0, '-inf': -3.0} +class UnitData(object): def __init__(self, data): """Create mapping between unique categorical values and numerical id. @@ -90,21 +79,18 @@ def __init__(self, data): data: iterable sequence of values """ - self.seq, self.locs = [], [] + self._mapping = {} self._counter = itertools.count() self.update(data) def update(self, data): - data = np.atleast_1d(shim_array(data)) - sorted_unique = list(OrderedDict(zip(data, itertools.repeat(None)))) + if isinstance(data, six.string_types): + data = [data] + sorted_unique = OrderedDict.fromkeys(map(_to_str, data)) for s in sorted_unique: - if s in self.seq: + if s in self._mapping: continue - self.seq.append(s) - if s in UnitData.spdict: - self.locs.append(UnitData.spdict[s]) - else: - self.locs.append(next(self._counter)) + self._mapping[s] = next(self._counter) # Connects the convertor to matplotlib diff --git a/lib/matplotlib/tests/test_category.py b/lib/matplotlib/tests/test_category.py index 0080d86fe04e..adbfa15db374 100644 --- a/lib/matplotlib/tests/test_category.py +++ b/lib/matplotlib/tests/test_category.py @@ -13,28 +13,26 @@ class TestUnitData(object): - testdata = [("hello world", ["hello world"], [0]), - ("Здравствуйте мир", ["Здравствуйте мир"], [0]), + testdata = [("hello world", {"hello world": 0}), + ("Здравствуйте мир", {"Здравствуйте мир": 0}), (['A', 'A', np.nan, 'B', -np.inf, 3.14, np.inf], - ['A', 'nan', 'B', '-inf', '3.14', 'inf'], - [0, -1, 1, -3, 2, -2])] - + {'A': 0, 'nan': 1, 'B': 2, '-inf': 3, '3.14': 4, 'inf': 5})] ids = ["single", "unicode", "mixed"] - @pytest.mark.parametrize("data, seq, locs", testdata, ids=ids) - def test_unit(self, data, seq, locs): - act = cat.UnitData(data) - assert act.seq == seq - assert act.locs == locs + @pytest.mark.parametrize("data, mapping", testdata, ids=ids) + def test_unit(self, data, mapping): + assert cat.UnitData(data)._mapping == mapping def test_update_map(self): unitdata = cat.UnitData(['a', 'd']) - assert unitdata.seq == ['a', 'd'] - assert unitdata.locs == [0, 1] + assert unitdata._mapping == {'a': 0, 'd': 1} + unitdata.update(['b', 'd', 'e']) + assert unitdata._mapping == {'a': 0, 'd': 1, 'b': 2, 'e': 3} + - unitdata.update(['b', 'd', 'e', np.inf]) - assert unitdata.seq == ['a', 'd', 'b', 'e', 'inf'] - assert unitdata.locs == [0, 1, 2, 3, -2] +class MockUnitData: + def __init__(self, mapping): + self._mapping = mapping class FakeAxis(object): @@ -42,28 +40,20 @@ def __init__(self, unit_data): self.unit_data = unit_data -class MockUnitData(object): - def __init__(self, data): - seq, locs = zip(*data) - self.seq = list(seq) - self.locs = list(locs) - - class TestStrCategoryConverter(object): """Based on the pandas conversion and factorization tests: ref: /pandas/tseries/tests/test_converter.py /pandas/tests/test_algos.py:TestFactorize """ - testdata = [("Здравствуйте мир", [("Здравствуйте мир", 42)], 42), - ("hello world", [("hello world", 42)], 42), + testdata = [("Здравствуйте мир", {"Здравствуйте мир": 42}, 42), + ("hello world", {"hello world": 42}, 42), (['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], - [('a', 0), ('b', 1), ('c', 2)], + {'a': 0, 'b': 1, 'c': 2}, [0, 1, 1, 0, 0, 2, 2, 2]), - (['A', 'A', np.nan, 'B', -np.inf, 3.14, np.inf], - [('nan', -1), ('3.14', 0), ('A', 1), ('B', 2), - ('-inf', 100), ('inf', 200)], - [1, 1, -1, 2, 100, 0, 200])] + (['A', 'A', 'B', 3.14], + {'A': 1, 'B': 2}, + [1, 1, 2, 3.14])] ids = ["unicode", "single", "basic", "mixed"] @pytest.fixture(autouse=True) @@ -78,7 +68,7 @@ def test_convert(self, data, unitmap, exp): np.testing.assert_array_equal(act, exp) def test_axisinfo(self): - MUD = MockUnitData([(None, None)]) + MUD = MockUnitData({None: None}) axis = FakeAxis(MUD) ax = self.cc.axisinfo(None, axis) assert isinstance(ax.majloc, cat.StrCategoryLocator) @@ -91,8 +81,8 @@ def test_default_units(self): class TestStrCategoryLocator(object): def test_StrCategoryLocator(self): - locs = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - ticks = cat.StrCategoryLocator(locs) + locs = list(range(10)) + ticks = cat.StrCategoryLocator({str(x): x for x in locs}) np.testing.assert_array_equal(ticks.tick_values(None, None), locs) @@ -137,27 +127,18 @@ def data(self): self.d = ['a', 'b', 'c', 'a'] self.dticks = [0, 1, 2] self.dlabels = ['a', 'b', 'c'] - unitmap = [('a', 0), ('b', 1), ('c', 2)] + unitmap = {'a': 0, 'b': 1, 'c': 2} self.dunit_data = MockUnitData(unitmap) - @pytest.fixture - def missing_data(self): - self.dm = ['here', np.nan, 'here', 'there'] - self.dmticks = [0, -1, 1] - self.dmlabels = ['here', 'nan', 'there'] - unitmap = [('here', 0), ('nan', -1), ('there', 1)] - self.dmunit_data = MockUnitData(unitmap) - def axis_test(self, axis, ticks, labels, unit_data): np.testing.assert_array_equal(axis.get_majorticklocs(), ticks) assert lt(axis.get_majorticklabels()) == labels - np.testing.assert_array_equal(axis.unit_data.locs, unit_data.locs) - assert axis.unit_data.seq == unit_data.seq + assert axis.unit_data._mapping == unit_data._mapping def test_plot_unicode(self): words = ['Здравствуйте', 'привет'] locs = [0.0, 1.0] - unit_data = MockUnitData(zip(words, locs)) + unit_data = MockUnitData(dict(zip(words, locs))) fig, ax = plt.subplots() ax.plot(words) @@ -173,14 +154,6 @@ def test_plot_1d(self): self.axis_test(ax.yaxis, self.dticks, self.dlabels, self.dunit_data) - @pytest.mark.usefixtures("missing_data") - def test_plot_1d_missing(self): - fig, ax = plt.subplots() - ax.plot(self.dm) - fig.canvas.draw() - - self.axis_test(ax.yaxis, self.dmticks, self.dmlabels, self.dmunit_data) - @pytest.mark.usefixtures("data") @pytest.mark.parametrize("bars", bytes_data, ids=bytes_ids) def test_plot_bytes(self, bars): @@ -200,28 +173,9 @@ def test_plot_numlike(self, bars): ax.bar(bars, counts) fig.canvas.draw() - unitmap = MockUnitData([('1', 0), ('11', 1), ('3', 2)]) + unitmap = MockUnitData({'1': 0, '11': 1, '3': 2}) self.axis_test(ax.xaxis, [0, 1, 2], ['1', '11', '3'], unitmap) - @pytest.mark.usefixtures("data", "missing_data") - def test_plot_2d(self): - fig, ax = plt.subplots() - ax.plot(self.dm, self.d) - fig.canvas.draw() - - self.axis_test(ax.xaxis, self.dmticks, self.dmlabels, self.dmunit_data) - self.axis_test(ax.yaxis, self.dticks, self.dlabels, self.dunit_data) - - @pytest.mark.usefixtures("data", "missing_data") - def test_scatter_2d(self): - - fig, ax = plt.subplots() - ax.scatter(self.dm, self.d) - fig.canvas.draw() - - self.axis_test(ax.xaxis, self.dmticks, self.dmlabels, self.dmunit_data) - self.axis_test(ax.yaxis, self.dticks, self.dlabels, self.dunit_data) - def test_plot_update(self): fig, ax = plt.subplots() @@ -232,6 +186,6 @@ def test_plot_update(self): labels = ['a', 'b', 'd', 'c'] ticks = [0, 1, 2, 3] - unit_data = MockUnitData(list(zip(labels, ticks))) + unit_data = MockUnitData(dict(zip(labels, ticks))) self.axis_test(ax.yaxis, ticks, labels, unit_data) From a607fe6564e8dd761707edd95a0f7f87f0ce5163 Mon Sep 17 00:00:00 2001 From: Antony Lee Date: Mon, 9 Oct 2017 01:11:46 -0700 Subject: [PATCH 3/3] WIP: try to make category support saner... --- lib/matplotlib/category.py | 12 ++---------- lib/matplotlib/tests/test_category.py | 4 ++-- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/lib/matplotlib/category.py b/lib/matplotlib/category.py index f93520838208..5d651b26c5b0 100644 --- a/lib/matplotlib/category.py +++ b/lib/matplotlib/category.py @@ -6,7 +6,6 @@ from collections import OrderedDict import itertools -from numbers import Number import numpy as np @@ -22,15 +21,8 @@ class StrCategoryConverter(units.ConversionInterface): def convert(value, unit, axis): """Uses axis.unit_data map to encode data as floats.""" mapping = axis.unit_data._mapping - if isinstance(value, (Number, np.number)): - return value - elif isinstance(value, (str, bytes)): - return mapping[_to_str(value)] - else: - return np.array([v if isinstance(v, (Number, np.number)) - else mapping[_to_str(v)] - for v in value], - float) + return (mapping[_to_str(value)] if np.isscalar(value) + else np.array([mapping[_to_str(v)] for v in value], float)) @staticmethod def axisinfo(unit, axis): diff --git a/lib/matplotlib/tests/test_category.py b/lib/matplotlib/tests/test_category.py index adbfa15db374..06ef63dae215 100644 --- a/lib/matplotlib/tests/test_category.py +++ b/lib/matplotlib/tests/test_category.py @@ -52,8 +52,8 @@ class TestStrCategoryConverter(object): {'a': 0, 'b': 1, 'c': 2}, [0, 1, 1, 0, 0, 2, 2, 2]), (['A', 'A', 'B', 3.14], - {'A': 1, 'B': 2}, - [1, 1, 2, 3.14])] + {'3.14': 0, 'A': 1, 'B': 2}, + [1, 1, 2, 0])] ids = ["unicode", "single", "basic", "mixed"] @pytest.fixture(autouse=True)