From 5392fc1996c133460d911abe7bc66a587d1fca51 Mon Sep 17 00:00:00 2001 From: Antony Lee Date: Sun, 24 Oct 2021 02:31:26 +0200 Subject: [PATCH] Use named groups in mathtext parser. This makes the definitions much easier to read, by removing the need to use Group() and Suppress(): previously, these would be used to nest the parsed tokens and remove the unnecessary ones (e.g, braces), but such tokens can also easily be ignored by giving names to the tokens we use (instead of accessing them by position). (See e.g. the implementation of subsuper, which is also somewhat simplified.) Also remove a few other pyparsing combinators that are not needed: Combine (just concatenate the strings ourselves), FollowedBy (use lookahead regexes), Literal (use plain strings, in particular braces which are very common). Also remove right_delim_safe, as self._right_delim already includes the backslash-escaped closing brace rather than the unescaped one (right_delim_safe dates back to when it didn't). The error message for `a^2_2^2` now changed to "double superscript" as we now simply check for subscripts and superscripts one at a time, and fail when we see either two subscript or two superscripts, emitting the corresponding error. --- lib/matplotlib/_mathtext.py | 392 ++++++++++---------------- lib/matplotlib/tests/test_mathtext.py | 7 +- 2 files changed, 153 insertions(+), 246 deletions(-) diff --git a/lib/matplotlib/_mathtext.py b/lib/matplotlib/_mathtext.py index 3feb70f59736..88619f2485a8 100644 --- a/lib/matplotlib/_mathtext.py +++ b/lib/matplotlib/_mathtext.py @@ -12,9 +12,9 @@ import numpy as np from pyparsing import ( - Combine, Empty, Forward, Group, Literal, oneOf, OneOrMore, - Optional, ParseBaseException, ParseFatalException, ParserElement, - ParseResults, QuotedString, Regex, StringEnd, Suppress, White, ZeroOrMore) + Empty, Forward, NotAny, oneOf, OneOrMore, Optional, ParseBaseException, + ParseFatalException, ParserElement, ParseResults, QuotedString, Regex, + StringEnd, ZeroOrMore, pyparsing_common) import matplotlib as mpl from . import _api, cbook @@ -1948,10 +1948,8 @@ def __init__(self): # All forward declarations are here p.accent = Forward() p.ambi_delim = Forward() - p.apostrophe = Forward() p.auto_delim = Forward() p.binom = Forward() - p.bslash = Forward() p.customspace = Forward() p.end_group = Forward() p.float_literal = Forward() @@ -1961,11 +1959,7 @@ def __init__(self): p.function = Forward() p.genfrac = Forward() p.group = Forward() - p.int_literal = Forward() - p.latexfont = Forward() - p.lbracket = Forward() p.left_delim = Forward() - p.lbrace = Forward() p.main = Forward() p.math = Forward() p.math_string = Forward() @@ -1974,11 +1968,8 @@ def __init__(self): p.overline = Forward() p.overset = Forward() p.placeable = Forward() - p.rbrace = Forward() - p.rbracket = Forward() p.required_group = Forward() p.right_delim = Forward() - p.right_delim_safe = Forward() p.simple = Forward() p.simple_group = Forward() p.single_symbol = Forward() @@ -1987,142 +1978,99 @@ def __init__(self): p.sqrt = Forward() p.start_group = Forward() p.subsuper = Forward() - p.subsuperop = Forward() p.symbol = Forward() p.symbol_name = Forward() p.token = Forward() p.underset = Forward() p.unknown_symbol = Forward() - # Set names on everything -- very useful for debugging for key, val in vars(p).items(): if not key.startswith('_'): + # Set names on everything -- very useful for debugging val.setName(key) + # Set actions + if hasattr(self, key): + val.setParseAction(getattr(self, key)) p.float_literal <<= Regex(r"[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)") - p.int_literal <<= Regex("[-+]?[0-9]+") - - p.lbrace <<= Literal('{').suppress() - p.rbrace <<= Literal('}').suppress() - p.lbracket <<= Literal('[').suppress() - p.rbracket <<= Literal(']').suppress() - p.bslash <<= Literal('\\') - - p.space <<= oneOf(list(self._space_widths)) - p.customspace <<= ( - Suppress(Literal(r'\hspace')) - - ((p.lbrace + p.float_literal + p.rbrace) - | Error(r"Expected \hspace{n}")) - ) - unicode_range = "\U00000080-\U0001ffff" + p.space <<= oneOf(self._space_widths)("space") + p.customspace <<= r"\hspace" - ( + "{" + p.float_literal("space") + "}" + | Error(r"Expected \hspace{n}")) + p.single_symbol <<= Regex( r"([a-zA-Z0-9 +\-*/<>=:,.;!\?&'@()\[\]|%s])|(\\[%%${}\[\]_|])" % - unicode_range) - p.accentprefixed <<= Suppress(p.bslash) + oneOf(self._accentprefixed) + "\U00000080-\U0001ffff" # unicode range + )("sym") + p.accentprefixed <<= "\\" + oneOf(self._accentprefixed)("sym") p.symbol_name <<= ( - Combine(p.bslash + oneOf(list(tex2uni))) - + Suppress(Regex("(?=[^A-Za-z]|$)").leaveWhitespace()) - ) + oneOf([rf"\{sym}" for sym in tex2uni])("sym") + + Regex("(?=[^A-Za-z]|$)").leaveWhitespace()) p.symbol <<= (p.single_symbol | p.symbol_name).leaveWhitespace() - p.apostrophe <<= Regex("'+") - - p.accent <<= Group( - Suppress(p.bslash) - + oneOf([*self._accent_map, *self._wide_accents]) - + Suppress(Optional(White())) - - p.placeable - ) - - p.function <<= ( - Suppress(p.bslash) - + oneOf(list(self._function_names)) - ) - - p.start_group <<= Optional(p.latexfont) + p.lbrace - p.end_group <<= p.rbrace.copy() - p.simple_group <<= Group(p.lbrace + ZeroOrMore(p.token) + p.rbrace) - p.required_group <<= Group(p.lbrace + OneOrMore(p.token) + p.rbrace) - p.group <<= Group( - p.start_group + ZeroOrMore(p.token) + p.end_group - ) - - p.font <<= Suppress(p.bslash) + oneOf(list(self._fontnames)) - p.latexfont <<= ( - Suppress(p.bslash) - + oneOf(['math' + x for x in self._fontnames]) - ) - - p.frac <<= Group( - Suppress(Literal(r"\frac")) - - ((p.required_group + p.required_group) - | Error(r"Expected \frac{num}{den}")) - ) - - p.dfrac <<= Group( - Suppress(Literal(r"\dfrac")) - - ((p.required_group + p.required_group) - | Error(r"Expected \dfrac{num}{den}")) - ) - - p.binom <<= Group( - Suppress(Literal(r"\binom")) - - ((p.required_group + p.required_group) - | Error(r"Expected \binom{num}{den}")) - ) - - p.ambi_delim <<= oneOf(list(self._ambi_delim)) - p.left_delim <<= oneOf(list(self._left_delim)) - p.right_delim <<= oneOf(list(self._right_delim)) - p.right_delim_safe <<= oneOf([*(self._right_delim - {'}'}), r'\}']) - - p.genfrac <<= Group( - Suppress(Literal(r"\genfrac")) - - (((p.lbrace - + Optional(p.ambi_delim | p.left_delim, default='') - + p.rbrace) - + (p.lbrace - + Optional(p.ambi_delim | p.right_delim_safe, default='') - + p.rbrace) - + (p.lbrace + p.float_literal + p.rbrace) - + p.simple_group + p.required_group + p.required_group) - | Error("Expected " - r"\genfrac{ldelim}{rdelim}{rulesize}{style}{num}{den}")) - ) - - p.sqrt <<= Group( - Suppress(Literal(r"\sqrt")) - - ((Group(Optional( - p.lbracket + OneOrMore(~p.rbracket + p.token) + p.rbracket)) - + p.required_group) - | Error("Expected \\sqrt{value}")) - ) - - p.overline <<= Group( - Suppress(Literal(r"\overline")) - - (p.required_group | Error("Expected \\overline{value}")) - ) - - p.overset <<= Group( - Suppress(Literal(r"\overset")) - - ((p.simple_group + p.simple_group) - | Error("Expected \\overset{body}{annotation}")) - ) - - p.underset <<= Group( - Suppress(Literal(r"\underset")) - - ((p.simple_group + p.simple_group) - | Error("Expected \\underset{body}{annotation}")) - ) - - p.unknown_symbol <<= Combine(p.bslash + Regex("[A-Za-z]*")) - - p.operatorname <<= Group( - Suppress(Literal(r"\operatorname")) - - ((p.lbrace + ZeroOrMore(p.simple | p.unknown_symbol) + p.rbrace) - | Error("Expected \\operatorname{value}")) - ) + p.accent <<= ( + "\\" + + oneOf([*self._accent_map, *self._wide_accents])("accent") + - p.placeable("sym")) + + p.function <<= "\\" + oneOf(self._function_names)("name") + p.operatorname <<= r"\operatorname" - ( + "{" + ZeroOrMore(p.simple | p.unknown_symbol)("name") + "}" + | Error(r"Expected \operatorname{name}")) + + p.start_group <<= ( + Optional(r"\math" + oneOf(self._fontnames)("font")) + "{") + p.end_group <<= "}" + p.group <<= ( + p.start_group + ZeroOrMore(p.token)("group") + p.end_group) + + p.font <<= "\\" + oneOf(self._fontnames)("font") + + p.simple_group <<= "{" + ZeroOrMore(p.token)("group") + "}" + p.required_group <<= "{" + OneOrMore(p.token)("group") + "}" + + p.frac <<= r"\frac" - ( + p.required_group("num") + p.required_group("den") + | Error(r"Expected \frac{num}{den}")) + p.dfrac <<= r"\dfrac" - ( + p.required_group("num") + p.required_group("den") + | Error(r"Expected \dfrac{num}{den}")) + p.binom <<= r"\binom" - ( + p.required_group("num") + p.required_group("den") + | Error(r"Expected \binom{num}{den}")) + + p.ambi_delim <<= oneOf(self._ambi_delim) + p.left_delim <<= oneOf(self._left_delim) + p.right_delim <<= oneOf(self._right_delim) + + p.genfrac <<= r"\genfrac" - ( + "{" + Optional(p.ambi_delim | p.left_delim)("ldelim") + "}" + + "{" + Optional(p.ambi_delim | p.right_delim)("rdelim") + "}" + + "{" + p.float_literal("rulesize") + "}" + + p.simple_group("style") + + p.required_group("num") + + p.required_group("den") + | Error("Expected " + r"\genfrac{ldelim}{rdelim}{rulesize}{style}{num}{den}")) + + p.sqrt <<= r"\sqrt" - ( + Optional("[" + OneOrMore(NotAny("]") + p.token)("root") + "]") + + p.required_group("value") + | Error(r"Expected \sqrt{value}")) + + p.overline <<= r"\overline" - ( + p.required_group("body") + | Error(r"Expected \overline{value}")) + + p.overset <<= r"\overset" - ( + p.simple_group("annotation") + p.simple_group("body") + | Error(r"Expected \overset{annotation}{body}")) + p.underset <<= r"\underset" - ( + p.simple_group("annotation") + p.simple_group("body") + | Error(r"Expected \underset{annotation}{body}")) + + p.unknown_symbol <<= Regex(r"\\[A-Za-z]*")("name") p.placeable <<= ( p.accentprefixed # Must be before accent so named symbols that are @@ -2131,6 +2079,7 @@ def __init__(self): | p.symbol # Must be third to catch all named symbols and single # chars not in a group | p.function + | p.operatorname | p.group | p.frac | p.dfrac @@ -2140,7 +2089,6 @@ def __init__(self): | p.underset | p.sqrt | p.overline - | p.operatorname ) p.simple <<= ( @@ -2150,14 +2098,12 @@ def __init__(self): | p.subsuper ) - p.subsuperop <<= oneOf(["_", "^"]) - - p.subsuper <<= Group( - (Optional(p.placeable) - + OneOrMore(p.subsuperop - p.placeable) - + Optional(p.apostrophe)) - | (p.placeable + Optional(p.apostrophe)) - | p.apostrophe + p.subsuper <<= ( + (Optional(p.placeable)("nucleus") + + OneOrMore(oneOf(["_", "^"]) - p.placeable)("subsuper") + + Regex("'*")("apostrophes")) + | Regex("'+")("apostrophes") + | (p.placeable("nucleus") + Regex("'*")("apostrophes")) ) p.token <<= ( @@ -2167,12 +2113,12 @@ def __init__(self): ) p.auto_delim <<= ( - Suppress(Literal(r"\left")) - - ((p.left_delim | p.ambi_delim) + r"\left" + - ((p.left_delim | p.ambi_delim)("left") | Error("Expected a delimiter")) - + Group(ZeroOrMore(p.simple | p.auto_delim)) - + Suppress(Literal(r"\right")) - - ((p.right_delim | p.ambi_delim) + + ZeroOrMore(p.simple | p.auto_delim)("mid") + + r"\right" + - ((p.right_delim | p.ambi_delim)("right") | Error("Expected a delimiter")) ) @@ -2186,12 +2132,6 @@ def __init__(self): p.non_math + ZeroOrMore(p.math_string + p.non_math) + StringEnd() ) - # Set actions - for key, val in vars(p).items(): - if not key.startswith('_'): - if hasattr(self, key): - val.setParseAction(getattr(self, key)) - self._expression = p.main self._math_expression = p.math @@ -2290,6 +2230,8 @@ def non_math(self, s, loc, toks): self.get_state().font = mpl.rcParams['mathtext.default'] return [hlist] + float_literal = staticmethod(pyparsing_common.convertToFloat) + def _make_space(self, percentage): # All spaces are relative to em width state = self.get_state() @@ -2319,16 +2261,15 @@ def _make_space(self, percentage): } def space(self, s, loc, toks): - tok, = toks - num = self._space_widths[tok] + num = self._space_widths[toks["space"]] box = self._make_space(num) return [box] def customspace(self, s, loc, toks): - return [self._make_space(float(toks[0]))] + return [self._make_space(toks["space"])] def symbol(self, s, loc, toks): - c, = toks + c = toks["sym"] try: char = Char(c, self.get_state()) except ValueError as err: @@ -2368,8 +2309,7 @@ def symbol(self, s, loc, toks): accentprefixed = symbol def unknown_symbol(self, s, loc, toks): - c, = toks - raise ParseFatalException(s, loc, "Unknown symbol: %s" % c) + raise ParseFatalException(s, loc, f"Unknown symbol: {toks['name']}") _accent_map = { r'hat': r'\circumflexaccent', @@ -2405,7 +2345,8 @@ def unknown_symbol(self, s, loc, toks): def accent(self, s, loc, toks): state = self.get_state() thickness = state.get_current_underline_thickness() - (accent, sym), = toks + accent = toks["accent"] + sym = toks["sym"] if accent in self._wide_accents: accent_box = AutoWidthChar( '\\' + accent, sym.width, state, char_class=Accent) @@ -2424,7 +2365,7 @@ def accent(self, s, loc, toks): def function(self, s, loc, toks): hlist = self.operatorname(s, loc, toks) - hlist.function_name, = toks + hlist.function_name = toks["name"] return hlist def operatorname(self, s, loc, toks): @@ -2433,7 +2374,8 @@ def operatorname(self, s, loc, toks): state.font = 'rm' hlist_list = [] # Change the font of Chars, but leave Kerns alone - for c in toks[0]: + name = toks["name"] + for c in name: if isinstance(c, Char): c.font = 'rm' c._update_metrics() @@ -2442,14 +2384,14 @@ def operatorname(self, s, loc, toks): hlist_list.append(Char(c, state)) else: hlist_list.append(c) - next_char_loc = loc + len(toks[0]) + 1 - if isinstance(toks[0], ParseResults): + next_char_loc = loc + len(name) + 1 + if isinstance(name, ParseResults): next_char_loc += len('operatorname{}') next_char = next((c for c in s[next_char_loc:] if c != ' '), '') delimiters = self._left_delim | self._ambi_delim | self._right_delim delimiters |= {'^', '_'} if (next_char not in delimiters and - toks[0] not in self._overunder_functions): + name not in self._overunder_functions): # Add thin space except when followed by parenthesis, bracket, etc. hlist_list += [self._make_space(self._space_widths[r'\,'])] self.pop_state() @@ -2458,22 +2400,25 @@ def operatorname(self, s, loc, toks): def start_group(self, s, loc, toks): self.push_state() # Deal with LaTeX-style font tokens - if len(toks): - self.get_state().font = toks[0][4:] + if toks.get("font"): + self.get_state().font = toks.get("font") return [] def group(self, s, loc, toks): - grp = Hlist(toks[0]) + grp = Hlist(toks.get("group", [])) return [grp] - required_group = simple_group = group + + def required_group(self, s, loc, toks): + return Hlist(toks.get("group", [])) + + simple_group = required_group def end_group(self, s, loc, toks): self.pop_state() return [] def font(self, s, loc, toks): - name, = toks - self.get_state().font = name + self.get_state().font = toks["font"] return [] def is_overunder(self, nucleus): @@ -2497,60 +2442,24 @@ def is_between_brackets(self, s, loc): return False def subsuper(self, s, loc, toks): - assert len(toks) == 1 - - nucleus = None - sub = None - super = None - - # Pick all of the apostrophes out, including first apostrophes that - # have been parsed as characters - napostrophes = 0 - new_toks = [] - for tok in toks[0]: - if isinstance(tok, str) and tok not in ('^', '_'): - napostrophes += len(tok) - elif isinstance(tok, Char) and tok.c == "'": - napostrophes += 1 - else: - new_toks.append(tok) - toks = new_toks - - if len(toks) == 0: - assert napostrophes - nucleus = Hbox(0.0) - elif len(toks) == 1: - if not napostrophes: - return toks[0] # .asList() - else: - nucleus = toks[0] - elif len(toks) in (2, 3): - # single subscript or superscript - nucleus = toks[0] if len(toks) == 3 else Hbox(0.0) - op, next = toks[-2:] + nucleus = toks.get("nucleus", Hbox(0)) + subsuper = toks.get("subsuper", []) + napostrophes = len(toks.get("apostrophes", [])) + + if not subsuper and not napostrophes: + return nucleus + + sub = super = None + while subsuper: + op, arg, *subsuper = subsuper if op == '_': - sub = next - else: - super = next - elif len(toks) in (4, 5): - # subscript and superscript - nucleus = toks[0] if len(toks) == 5 else Hbox(0.0) - op1, next1, op2, next2 = toks[-4:] - if op1 == op2: - if op1 == '_': + if sub is not None: raise ParseFatalException("Double subscript") - else: - raise ParseFatalException("Double superscript") - if op1 == '_': - sub = next1 - super = next2 + sub = arg else: - super = next1 - sub = next2 - else: - raise ParseFatalException( - "Subscript/superscript sequence is too long. " - "Use braces { } to remove ambiguity.") + if super is not None: + raise ParseFatalException("Double superscript") + super = arg state = self.get_state() rule_thickness = state.font_output.get_underline_thickness( @@ -2562,7 +2471,7 @@ def subsuper(self, s, loc, toks): if super is None: super = Hlist([]) for i in range(napostrophes): - super.children.extend(self.symbol(s, loc, ['\\prime'])) + super.children.extend(self.symbol(s, loc, {"sym": "\\prime"})) # kern() and hpack() needed to get the metrics right after # extending super.kern() @@ -2691,8 +2600,6 @@ def _genfrac(self, ldelim, rdelim, rule, style, num, den): state = self.get_state() thickness = state.get_current_underline_thickness() - rule = float(rule) - if style is not self._MathStyle.DISPLAYSTYLE: num.shrink() den.shrink() @@ -2728,28 +2635,29 @@ def _genfrac(self, ldelim, rdelim, rule, style, num, den): return result def genfrac(self, s, loc, toks): - args, = toks - return self._genfrac(*args) + return self._genfrac( + toks.get("ldelim", ""), toks.get("rdelim", ""), + toks["rulesize"], toks["style"], + toks["num"], toks["den"]) def frac(self, s, loc, toks): - thickness = self.get_state().get_current_underline_thickness() - (num, den), = toks - return self._genfrac('', '', thickness, self._MathStyle.TEXTSTYLE, - num, den) + return self._genfrac( + "", "", self.get_state().get_current_underline_thickness(), + self._MathStyle.TEXTSTYLE, toks["num"], toks["den"]) def dfrac(self, s, loc, toks): - thickness = self.get_state().get_current_underline_thickness() - (num, den), = toks - return self._genfrac('', '', thickness, self._MathStyle.DISPLAYSTYLE, - num, den) + return self._genfrac( + "", "", self.get_state().get_current_underline_thickness(), + self._MathStyle.DISPLAYSTYLE, toks["num"], toks["den"]) def binom(self, s, loc, toks): - (num, den), = toks - return self._genfrac('(', ')', 0.0, self._MathStyle.TEXTSTYLE, - num, den) + return self._genfrac( + "(", ")", 0, + self._MathStyle.TEXTSTYLE, toks["num"], toks["den"]) def _genset(self, s, loc, toks): - (annotation, body), = toks + annotation = toks["annotation"] + body = toks["body"] thickness = self.get_state().get_current_underline_thickness() annotation.shrink() @@ -2780,7 +2688,8 @@ def _genset(self, s, loc, toks): overset = underset = _genset def sqrt(self, s, loc, toks): - (root, body), = toks + root = toks.get("root") + body = toks["value"] state = self.get_state() thickness = state.get_current_underline_thickness() @@ -2819,7 +2728,7 @@ def sqrt(self, s, loc, toks): return [hlist] def overline(self, s, loc, toks): - (body,), = toks + body = toks["body"] state = self.get_state() thickness = state.get_current_underline_thickness() @@ -2860,6 +2769,5 @@ def _auto_sized_delimiter(self, front, middle, back): return hlist def auto_delim(self, s, loc, toks): - front, middle, back = toks - - return self._auto_sized_delimiter(front, middle.asList(), back) + return self._auto_sized_delimiter( + toks["left"], toks["mid"].asList(), toks["right"]) diff --git a/lib/matplotlib/tests/test_mathtext.py b/lib/matplotlib/tests/test_mathtext.py index 9f196a29216f..abe023495953 100644 --- a/lib/matplotlib/tests/test_mathtext.py +++ b/lib/matplotlib/tests/test_mathtext.py @@ -255,13 +255,12 @@ def test_fontinfo(): (r'$\left($', re.compile(r'Expected ("|\'\\)\\right["\']')), (r'$\dfrac$', r'Expected \dfrac{num}{den}'), (r'$\dfrac{}{}$', r'Expected \dfrac{num}{den}'), - (r'$\overset$', r'Expected \overset{body}{annotation}'), - (r'$\underset$', r'Expected \underset{body}{annotation}'), + (r'$\overset$', r'Expected \overset{annotation}{body}'), + (r'$\underset$', r'Expected \underset{annotation}{body}'), (r'$\foo$', r'Unknown symbol: \foo'), (r'$a^2^2$', r'Double superscript'), (r'$a_2_2$', r'Double subscript'), - (r'$a^2_a^2$', r'Subscript/superscript sequence is too long.'\ - r' Use braces { } to remove ambiguity.'), + (r'$a^2_a^2$', r'Double superscript'), ], ids=[ 'hspace without value',