Make AFM parser both more compliant and less strict.

anntzer · anntzer · commit 997df25fcc47 · 2019-02-15T17:49:48.000+01:00
See changelog entry.

Also support comma as decimal separator in the floating-point fields, as
it is used in certain real-world files.
diff --git a/doc/api/next_api_changes/2019-02-16-AL.rst b/doc/api/next_api_changes/2019-02-16-AL.rst
@@ -0,0 +1,10 @@
+Changes in AFM parsing
+``````````````````````
+
+In accordance with the AFM spec, the AFM parser no longer truncates the
+``UnderlinePosition`` and ``UnderlineThickness`` fields to integers.
+
+The ``Notice`` field (which can only be publicly accessed by the deprecated
+``afm.parse_afm`` API) is no longer decoded to a `str`, but instead kept as
+`bytes`, to support non-conformant AFM files that use non-ASCII characters in
+that field.
diff --git a/lib/matplotlib/afm.py b/lib/matplotlib/afm.py
@@ -49,17 +49,24 @@
 _log = logging.getLogger(__name__)
 
 
-# some afm files have floats where we are expecting ints -- there is
-# probably a better way to handle this (support floats, round rather
-# than truncate).  But I don't know what the best approach is now and
-# this change to _to_int should at least prevent mpl from crashing on
-# these JDH (2009-11-06)
-
 def _to_int(x):
+    # Some AFM files have floats where we are expecting ints -- there is
+    # probably a better way to handle this (support floats, round rather
+    # than truncate).  But I don't know what the best approach is now and
+    # this change to _to_int should at least prevent mpl from crashing on
+    # these JDH (2009-11-06)
     return int(float(x))
 
 
-_to_float = float
+def _to_float(x):
+    # Some AFM files use "," instead of "." as decimal separator -- this
+    # shouldn't be ambiguous (unless someone is wicked enough to use "," as
+    # thousands separator...).
+    if isinstance(x, bytes):
+        # Encoding doesn't really matter -- if we have codepoints >127 the call
+        # to float() will error anyways.
+        x = x.decode('latin-1')
+    return float(x.replace(',', '.'))
 
 
 def _to_str(x):
@@ -84,18 +91,15 @@ def _to_bool(s):
 
 def _sanity_check(fh):
     """
-    Check if the file at least looks like AFM.
-    If not, raise `RuntimeError`.
+    Check if the file looks like AFM; if it doesn't, raise `RuntimeError`.
     """
-
     # Remember the file position in case the caller wants to
     # do something else with the file.
     pos = fh.tell()
     try:
         line = next(fh)
     finally:
         fh.seek(pos, 0)
-
     # AFM spec, Section 4: The StartFontMetrics keyword [followed by a
     # version number] must be the first line in the file, and the
     # EndFontMetrics keyword must be the last non-empty line in the
@@ -122,7 +126,7 @@ def _parse_header(fh):
       XHeight, Ascender, Descender, StartCharMetrics
 
     """
-    headerConverters = {
+    header_converters = {
         b'StartFontMetrics': _to_float,
         b'FontName': _to_str,
         b'FullName': _to_str,
@@ -131,10 +135,13 @@ def _parse_header(fh):
         b'ItalicAngle': _to_float,
         b'IsFixedPitch': _to_bool,
         b'FontBBox': _to_list_of_ints,
-        b'UnderlinePosition': _to_int,
-        b'UnderlineThickness': _to_int,
+        b'UnderlinePosition': _to_float,
+        b'UnderlineThickness': _to_float,
         b'Version': _to_str,
-        b'Notice': _to_str,
+        # Some AFM files have non-ASCII characters (which are not allowed by
+        # the spec).  Given that there is actually no public API to even access
+        # this field, just return it as straight bytes.
+        b'Notice': lambda x: x,
         b'EncodingScheme': _to_str,
         b'CapHeight': _to_float,  # Is the second version a mistake, or
         b'Capheight': _to_float,  # do some AFM files contain 'Capheight'? -JKS
@@ -162,13 +169,15 @@ def _parse_header(fh):
             val = b''
 
         try:
-            d[key] = headerConverters[key](val)
-        except ValueError:
-            _log.error('Value error parsing header in AFM: %s, %s', key, val)
-            continue
+            converter = header_converters[key]
         except KeyError:
             _log.error('Found an unknown keyword in AFM header (was %r)' % key)
             continue
+        try:
+            d[key] = converter(val)
+        except ValueError:
+            _log.error('Value error parsing header in AFM: %s, %s', key, val)
+            continue
         if key == b'StartCharMetrics':
             return d
     raise RuntimeError('Bad parse')
diff --git a/lib/matplotlib/tests/test_afm.py b/lib/matplotlib/tests/test_afm.py
@@ -4,6 +4,9 @@
 from matplotlib import font_manager as fm
 
 
+# See note in afm.py re: use of comma as decimal separator in the
+# UnderlineThickness field and re: use of non-ASCII characters in the Notice
+# field.
 AFM_TEST_DATA = b"""StartFontMetrics 2.0
 Comment Comments are ignored.
 Comment Creation Date:Mon Nov 13 12:34:11 GMT 2017
@@ -15,9 +18,9 @@
 ItalicAngle 0.0
 IsFixedPitch false
 UnderlinePosition -100
-UnderlineThickness 50
+UnderlineThickness 56,789
 Version 001.000
-Notice Copyright (c) 2017 No one.
+Notice Copyright \xa9 2017 No one.
 FontBBox 0 -321 1234 369
 StartCharMetrics 3
 C 0 ; WX 250 ; N space ; B 0 0 0 0 ;
@@ -51,9 +54,9 @@ def test_parse_header():
         b'ItalicAngle': 0.0,
         b'IsFixedPitch': False,
         b'UnderlinePosition': -100,
-        b'UnderlineThickness': 50,
+        b'UnderlineThickness': 56.789,
         b'Version': '001.000',
-        b'Notice': 'Copyright (c) 2017 No one.',
+        b'Notice': b'Copyright \xa9 2017 No one.',
         b'FontBBox': [0, -321, 1234, 369],
         b'StartCharMetrics': 3,
     }