diff --git a/LICENSE/LICENSE_COURIERTEN b/LICENSE/LICENSE_COURIERTEN
new file mode 100644
index 000000000000..c6d3fd7410a2
--- /dev/null
+++ b/LICENSE/LICENSE_COURIERTEN
@@ -0,0 +1,18 @@
+The Courier10PitchBT-Bold.pfb file is a Type-1 version of
+Courier 10 Pitch BT Bold by Bitstream, obtained from
+. It is included
+here as test data only, but the following license applies.
+
+
+(c) Copyright 1989-1992, Bitstream Inc., Cambridge, MA.
+
+You are hereby granted permission under all Bitstream propriety rights
+to use, copy, modify, sublicense, sell, and redistribute the 4 Bitstream
+Charter (r) Type 1 outline fonts and the 4 Courier Type 1 outline fonts
+for any purpose and without restriction; provided, that this notice is
+left intact on all copies of such fonts and that Bitstream's trademark
+is acknowledged as shown below on all unmodified copies of the 4 Charter
+Type 1 fonts.
+
+BITSTREAM CHARTER is a registered trademark of Bitstream Inc.
+
diff --git a/doc/api/next_api_changes/behavior/20715-JKS.rst b/doc/api/next_api_changes/behavior/20715-JKS.rst
new file mode 100644
index 000000000000..f0ca1d707d3d
--- /dev/null
+++ b/doc/api/next_api_changes/behavior/20715-JKS.rst
@@ -0,0 +1,8 @@
+``Type1Font`` objects include more properties
+---------------------------------------------
+
+The `.type1font.Type1Font.prop` dictionary now includes more keys, such
+as ``CharStrings`` and ``Subrs``. The value of the ``Encoding`` key is
+now a dictionary mapping codes to glyph names. The
+`.type1font.Type1Font.transform` method now correctly removes
+``UniqueID`` properties from the font.
diff --git a/lib/matplotlib/tests/Courier10PitchBT-Bold.pfb b/lib/matplotlib/tests/Courier10PitchBT-Bold.pfb
new file mode 100644
index 000000000000..88d9af2af701
Binary files /dev/null and b/lib/matplotlib/tests/Courier10PitchBT-Bold.pfb differ
diff --git a/lib/matplotlib/tests/test_type1font.py b/lib/matplotlib/tests/test_type1font.py
index 99cc3e500b0e..6a16da10def1
--- a/lib/matplotlib/tests/test_type1font.py
+++ b/lib/matplotlib/tests/test_type1font.py
@@ -1,6 +1,7 @@
 import matplotlib.type1font as t1f
 import os.path
 import difflib
+import pytest
 
 
 def test_Type1Font():
@@ -13,10 +14,35 @@ def test_Type1Font():
     assert font.parts[0] == rawdata[0x0006:0x10c5]
     assert font.parts[1] == rawdata[0x10cb:0x897f]
     assert font.parts[2] == rawdata[0x8985:0x8ba6]
-    assert font.parts[1:] == slanted.parts[1:]
-    assert font.parts[1:] == condensed.parts[1:]
     assert font.decrypted.startswith(b'dup\n/Private 18 dict dup begin')
     assert font.decrypted.endswith(b'mark currentfile closefile\n')
+    assert slanted.decrypted.startswith(b'dup\n/Private 18 dict dup begin')
+    assert slanted.decrypted.endswith(b'mark currentfile closefile\n')
+    assert b'UniqueID 5000793' in font.parts[0]
+    assert b'UniqueID 5000793' in font.decrypted
+    assert font._pos['UniqueID'] == [(797, 818), (4483, 4504)]
+
+    len0 = len(font.parts[0])
+    for key in font._pos.keys():
+        for pos0, pos1 in font._pos[key]:
+            if pos0 < len0:
+                data = font.parts[0][pos0:pos1]
+            else:
+                data = font.decrypted[pos0-len0:pos1-len0]
+            assert data.startswith(f'/{key}'.encode('ascii'))
+    assert {'FontType', 'FontMatrix', 'PaintType', 'ItalicAngle', 'RD'
+            } < set(font._pos.keys())
+
+    assert b'UniqueID 5000793' not in slanted.parts[0]
+    assert b'UniqueID 5000793' not in slanted.decrypted
+    assert 'UniqueID' not in slanted._pos
+    assert font.prop['Weight'] == 'Medium'
+    assert not font.prop['isFixedPitch']
+    assert font.prop['ItalicAngle'] == 0
+    assert slanted.prop['ItalicAngle'] == -45
+    assert font.prop['Encoding'][5] == 'Pi'
+    assert isinstance(font.prop['CharStrings']['Pi'], bytes)
+    assert font._abbr['ND'] == 'ND'
 
     differ = difflib.Differ()
     diff = list(differ.compare(
@@ -24,14 +50,13 @@ def test_Type1Font():
         slanted.parts[0].decode('latin-1').splitlines()))
     for line in (
             # Removes UniqueID
-            '- FontDirectory/CMR10 known{/CMR10 findfont dup/UniqueID known{dup',
-            '+ FontDirectory/CMR10 known{/CMR10 findfont dup',
+            '- /UniqueID 5000793 def',
             # Changes the font name
             '- /FontName /CMR10 def',
-            '+ /FontName /CMR10_Slant_1000 def',
+            '+ /FontName/CMR10_Slant_1000 def',
            # Alters FontMatrix
             '- /FontMatrix [0.001 0 0 0.001 0 0 ]readonly def',
-            '+ /FontMatrix [0.001 0 0.001 0.001 0 0]readonly def',
+            '+ /FontMatrix [0.001 0 0.001 0.001 0 0] readonly def',
             # Alters ItalicAngle
             '- /ItalicAngle 0 def',
             '+ /ItalicAngle -45.0 def'):
@@ -42,17 +67,73 @@ def test_Type1Font():
         condensed.parts[0].decode('latin-1').splitlines()))
     for line in (
             # Removes UniqueID
-            '- FontDirectory/CMR10 known{/CMR10 findfont dup/UniqueID known{dup',
-            '+ FontDirectory/CMR10 known{/CMR10 findfont dup',
+            '- /UniqueID 5000793 def',
             # Changes the font name
             '- /FontName /CMR10 def',
-            '+ /FontName /CMR10_Extend_500 def',
+            '+ /FontName/CMR10_Extend_500 def',
             # Alters FontMatrix
             '- /FontMatrix [0.001 0 0 0.001 0 0 ]readonly def',
-            '+ /FontMatrix [0.0005 0 0 0.001 0 0]readonly def'):
+            '+ /FontMatrix [0.0005 0 0 0.001 0 0] readonly def'):
         assert line in diff, 'diff to condensed font must contain %s' % line
 
 
+def test_Type1Font_2():
+    filename = os.path.join(os.path.dirname(__file__),
+                            'Courier10PitchBT-Bold.pfb')
+    font = t1f.Type1Font(filename)
+    assert font.prop['Weight'] == 'Bold'
+    assert font.prop['isFixedPitch']
+    assert font.prop['Encoding'][65] == 'A'  # the font uses StandardEncoding
+    (pos0, pos1), = font._pos['Encoding']
+    assert font.parts[0][pos0:pos1] == b'/Encoding StandardEncoding'
+    assert font._abbr['ND'] == '|-'
+
+
+def test_tokenize():
+    data = (b'1234/abc false -9.81 Foo <<[0 1 2]<0 1ef a\t>>>\n'
+            b'(string with(nested\t\\) par)ens\\\\)')
+    #         1           2        x      2    xx1
+    # 1 and 2 are matching parens, x means escaped character
+    n, w, num, kw, d = 'name', 'whitespace', 'number', 'keyword', 'delimiter'
+    b, s = 'boolean', 'string'
+    correct = [
+        (num, 1234), (n, 'abc'), (w, ' '), (b, False), (w, ' '), (num, -9.81),
+        (w, ' '), (kw, 'Foo'), (w, ' '), (d, '<<'), (d, '['), (num, 0),
+        (w, ' '), (num, 1), (w, ' '), (num, 2), (d, ']'), (s, b'\x01\xef\xa0'),
+        (d, '>>'), (w, '\n'), (s, 'string with(nested\t) par)ens\\')
+    ]
+    correct_no_ws = [x for x in correct if x[0] != w]
+
+    def convert(tokens):
+        return [(t.kind, t.value()) for t in tokens]
+
+    assert convert(t1f._tokenize(data, False)) == correct
+    assert convert(t1f._tokenize(data, True)) == correct_no_ws
+
+    def bin_after(n):
+        tokens = t1f._tokenize(data, True)
+        result = []
+        for _ in range(n):
+            result.append(next(tokens))
+        result.append(tokens.send(10))
+        return convert(result)
+
+    for n in range(1, len(correct_no_ws)):
+        result = bin_after(n)
+        assert result[:-1] == correct_no_ws[:n]
+        assert result[-1][0] == 'binary'
+        assert isinstance(result[-1][1], bytes)
+
+
+def test_tokenize_errors():
+    with pytest.raises(ValueError):
+        list(t1f._tokenize(b'1234 (this (string) is unterminated\\)', True))
+    with pytest.raises(ValueError):
+        list(t1f._tokenize(b'/Foo<01234', True))
+    with pytest.raises(ValueError):
+        list(t1f._tokenize(b'/Foo<01234abcg>/Bar', True))
+
+
 def test_overprecision():
     # We used to output too many digits in FontMatrix entries and
     # ItalicAngle, which could make Type-1 parsers unhappy.
diff --git a/lib/matplotlib/type1font.py b/lib/matplotlib/type1font.py
index f417c0fc97a4..4c39ea8750b9
--- a/lib/matplotlib/type1font.py
+++ b/lib/matplotlib/type1font.py
@@ -22,10 +22,10 @@
 """
 
 import binascii
-import enum
-import itertools
+import functools
 import logging
 import re
+import string
 import struct
 
 import numpy as np
@@ -35,9 +35,292 @@
 _log = logging.getLogger(__name__)
 
 
-# token types
-_TokenType = enum.Enum('_TokenType',
-                       'whitespace name string delimiter number')
+
+class _Token:
+    """
+    A token in a PostScript stream.
+
+    Attributes
+    ----------
+    pos : int
+        position, i.e. offset from the beginning of the data
+
+    raw : str
+        the raw text of the token
+
+    kind : str
+        description of the token (for debugging or testing)
+    """
+    __slots__ = ('pos', 'raw')
+    kind = '?'
+
+    def __init__(self, pos, raw):
+        _log.debug('type1font._Token %s at %d: %r', self.kind, pos, raw)
+        self.pos = pos
+        self.raw = raw
+
+    def __str__(self):
+        return f"<{self.kind} {self.raw} @{self.pos}>"
+
+    def endpos(self):
+        """Position one past the end of the token"""
+        return self.pos + len(self.raw)
+
+    def is_keyword(self, *names):
+        """Is this a name token with one of the names?"""
+        return False
+
+    def is_slash_name(self):
+        """Is this a name token that starts with a slash?"""
+        return False
+
+    def is_delim(self):
+        """Is this a delimiter token?"""
+        return False
+
+    def is_number(self):
+        """Is this a number token?"""
+        return False
+
+    def value(self):
+        return self.raw
+
+
+class _NameToken(_Token):
+    kind = 'name'
+
+    def is_slash_name(self):
+        return self.raw.startswith('/')
+
+    def value(self):
+        return self.raw[1:]
+
+
+class _BooleanToken(_Token):
+    kind = 'boolean'
+
+    def value(self):
+        return self.raw == 'true'
+
+
+class _KeywordToken(_Token):
+    kind = 'keyword'
+
+    def is_keyword(self, *names):
+        return self.raw in names
+
+
+class _DelimiterToken(_Token):
+    kind = 'delimiter'
+
+    def is_delim(self):
+        return True
+
+    def opposite(self):
+        return {'[': ']', ']': '[',
+                '{': '}', '}': '{',
+                '<<': '>>', '>>': '<<'
+                }[self.raw]
+
+
+class _WhitespaceToken(_Token):
+    kind = 'whitespace'
+
+
+class _StringToken(_Token):
+    kind = 'string'
+    _escapes_re = re.compile(r'\\([\\()nrtbf]|[0-7]{1,3})')
+    _replacements = {'\\': '\\', '(': '(', ')': ')', 'n': '\n',
+                     'r': '\r', 't': '\t', 'b': '\b', 'f': '\f'}
+    _ws_re = re.compile('[\0\t\r\f\n ]')
+
+    @classmethod
+    def _escape(cls, match):
+        group = match.group(1)
+        try:
+            return cls._replacements[group]
+        except KeyError:
+            return chr(int(group, 8))
+
+    @functools.lru_cache()
+    def value(self):
+        if self.raw[0] == '(':
+            return self._escapes_re.sub(self._escape, self.raw[1:-1])
+        else:
+            data = self._ws_re.sub('', self.raw[1:-1])
+            if len(data) % 2 == 1:
+                data += '0'
+            return binascii.unhexlify(data)
+
+
+class _BinaryToken(_Token):
+    kind = 'binary'
+
+    def value(self):
+        return self.raw[1:]
+
+
+class _NumberToken(_Token):
+    kind = 'number'
+
+    def is_number(self):
+        return True
+
+    def value(self):
+        if '.' not in self.raw:
+            return int(self.raw)
+        else:
+            return float(self.raw)
+
+
+def _tokenize(data: bytes, skip_ws: bool):
+    """
+    A generator that produces _Token instances from Type-1 font code.
+
+    The consumer of the generator may send an integer to the tokenizer
+    to indicate that the next token should be a _BinaryToken of the
+    given length.
+
+    Parameters
+    ----------
+    data : bytes
+        The data of the font to tokenize.
+
+    skip_ws : bool
+        If true, the generator will drop any _WhitespaceTokens from the
+        output.
+    """
+    text = data.decode('ascii', 'replace')
+    whitespace_or_comment_re = re.compile(r'[\0\t\r\f\n ]+|%[^\r\n]*')
+    token_re = re.compile(r'/{0,2}[^]\0\t\r\f\n ()<>{}/%[]+')
+    instring_re = re.compile(r'[()\\]')
+    hex_re = re.compile(r'^<[0-9a-fA-F\0\t\r\f\n ]*>$')
+    oct_re = re.compile(r'[0-7]{1,3}')
+    pos = 0
+    next_binary = None
+
+    while pos < len(text):
+        if next_binary is not None:
+            n = next_binary
+            next_binary = (yield _BinaryToken(pos, data[pos:pos+n]))
+            pos += n
+            continue
+        match = whitespace_or_comment_re.match(text, pos)
+        if match:
+            if not skip_ws:
+                next_binary = (yield _WhitespaceToken(pos, match.group()))
+            pos = match.end()
+        elif text[pos] == '(':
+            # PostScript string rules:
+            # - parentheses must be balanced
+            # - backslashes escape backslashes and parens
+            # - also codes \n\r\t\b\f and octal escapes are recognized
+            # - other backslashes do not escape anything
+            start = pos
+            pos += 1
+            depth = 1
+            while depth:
+                match = instring_re.search(text, pos)
+                if match is None:
+                    raise ValueError(
+                        f'Unterminated string starting at {start}')
+                pos = match.end()
+                if match.group() == '(':
+                    depth += 1
+                elif match.group() == ')':
+                    depth -= 1
+                else:  # a backslash
+                    char = text[pos]
+                    if char in r'\()nrtbf':
+                        pos += 1
+                    else:
+                        octal = oct_re.match(text, pos)
+                        if octal:
+                            pos = octal.end()
+                        else:
+                            pass  # non-escaping backslash
+            next_binary = (yield _StringToken(start, text[start:pos]))
+        elif text[pos:pos + 2] in ('<<', '>>'):
+            next_binary = (yield _DelimiterToken(pos, text[pos:pos + 2]))
+            pos += 2
+        elif text[pos] == '<':
+            start = pos
+            try:
+                pos = text.index('>', pos) + 1
+            except ValueError as e:
+                raise ValueError(f'Unterminated hex string starting at {start}'
+                                 ) from e
+            if not hex_re.match(text[start:pos]):
+                raise ValueError(f'Malformed hex string starting at {start}')
+            next_binary = (yield _StringToken(pos, text[start:pos]))
+        else:
+            match = token_re.match(text, pos)
+            if match:
+                raw = match.group()
+                if raw.startswith('/'):
+                    next_binary = (yield _NameToken(pos, raw))
+                elif match.group() in ('true', 'false'):
+                    next_binary = (yield _BooleanToken(pos, raw))
+                else:
+                    try:
+                        float(raw)
+                        next_binary = (yield _NumberToken(pos, raw))
+                    except ValueError:
+                        next_binary = (yield _KeywordToken(pos, raw))
+                pos = match.end()
+            else:
+                next_binary = (yield _DelimiterToken(pos, text[pos]))
+                pos += 1
+
+
+class _BalancedExpression(_Token):
+    pass
+
+
+def _expression(initial, tokens, data):
+    """
+    Consume some number of tokens and return a balanced PostScript expression.
+
+    Parameters
+    ----------
+    initial : _Token
+        the token that triggered parsing a balanced expression
+
+    tokens : iterator of _Token
+        following tokens
+
+    data : bytes
+        underlying data that the token positions point to
+
+    Returns
+    -------
+    _BalancedExpression
+    """
+    delim_stack = []
+    token = initial
+    while True:
+        if token.is_delim():
+            if token.raw in ('[', '{'):
+                delim_stack.append(token)
+            elif token.raw in (']', '}'):
+                if not delim_stack:
+                    raise RuntimeError(f"unmatched closing token {token}")
+                match = delim_stack.pop()
+                if match.raw != token.opposite():
+                    raise RuntimeError(
+                        f"opening token {match} closed by {token}"
+                    )
+                if not delim_stack:
+                    break
+            else:
+                raise RuntimeError(f'unknown delimiter {token}')
+        elif not delim_stack:
+            break
+        token = next(tokens)
+    return _BalancedExpression(
+        initial.pos,
+        data[initial.pos:token.endpos()].decode('ascii', 'replace')
+    )
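
The send-based protocol implemented by _tokenize above is the same one the
bin_after helper in test_tokenize exercises. A minimal sketch of a consumer
follows; the PostScript fragment and the import path are illustrative
assumptions, not part of the patch:

    from matplotlib import type1font as t1f

    # Made-up Type-1 style fragment: "3 RD" announces 3 bytes of binary
    # charstring data, separated from the keyword by one byte.
    data = b'/glyph 3 RD abc ND'
    tokens = t1f._tokenize(data, True)       # True: drop whitespace tokens
    assert next(tokens).value() == 'glyph'   # _NameToken '/glyph'
    nbytes = next(tokens).value()            # _NumberToken -> 3
    assert next(tokens).is_keyword('RD')     # _KeywordToken 'RD'
    # Ask the paused generator for 1+3 bytes: the separator after RD plus
    # the payload.  send() returns a _BinaryToken whose value() strips the
    # leading separator byte.
    assert tokens.send(1 + nbytes).value() == b'abc'
    assert next(tokens).is_keyword('ND')     # normal parsing resumes
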
 
 
 class Type1Font:
@@ -52,9 +335,23 @@ class Type1Font:
     decrypted : bytes
         The decrypted form of parts[1].
     prop : dict[str, Any]
-        A dictionary of font properties.
+        A dictionary of font properties. Noteworthy keys include:
+
+        FontName - PostScript name of the font
+        Encoding - dict from numeric codes to glyph names
+        FontMatrix - bytes object encoding a matrix
+        UniqueID - optional font identifier, dropped when modifying the font
+        CharStrings - dict from glyph names to byte code
+        Subrs - array of byte code subroutines
+        OtherSubrs - bytes object encoding some PostScript code
     """
-    __slots__ = ('parts', 'decrypted', 'prop')
+    __slots__ = ('parts', 'decrypted', 'prop', '_pos', '_abbr')
+    # the _pos dict contains (begin, end) indices to parts[0] + decrypted
+    # so that they can be replaced when transforming the font;
+    # but since sometimes a definition appears in both parts[0] and decrypted,
+    # _pos[name] is an array of such pairs
+    #
+    # _abbr maps three standard abbreviations to their particular names in
+    # this font (e.g. 'RD' is named '-|' in some fonts)
 
     def __init__(self, input):
         """
@@ -74,6 +371,7 @@ def __init__(self, input):
 
         self.parts = self._split(data)
         self.decrypted = self._decrypt(self.parts[1], 'eexec')
+        self._abbr = {'RD': 'RD', 'ND': 'ND', 'NP': 'NP'}
         self._parse()
 
     def _read(self, file):
@@ -144,10 +442,6 @@ def _split(self, data):
 
         return data[:len1], binary, data[idx+1:]
 
-    _whitespace_or_comment_re = re.compile(br'[\0\t\r\014\n ]+|%[^\r\n\v]*')
-    _token_re = re.compile(br'/{0,2}[^]\0\t\r\v\n ()<>{}/%[]+')
-    _instring_re = re.compile(br'[()\\]')
-
     @staticmethod
     def _decrypt(ciphertext, key, ndiscard=4):
        """
@@ -196,101 +490,83 @@ def _encrypt(plaintext, key, ndiscard=4):
         return bytes(ciphertext)
 
-    @classmethod
-    def _tokens(cls, text):
-        """
-        A PostScript tokenizer. Yield (token, value) pairs such as
-        (_TokenType.whitespace, ' ') or (_TokenType.name, '/Foobar').
-        """
-        # Preload enum members for speed.
-        tok_whitespace = _TokenType.whitespace
-        tok_name = _TokenType.name
-        tok_string = _TokenType.string
-        tok_delimiter = _TokenType.delimiter
-        tok_number = _TokenType.number
-        pos = 0
-        while pos < len(text):
-            match = cls._whitespace_or_comment_re.match(text, pos)
-            if match:
-                yield (tok_whitespace, match.group())
-                pos = match.end()
-            elif text[pos:pos+1] == b'(':
-                start = pos
-                pos += 1
-                depth = 1
-                while depth:
-                    match = cls._instring_re.search(text, pos)
-                    if match is None:
-                        return
-                    pos = match.end()
-                    if match.group() == b'(':
-                        depth += 1
-                    elif match.group() == b')':
-                        depth -= 1
-                    else:  # a backslash - skip the next character
-                        pos += 1
-                yield (tok_string, text[start:pos])
-            elif text[pos:pos + 2] in (b'<<', b'>>'):
-                yield (tok_delimiter, text[pos:pos + 2])
-                pos += 2
-            elif text[pos:pos+1] == b'<':
-                start = pos
-                pos = text.index(b'>', pos)
-                yield (tok_string, text[start:pos])
-            else:
-                match = cls._token_re.match(text, pos)
-                if match:
-                    try:
-                        float(match.group())
-                        yield (tok_number, match.group())
-                    except ValueError:
-                        yield (tok_name, match.group())
-                    pos = match.end()
-                else:
-                    yield (tok_delimiter, text[pos:pos + 1])
-                    pos += 1
-
     def _parse(self):
         """
         Find the values of various font properties. This limited kind of
         parsing is described in Chapter 10 "Adobe Type Manager Compatibility"
         of the Type-1 spec.
         """
-        # Preload enum members for speed.
-        tok_whitespace = _TokenType.whitespace
-        tok_name = _TokenType.name
-        tok_string = _TokenType.string
-        tok_number = _TokenType.number
         # Start with reasonable defaults
-        prop = {'weight': 'Regular', 'ItalicAngle': 0.0, 'isFixedPitch': False,
+        prop = {'Weight': 'Regular', 'ItalicAngle': 0.0, 'isFixedPitch': False,
                 'UnderlinePosition': -100, 'UnderlineThickness': 50}
-        filtered = ((token, value)
-                    for token, value in self._tokens(self.parts[0])
-                    if token is not tok_whitespace)
-        # The spec calls this an ASCII format; in Python 2.x we could
-        # just treat the strings and names as opaque bytes but let's
-        # turn them into proper Unicode, and be lenient in case of high bytes.
-        def convert(x): return x.decode('ascii', 'replace')
-        for token, value in filtered:
-            if token is tok_name and value.startswith(b'/'):
-                key = convert(value[1:])
-                token, value = next(filtered)
-                if token is tok_name:
-                    if value in (b'true', b'false'):
-                        value = value == b'true'
-                    else:
-                        value = convert(value.lstrip(b'/'))
-                elif token is tok_string:
-                    value = convert(value.lstrip(b'(').rstrip(b')'))
-                elif token is tok_number:
-                    if b'.' in value:
-                        value = float(value)
-                    else:
-                        value = int(value)
-                else:  # more complicated value such as an array
-                    value = None
-                if key != 'FontInfo' and value is not None:
-                    prop[key] = value
+        pos = {}
+        data = self.parts[0] + self.decrypted
+
+        source = _tokenize(data, True)
+        while True:
+            # See if there is a key to be assigned a value
+            # e.g. /FontName in /FontName /Helvetica def
+            try:
+                token = next(source)
+            except StopIteration:
+                break
+            if token.is_delim():
+                # skip over this - we want top-level keys only
+                _expression(token, source, data)
+            if token.is_slash_name():
+                key = token.value()
+                keypos = token.pos
+            else:
+                continue
+
+            # Some values need special parsing
+            if key in ('Subrs', 'CharStrings', 'Encoding', 'OtherSubrs'):
+                prop[key], endpos = {
+                    'Subrs': self._parse_subrs,
+                    'CharStrings': self._parse_charstrings,
+                    'Encoding': self._parse_encoding,
+                    'OtherSubrs': self._parse_othersubrs
+                }[key](source, data)
+                pos.setdefault(key, []).append((keypos, endpos))
+                continue
+
+            try:
+                token = next(source)
+            except StopIteration:
+                break
+
+            if isinstance(token, _KeywordToken):
+                # constructs like
+                # FontDirectory /Helvetica known {...} {...} ifelse
+                # mean the key was not really a key
+                continue
+
+            if token.is_delim():
+                value = _expression(token, source, data).raw
+            else:
+                value = token.value()
+
+            # look for a 'def' possibly preceded by access modifiers
+            try:
+                kw = next(
+                    kw for kw in source
+                    if not kw.is_keyword('readonly', 'noaccess', 'executeonly')
+                )
+            except StopIteration:
+                break
+
+            # sometimes noaccess def and readonly def are abbreviated
+            if kw.is_keyword('def', self._abbr['ND'], self._abbr['NP']):
+                prop[key] = value
+                pos.setdefault(key, []).append((keypos, kw.endpos()))
+
+            # detect the standard abbreviations
+            if value == '{noaccess def}':
+                self._abbr['ND'] = key
+            elif value == '{noaccess put}':
+                self._abbr['NP'] = key
+            elif value == '{string currentfile exch readstring pop}':
+                self._abbr['RD'] = key
 
         # Fill in the various *Name properties
         if 'FontName' not in prop:
@@ -303,79 +579,119 @@ def convert(x): return x.decode('ascii', 'replace')
         extras = ('(?i)([ -](regular|plain|italic|oblique|(semi)?bold|'
                   '(ultra)?light|extra|condensed))+$')
         prop['FamilyName'] = re.sub(extras, '', prop['FullName'])
+        # Decrypt the encrypted parts
+        ndiscard = prop.get('lenIV', 4)
+        cs = prop['CharStrings']
+        for key, value in cs.items():
+            cs[key] = self._decrypt(value, 'charstring', ndiscard)
+        if 'Subrs' in prop:
+            prop['Subrs'] = [
+                self._decrypt(value, 'charstring', ndiscard)
+                for value in prop['Subrs']
+            ]
 
         self.prop = prop
+        self._pos = pos
 
-    @classmethod
-    def _transformer(cls, tokens, slant, extend):
-        tok_whitespace = _TokenType.whitespace
-        tok_name = _TokenType.name
-
-        def fontname(name):
-            result = name
-            if slant:
-                result += b'_Slant_%d' % int(1000 * slant)
-            if extend != 1.0:
-                result += b'_Extend_%d' % int(1000 * extend)
-            return result
-
-        def italicangle(angle):
-            return b'%a' % round(
-                float(angle) - np.arctan(slant) / np.pi * 180,
-                5
+    def _parse_subrs(self, tokens, _data):
+        count_token = next(tokens)
+        if not count_token.is_number():
+            raise RuntimeError(
+                f"Token following /Subrs must be a number, was {count_token}"
             )
+        count = count_token.value()
+        array = [None] * count
+        next(t for t in tokens if t.is_keyword('array'))
+        for _ in range(count):
+            next(t for t in tokens if t.is_keyword('dup'))
+            index_token = next(tokens)
+            if not index_token.is_number():
+                raise RuntimeError(
+                    "Token following dup in Subrs definition must be a "
+                    f"number, was {index_token}"
+                )
+            nbytes_token = next(tokens)
+            if not nbytes_token.is_number():
+                raise RuntimeError(
+                    "Second token following dup in Subrs definition must "
+                    f"be a number, was {nbytes_token}"
+                )
+            token = next(tokens)
+            if not token.is_keyword(self._abbr['RD']):
+                raise RuntimeError(
+                    f"Token preceding subr must be {self._abbr['RD']}, "
+                    f"was {token}"
+                )
+            binary_token = tokens.send(1+nbytes_token.value())
+            array[index_token.value()] = binary_token.value()
+
+        return array, next(tokens).endpos()
 
-        def fontmatrix(array):
-            array = array.lstrip(b'[').rstrip(b']').split()
-            array = [float(x) for x in array]
-            oldmatrix = np.eye(3, 3)
-            oldmatrix[0:3, 0] = array[::2]
-            oldmatrix[0:3, 1] = array[1::2]
-            modifier = np.array([[extend, 0, 0],
-                                 [slant, 1, 0],
-                                 [0, 0, 1]])
-            newmatrix = np.dot(modifier, oldmatrix)
-            array[::2] = newmatrix[0:3, 0]
-            array[1::2] = newmatrix[0:3, 1]
-            return (
-                '[%s]' % ' '.join(_format_approx(x, 6) for x in array)
-            ).encode('ascii')
-
-        def replace(fun):
-            def replacer(tokens):
-                token, value = next(tokens)  # name, e.g., /FontMatrix
-                yield value
-                token, value = next(tokens)  # possible whitespace
-                while token is tok_whitespace:
-                    yield value
-                    token, value = next(tokens)
-                if value != b'[':  # name/number/etc.
-                    yield fun(value)
-                else:  # array, e.g., [1 2 3]
-                    result = b''
-                    while value != b']':
-                        result += value
-                        token, value = next(tokens)
-                    result += value
-                    yield fun(result)
-            return replacer
-
-        def suppress(tokens):
-            for _ in itertools.takewhile(lambda x: x[1] != b'def', tokens):
-                pass
-            yield b''
-
-        table = {b'/FontName': replace(fontname),
-                 b'/ItalicAngle': replace(italicangle),
-                 b'/FontMatrix': replace(fontmatrix),
-                 b'/UniqueID': suppress}
-
-        for token, value in tokens:
-            if token is tok_name and value in table:
-                yield from table[value](
-                    itertools.chain([(token, value)], tokens))
-            else:
-                yield value
+    @staticmethod
+    def _parse_charstrings(tokens, _data):
+        count_token = next(tokens)
+        if not count_token.is_number():
+            raise RuntimeError(
+                "Token following /CharStrings must be a number, "
+                f"was {count_token}"
+            )
+        count = count_token.value()
+        charstrings = {}
+        next(t for t in tokens if t.is_keyword('begin'))
+        while True:
+            token = next(t for t in tokens
+                         if t.is_keyword('end') or t.is_slash_name())
+            if token.raw == 'end':
+                return charstrings, token.endpos()
+            glyphname = token.value()
+            nbytes_token = next(tokens)
+            if not nbytes_token.is_number():
+                raise RuntimeError(
+                    f"Token following /{glyphname} in CharStrings definition "
+                    f"must be a number, was {nbytes_token}"
+                )
+            next(tokens)  # usually RD or |-
+            binary_token = tokens.send(1+nbytes_token.value())
+            charstrings[glyphname] = binary_token.value()
+
+    @staticmethod
+    def _parse_encoding(tokens, _data):
+        # this only works for encodings that follow the Adobe manual
+        # but some old fonts include non-compliant data - we log a warning
+        # and return a possibly incomplete encoding
+        encoding = {}
+        while True:
+            token = next(t for t in tokens
+                         if t.is_keyword('StandardEncoding', 'dup', 'def'))
+            if token.is_keyword('StandardEncoding'):
+                return _StandardEncoding, token.endpos()
+            if token.is_keyword('def'):
+                return encoding, token.endpos()
+            index_token = next(tokens)
+            if not index_token.is_number():
+                _log.warning(
+                    f"Parsing encoding: expected number, got {index_token}"
+                )
+                continue
+            name_token = next(tokens)
+            if not name_token.is_slash_name():
+                _log.warning(
+                    f"Parsing encoding: expected slash-name, got {name_token}"
+                )
+                continue
+            encoding[index_token.value()] = name_token.value()
+
+    @staticmethod
+    def _parse_othersubrs(tokens, data):
+        init_pos = None
+        while True:
+            token = next(tokens)
+            if init_pos is None:
+                init_pos = token.pos
+            if token.is_delim():
+                _expression(token, tokens, data)
+            elif token.is_keyword('def', 'ND', '|-'):
+                return data[init_pos:token.endpos()], token.endpos()
 
     def transform(self, effects):
         """
@@ -397,8 +713,167 @@ def transform(self, effects):
         -------
         `Type1Font`
         """
-        tokenizer = self._tokens(self.parts[0])
-        transformed = self._transformer(tokenizer,
-                                        slant=effects.get('slant', 0.0),
-                                        extend=effects.get('extend', 1.0))
-        return Type1Font((b"".join(transformed), self.parts[1], self.parts[2]))
+        fontname = self.prop['FontName']
+        italicangle = self.prop['ItalicAngle']
+
+        array = [
+            float(x) for x in (self.prop['FontMatrix']
+                               .lstrip('[').rstrip(']').split())
+        ]
+        oldmatrix = np.eye(3, 3)
+        oldmatrix[0:3, 0] = array[::2]
+        oldmatrix[0:3, 1] = array[1::2]
+        modifier = np.eye(3, 3)
+
+        if 'slant' in effects:
+            slant = effects['slant']
+            fontname += '_Slant_%d' % int(1000 * slant)
+            italicangle = round(
+                float(italicangle) - np.arctan(slant) / np.pi * 180,
+                5
+            )
+            modifier[1, 0] = slant
+
+        if 'extend' in effects:
+            extend = effects['extend']
+            fontname += '_Extend_%d' % int(1000 * extend)
+            modifier[0, 0] = extend
+
+        newmatrix = np.dot(modifier, oldmatrix)
+        array[::2] = newmatrix[0:3, 0]
+        array[1::2] = newmatrix[0:3, 1]
+        fontmatrix = (
+            '[%s]' % ' '.join(_format_approx(x, 6) for x in array)
+        )
+        replacements = (
+            [(x, '/FontName/%s def' % fontname)
+             for x in self._pos['FontName']]
+            + [(x, '/ItalicAngle %a def' % italicangle)
+               for x in self._pos['ItalicAngle']]
+            + [(x, '/FontMatrix %s readonly def' % fontmatrix)
+               for x in self._pos['FontMatrix']]
+            + [(x, '') for x in self._pos.get('UniqueID', [])]
+        )
+
+        data = bytearray(self.parts[0])
+        data.extend(self.decrypted)
+        len0 = len(self.parts[0])
+        for (pos0, pos1), value in sorted(replacements, reverse=True):
+            data[pos0:pos1] = value.encode('ascii', 'replace')
+            if pos0 < len(self.parts[0]):
+                if pos1 >= len(self.parts[0]):
+                    raise RuntimeError(
+                        f"text to be replaced with {value} spans "
+                        "the eexec boundary"
+                    )
+                len0 += len(value) - pos1 + pos0
+
+        data = bytes(data)
+        return Type1Font((
+            data[:len0],
+            self._encrypt(data[len0:], 'eexec'),
+            self.parts[2]
+        ))
+
+
+_StandardEncoding = {
+    **{ord(letter): letter for letter in string.ascii_letters},
+    0: '.notdef',
+    32: 'space',
+    33: 'exclam',
+    34: 'quotedbl',
+    35: 'numbersign',
+    36: 'dollar',
+    37: 'percent',
+    38: 'ampersand',
+    39: 'quoteright',
+    40: 'parenleft',
+    41: 'parenright',
+    42: 'asterisk',
+    43: 'plus',
+    44: 'comma',
+    45: 'hyphen',
+    46: 'period',
+    47: 'slash',
+    48: 'zero',
+    49: 'one',
+    50: 'two',
+    51: 'three',
+    52: 'four',
+    53: 'five',
+    54: 'six',
+    55: 'seven',
+    56: 'eight',
+    57: 'nine',
+    58: 'colon',
+    59: 'semicolon',
+    60: 'less',
+    61: 'equal',
+    62: 'greater',
+    63: 'question',
+    64: 'at',
+    91: 'bracketleft',
+    92: 'backslash',
+    93: 'bracketright',
+    94: 'asciicircum',
+    95: 'underscore',
+    96: 'quoteleft',
+    123: 'braceleft',
+    124: 'bar',
+    125: 'braceright',
+    126: 'asciitilde',
+    161: 'exclamdown',
+    162: 'cent',
+    163: 'sterling',
+    164: 'fraction',
+    165: 'yen',
+    166: 'florin',
+    167: 'section',
+    168: 'currency',
+    169: 'quotesingle',
+    170: 'quotedblleft',
+    171: 'guillemotleft',
+    172: 'guilsinglleft',
+    173: 'guilsinglright',
+    174: 'fi',
+    175: 'fl',
+    177: 'endash',
+    178: 'dagger',
+    179: 'daggerdbl',
+    180: 'periodcentered',
+    182: 'paragraph',
+    183: 'bullet',
+    184: 'quotesinglbase',
+    185: 'quotedblbase',
+    186: 'quotedblright',
+    187: 'guillemotright',
+    188: 'ellipsis',
+    189: 'perthousand',
+    191: 'questiondown',
+    193: 'grave',
+    194: 'acute',
+    195: 'circumflex',
+    196: 'tilde',
+    197: 'macron',
+    198: 'breve',
+    199: 'dotaccent',
+    200: 'dieresis',
+    202: 'ring',
+    203: 'cedilla',
+    205: 'hungarumlaut',
+    206: 'ogonek',
+    207: 'caron',
+    208: 'emdash',
+    225: 'AE',
+    227: 'ordfeminine',
+    232: 'Lslash',
+    233: 'Oslash',
+    234: 'OE',
+    235: 'ordmasculine',
+    241: 'ae',
+    245: 'dotlessi',
+    248: 'lslash',
+    249: 'oslash',
+    250: 'oe',
+    251: 'germandbls',
+}
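
A hedged usage sketch of the rewritten transform method above, matching the
slanted and condensed fonts exercised in test_type1font.py; the cmr10.pfb
path and the filename form of the constructor argument are illustrative
assumptions:

    from matplotlib import type1font as t1f

    font = t1f.Type1Font('cmr10.pfb')        # any Type-1 font file
    # tan(45 deg) = 1.0: ItalicAngle 0 becomes -45.0, the result is
    # renamed <FontName>_Slant_1000, and UniqueID entries are dropped.
    slanted = font.transform({'slant': 1.0})
    # extend 0.5 halves the horizontal FontMatrix terms and renames the
    # result <FontName>_Extend_500.
    condensed = font.transform({'extend': 0.5})
    assert slanted.prop['ItalicAngle'] == -45
    assert 'UniqueID' not in slanted._pos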