From 26283851cdce09a30c7a29b559147dc8b58d66bb Mon Sep 17 00:00:00 2001 From: Oscar Gustafsson Date: Sat, 8 Jan 2022 12:26:00 +0100 Subject: [PATCH 1/7] Started deprecating dviread and texmanager --- lib/matplotlib/{dviread.py => _dviread.py} | 0 lib/matplotlib/{texmanager.py => _texmanager.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename lib/matplotlib/{dviread.py => _dviread.py} (100%) rename lib/matplotlib/{texmanager.py => _texmanager.py} (100%) diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/_dviread.py similarity index 100% rename from lib/matplotlib/dviread.py rename to lib/matplotlib/_dviread.py diff --git a/lib/matplotlib/texmanager.py b/lib/matplotlib/_texmanager.py similarity index 100% rename from lib/matplotlib/texmanager.py rename to lib/matplotlib/_texmanager.py From 993a38712a08d9ab0c03a573e48d895fd5055b59 Mon Sep 17 00:00:00 2001 From: Oscar Gustafsson Date: Sat, 8 Jan 2022 12:26:46 +0100 Subject: [PATCH 2/7] Restored dviread --- lib/matplotlib/dviread.py | 1119 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1119 insertions(+) create mode 100644 lib/matplotlib/dviread.py diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py new file mode 100644 index 000000000000..7f90a13f1086 --- /dev/null +++ b/lib/matplotlib/dviread.py @@ -0,0 +1,1119 @@ +""" +A module for reading dvi files output by TeX. Several limitations make +this not (currently) useful as a general-purpose dvi preprocessor, but +it is currently used by the pdf backend for processing usetex text. + +Interface:: + + with Dvi(filename, 72) as dvi: + # iterate over pages: + for page in dvi: + w, h, d = page.width, page.height, page.descent + for x, y, font, glyph, width in page.text: + fontname = font.texname + pointsize = font.size + ... + for x, y, height, width in page.boxes: + ... +""" + +from collections import namedtuple +import enum +from functools import lru_cache, partial, wraps +import logging +import os +from pathlib import Path +import re +import struct +import subprocess +import sys + +import numpy as np + +from matplotlib import _api, cbook + +_log = logging.getLogger(__name__) + +# Many dvi related files are looked for by external processes, require +# additional parsing, and are used many times per rendering, which is why they +# are cached using lru_cache(). + +# Dvi is a bytecode format documented in +# https://ctan.org/pkg/dvitype +# https://texdoc.org/serve/dvitype.pdf/0 +# +# The file consists of a preamble, some number of pages, a postamble, +# and a finale. Different opcodes are allowed in different contexts, +# so the Dvi object has a parser state: +# +# pre: expecting the preamble +# outer: between pages (followed by a page or the postamble, +# also e.g. font definitions are allowed) +# page: processing a page +# post_post: state after the postamble (our current implementation +# just stops reading) +# finale: the finale (unimplemented in our current implementation) + +_dvistate = enum.Enum('DviState', 'pre outer inpage post_post finale') + +# The marks on a page consist of text and boxes. A page also has dimensions. +Page = namedtuple('Page', 'text boxes height width descent') +Text = namedtuple('Text', 'x y font glyph width') +Box = namedtuple('Box', 'x y height width') + + +# Opcode argument parsing +# +# Each of the following functions takes a Dvi object and delta, +# which is the difference between the opcode and the minimum opcode +# with the same meaning. Dvi opcodes often encode the number of +# argument bytes in this delta. + +def _arg_raw(dvi, delta): + """Return *delta* without reading anything more from the dvi file.""" + return delta + + +def _arg(nbytes, signed, dvi, _): + """ + Read *nbytes* bytes, returning the bytes interpreted as a signed integer + if *signed* is true, unsigned otherwise. + """ + return dvi._arg(nbytes, signed) + + +def _arg_slen(dvi, delta): + """ + Read *delta* bytes, returning None if *delta* is zero, and the bytes + interpreted as a signed integer otherwise. + """ + if delta == 0: + return None + return dvi._arg(delta, True) + + +def _arg_slen1(dvi, delta): + """ + Read *delta*+1 bytes, returning the bytes interpreted as signed. + """ + return dvi._arg(delta + 1, True) + + +def _arg_ulen1(dvi, delta): + """ + Read *delta*+1 bytes, returning the bytes interpreted as unsigned. + """ + return dvi._arg(delta + 1, False) + + +def _arg_olen1(dvi, delta): + """ + Read *delta*+1 bytes, returning the bytes interpreted as + unsigned integer for 0<=*delta*<3 and signed if *delta*==3. + """ + return dvi._arg(delta + 1, delta == 3) + + +_arg_mapping = dict(raw=_arg_raw, + u1=partial(_arg, 1, False), + u4=partial(_arg, 4, False), + s4=partial(_arg, 4, True), + slen=_arg_slen, + olen1=_arg_olen1, + slen1=_arg_slen1, + ulen1=_arg_ulen1) + + +def _dispatch(table, min, max=None, state=None, args=('raw',)): + """ + Decorator for dispatch by opcode. Sets the values in *table* + from *min* to *max* to this method, adds a check that the Dvi state + matches *state* if not None, reads arguments from the file according + to *args*. + + Parameters + ---------- + table : dict[int, callable] + The dispatch table to be filled in. + + min, max : int + Range of opcodes that calls the registered function; *max* defaults to + *min*. + + state : _dvistate, optional + State of the Dvi object in which these opcodes are allowed. + + args : list[str], default: ['raw'] + Sequence of argument specifications: + + - 'raw': opcode minus minimum + - 'u1': read one unsigned byte + - 'u4': read four bytes, treat as an unsigned number + - 's4': read four bytes, treat as a signed number + - 'slen': read (opcode - minimum) bytes, treat as signed + - 'slen1': read (opcode - minimum + 1) bytes, treat as signed + - 'ulen1': read (opcode - minimum + 1) bytes, treat as unsigned + - 'olen1': read (opcode - minimum + 1) bytes, treat as unsigned + if under four bytes, signed if four bytes + """ + def decorate(method): + get_args = [_arg_mapping[x] for x in args] + + @wraps(method) + def wrapper(self, byte): + if state is not None and self.state != state: + raise ValueError("state precondition failed") + return method(self, *[f(self, byte-min) for f in get_args]) + if max is None: + table[min] = wrapper + else: + for i in range(min, max+1): + assert table[i] is None + table[i] = wrapper + return wrapper + return decorate + + +class Dvi: + """ + A reader for a dvi ("device-independent") file, as produced by TeX. + + The current implementation can only iterate through pages in order, + and does not even attempt to verify the postamble. + + This class can be used as a context manager to close the underlying + file upon exit. Pages can be read via iteration. Here is an overly + simple way to extract text without trying to detect whitespace:: + + >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi: + ... for page in dvi: + ... print(''.join(chr(t.glyph) for t in page.text)) + """ + # dispatch table + _dtable = [None] * 256 + _dispatch = partial(_dispatch, _dtable) + + def __init__(self, filename, dpi): + """ + Read the data from the file named *filename* and convert + TeX's internal units to units of *dpi* per inch. + *dpi* only sets the units and does not limit the resolution. + Use None to return TeX's internal units. + """ + _log.debug('Dvi: %s', filename) + self.file = open(filename, 'rb') + self.dpi = dpi + self.fonts = {} + self.state = _dvistate.pre + + baseline = _api.deprecated("3.5")(property(lambda self: None)) + + def __enter__(self): + """Context manager enter method, does nothing.""" + return self + + def __exit__(self, etype, evalue, etrace): + """ + Context manager exit method, closes the underlying file if it is open. + """ + self.close() + + def __iter__(self): + """ + Iterate through the pages of the file. + + Yields + ------ + Page + Details of all the text and box objects on the page. + The Page tuple contains lists of Text and Box tuples and + the page dimensions, and the Text and Box tuples contain + coordinates transformed into a standard Cartesian + coordinate system at the dpi value given when initializing. + The coordinates are floating point numbers, but otherwise + precision is not lost and coordinate values are not clipped to + integers. + """ + while self._read(): + yield self._output() + + def close(self): + """Close the underlying file if it is open.""" + if not self.file.closed: + self.file.close() + + def _output(self): + """ + Output the text and boxes belonging to the most recent page. + page = dvi._output() + """ + minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf + maxy_pure = -np.inf + for elt in self.text + self.boxes: + if isinstance(elt, Box): + x, y, h, w = elt + e = 0 # zero depth + else: # glyph + x, y, font, g, w = elt + h, e = font._height_depth_of(g) + minx = min(minx, x) + miny = min(miny, y - h) + maxx = max(maxx, x + w) + maxy = max(maxy, y + e) + maxy_pure = max(maxy_pure, y) + if self._baseline_v is not None: + maxy_pure = self._baseline_v # This should normally be the case. + self._baseline_v = None + + if not self.text and not self.boxes: # Avoid infs/nans from inf+/-inf. + return Page(text=[], boxes=[], width=0, height=0, descent=0) + + if self.dpi is None: + # special case for ease of debugging: output raw dvi coordinates + return Page(text=self.text, boxes=self.boxes, + width=maxx-minx, height=maxy_pure-miny, + descent=maxy-maxy_pure) + + # convert from TeX's "scaled points" to dpi units + d = self.dpi / (72.27 * 2**16) + descent = (maxy - maxy_pure) * d + + text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d) + for (x, y, f, g, w) in self.text] + boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d) + for (x, y, h, w) in self.boxes] + + return Page(text=text, boxes=boxes, width=(maxx-minx)*d, + height=(maxy_pure-miny)*d, descent=descent) + + def _read(self): + """ + Read one page from the file. Return True if successful, + False if there were no more pages. + """ + # Pages appear to start with the sequence + # bop (begin of page) + # xxx comment + # # if using chemformula + # down + # push + # down + # # if using xcolor + # down + # push + # down (possibly multiple) + # push <= here, v is the baseline position. + # etc. + # (dviasm is useful to explore this structure.) + # Thus, we use the vertical position at the first time the stack depth + # reaches 3, while at least three "downs" have been executed (excluding + # those popped out (corresponding to the chemformula preamble)), as the + # baseline (the "down" count is necessary to handle xcolor). + down_stack = [0] + self._baseline_v = None + while True: + byte = self.file.read(1)[0] + self._dtable[byte](self, byte) + name = self._dtable[byte].__name__ + if name == "_push": + down_stack.append(down_stack[-1]) + elif name == "_pop": + down_stack.pop() + elif name == "_down": + down_stack[-1] += 1 + if (self._baseline_v is None + and len(getattr(self, "stack", [])) == 3 + and down_stack[-1] >= 4): + self._baseline_v = self.v + if byte == 140: # end of page + return True + if self.state is _dvistate.post_post: # end of file + self.close() + return False + + def _arg(self, nbytes, signed=False): + """ + Read and return an integer argument *nbytes* long. + Signedness is determined by the *signed* keyword. + """ + buf = self.file.read(nbytes) + value = buf[0] + if signed and value >= 0x80: + value = value - 0x100 + for b in buf[1:]: + value = 0x100*value + b + return value + + @_dispatch(min=0, max=127, state=_dvistate.inpage) + def _set_char_immediate(self, char): + self._put_char_real(char) + self.h += self.fonts[self.f]._width_of(char) + + @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',)) + def _set_char(self, char): + self._put_char_real(char) + self.h += self.fonts[self.f]._width_of(char) + + @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4')) + def _set_rule(self, a, b): + self._put_rule_real(a, b) + self.h += b + + @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',)) + def _put_char(self, char): + self._put_char_real(char) + + def _put_char_real(self, char): + font = self.fonts[self.f] + if font._vf is None: + self.text.append(Text(self.h, self.v, font, char, + font._width_of(char))) + else: + scale = font._scale + for x, y, f, g, w in font._vf[char].text: + newf = DviFont(scale=_mul2012(scale, f._scale), + tfm=f._tfm, texname=f.texname, vf=f._vf) + self.text.append(Text(self.h + _mul2012(x, scale), + self.v + _mul2012(y, scale), + newf, g, newf._width_of(g))) + self.boxes.extend([Box(self.h + _mul2012(x, scale), + self.v + _mul2012(y, scale), + _mul2012(a, scale), _mul2012(b, scale)) + for x, y, a, b in font._vf[char].boxes]) + + @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4')) + def _put_rule(self, a, b): + self._put_rule_real(a, b) + + def _put_rule_real(self, a, b): + if a > 0 and b > 0: + self.boxes.append(Box(self.h, self.v, a, b)) + + @_dispatch(138) + def _nop(self, _): + pass + + @_dispatch(139, state=_dvistate.outer, args=('s4',)*11) + def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p): + self.state = _dvistate.inpage + self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0 + self.stack = [] + self.text = [] # list of Text objects + self.boxes = [] # list of Box objects + + @_dispatch(140, state=_dvistate.inpage) + def _eop(self, _): + self.state = _dvistate.outer + del self.h, self.v, self.w, self.x, self.y, self.z, self.stack + + @_dispatch(141, state=_dvistate.inpage) + def _push(self, _): + self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z)) + + @_dispatch(142, state=_dvistate.inpage) + def _pop(self, _): + self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop() + + @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',)) + def _right(self, b): + self.h += b + + @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',)) + def _right_w(self, new_w): + if new_w is not None: + self.w = new_w + self.h += self.w + + @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',)) + def _right_x(self, new_x): + if new_x is not None: + self.x = new_x + self.h += self.x + + @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',)) + def _down(self, a): + self.v += a + + @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',)) + def _down_y(self, new_y): + if new_y is not None: + self.y = new_y + self.v += self.y + + @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',)) + def _down_z(self, new_z): + if new_z is not None: + self.z = new_z + self.v += self.z + + @_dispatch(min=171, max=234, state=_dvistate.inpage) + def _fnt_num_immediate(self, k): + self.f = k + + @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',)) + def _fnt_num(self, new_f): + self.f = new_f + + @_dispatch(min=239, max=242, args=('ulen1',)) + def _xxx(self, datalen): + special = self.file.read(datalen) + _log.debug( + 'Dvi._xxx: encountered special: %s', + ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch + for ch in special])) + + @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1')) + def _fnt_def(self, k, c, s, d, a, l): + self._fnt_def_real(k, c, s, d, a, l) + + def _fnt_def_real(self, k, c, s, d, a, l): + n = self.file.read(a + l) + fontname = n[-l:].decode('ascii') + tfm = _tfmfile(fontname) + if c != 0 and tfm.checksum != 0 and c != tfm.checksum: + raise ValueError('tfm checksum mismatch: %s' % n) + try: + vf = _vffile(fontname) + except FileNotFoundError: + vf = None + self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf) + + @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1')) + def _pre(self, i, num, den, mag, k): + self.file.read(k) # comment in the dvi file + if i != 2: + raise ValueError("Unknown dvi format %d" % i) + if num != 25400000 or den != 7227 * 2**16: + raise ValueError("Nonstandard units in dvi file") + # meaning: TeX always uses those exact values, so it + # should be enough for us to support those + # (There are 72.27 pt to an inch so 7227 pt = + # 7227 * 2**16 sp to 100 in. The numerator is multiplied + # by 10^5 to get units of 10**-7 meters.) + if mag != 1000: + raise ValueError("Nonstandard magnification in dvi file") + # meaning: LaTeX seems to frown on setting \mag, so + # I think we can assume this is constant + self.state = _dvistate.outer + + @_dispatch(248, state=_dvistate.outer) + def _post(self, _): + self.state = _dvistate.post_post + # TODO: actually read the postamble and finale? + # currently post_post just triggers closing the file + + @_dispatch(249) + def _post_post(self, _): + raise NotImplementedError + + @_dispatch(min=250, max=255) + def _malformed(self, offset): + raise ValueError(f"unknown command: byte {250 + offset}") + + +class DviFont: + """ + Encapsulation of a font that a DVI file can refer to. + + This class holds a font's texname and size, supports comparison, + and knows the widths of glyphs in the same units as the AFM file. + There are also internal attributes (for use by dviread.py) that + are *not* used for comparison. + + The size is in Adobe points (converted from TeX points). + + Parameters + ---------- + scale : float + Factor by which the font is scaled from its natural size. + tfm : Tfm + TeX font metrics for this font + texname : bytes + Name of the font as used internally by TeX and friends, as an ASCII + bytestring. This is usually very different from any external font + names; `PsfontsMap` can be used to find the external name of the font. + vf : Vf + A TeX "virtual font" file, or None if this font is not virtual. + + Attributes + ---------- + texname : bytes + size : float + Size of the font in Adobe points, converted from the slightly + smaller TeX points. + widths : list + Widths of glyphs in glyph-space units, typically 1/1000ths of + the point size. + + """ + __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm') + + def __init__(self, scale, tfm, texname, vf): + _api.check_isinstance(bytes, texname=texname) + self._scale = scale + self._tfm = tfm + self.texname = texname + self._vf = vf + self.size = scale * (72.0 / (72.27 * 2**16)) + try: + nchars = max(tfm.width) + 1 + except ValueError: + nchars = 0 + self.widths = [(1000*tfm.width.get(char, 0)) >> 20 + for char in range(nchars)] + + def __eq__(self, other): + return (type(self) == type(other) + and self.texname == other.texname and self.size == other.size) + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + return "<{}: {}>".format(type(self).__name__, self.texname) + + def _width_of(self, char): + """Width of char in dvi units.""" + width = self._tfm.width.get(char, None) + if width is not None: + return _mul2012(width, self._scale) + _log.debug('No width for char %d in font %s.', char, self.texname) + return 0 + + def _height_depth_of(self, char): + """Height and depth of char in dvi units.""" + result = [] + for metric, name in ((self._tfm.height, "height"), + (self._tfm.depth, "depth")): + value = metric.get(char, None) + if value is None: + _log.debug('No %s for char %d in font %s', + name, char, self.texname) + result.append(0) + else: + result.append(_mul2012(value, self._scale)) + # cmsyXX (symbols font) glyph 0 ("minus") has a nonzero descent + # so that TeX aligns equations properly + # (https://tex.stackexchange.com/q/526103/) + # but we actually care about the rasterization depth to align + # the dvipng-generated images. + if re.match(br'^cmsy\d+$', self.texname) and char == 0: + result[-1] = 0 + return result + + +class Vf(Dvi): + r""" + A virtual font (\*.vf file) containing subroutines for dvi files. + + Parameters + ---------- + filename : str or path-like + + Notes + ----- + The virtual font format is a derivative of dvi: + http://mirrors.ctan.org/info/knuth/virtual-fonts + This class reuses some of the machinery of `Dvi` + but replaces the `_read` loop and dispatch mechanism. + + Examples + -------- + :: + + vf = Vf(filename) + glyph = vf[code] + glyph.text, glyph.boxes, glyph.width + """ + + def __init__(self, filename): + super().__init__(filename, 0) + try: + self._first_font = None + self._chars = {} + self._read() + finally: + self.close() + + def __getitem__(self, code): + return self._chars[code] + + def _read(self): + """ + Read one page from the file. Return True if successful, + False if there were no more pages. + """ + packet_char, packet_ends = None, None + packet_len, packet_width = None, None + while True: + byte = self.file.read(1)[0] + # If we are in a packet, execute the dvi instructions + if self.state is _dvistate.inpage: + byte_at = self.file.tell()-1 + if byte_at == packet_ends: + self._finalize_packet(packet_char, packet_width) + packet_len, packet_char, packet_width = None, None, None + # fall through to out-of-packet code + elif byte_at > packet_ends: + raise ValueError("Packet length mismatch in vf file") + else: + if byte in (139, 140) or byte >= 243: + raise ValueError( + "Inappropriate opcode %d in vf file" % byte) + Dvi._dtable[byte](self, byte) + continue + + # We are outside a packet + if byte < 242: # a short packet (length given by byte) + packet_len = byte + packet_char, packet_width = self._arg(1), self._arg(3) + packet_ends = self._init_packet(byte) + self.state = _dvistate.inpage + elif byte == 242: # a long packet + packet_len, packet_char, packet_width = \ + [self._arg(x) for x in (4, 4, 4)] + self._init_packet(packet_len) + elif 243 <= byte <= 246: + k = self._arg(byte - 242, byte == 246) + c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)] + self._fnt_def_real(k, c, s, d, a, l) + if self._first_font is None: + self._first_font = k + elif byte == 247: # preamble + i, k = self._arg(1), self._arg(1) + x = self.file.read(k) + cs, ds = self._arg(4), self._arg(4) + self._pre(i, x, cs, ds) + elif byte == 248: # postamble (just some number of 248s) + break + else: + raise ValueError("Unknown vf opcode %d" % byte) + + def _init_packet(self, pl): + if self.state != _dvistate.outer: + raise ValueError("Misplaced packet in vf file") + self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0 + self.stack, self.text, self.boxes = [], [], [] + self.f = self._first_font + return self.file.tell() + pl + + def _finalize_packet(self, packet_char, packet_width): + self._chars[packet_char] = Page( + text=self.text, boxes=self.boxes, width=packet_width, + height=None, descent=None) + self.state = _dvistate.outer + + def _pre(self, i, x, cs, ds): + if self.state is not _dvistate.pre: + raise ValueError("pre command in middle of vf file") + if i != 202: + raise ValueError("Unknown vf format %d" % i) + if len(x): + _log.debug('vf file comment: %s', x) + self.state = _dvistate.outer + # cs = checksum, ds = design size + + +def _mul2012(num1, num2): + """Multiply two numbers in 20.12 fixed point format.""" + # Separated into a function because >> has surprising precedence + return (num1*num2) >> 20 + + +class Tfm: + """ + A TeX Font Metric file. + + This implementation covers only the bare minimum needed by the Dvi class. + + Parameters + ---------- + filename : str or path-like + + Attributes + ---------- + checksum : int + Used for verifying against the dvi file. + design_size : int + Design size of the font (unknown units) + width, height, depth : dict + Dimensions of each character, need to be scaled by the factor + specified in the dvi file. These are dicts because indexing may + not start from 0. + """ + __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth') + + def __init__(self, filename): + _log.debug('opening tfm file %s', filename) + with open(filename, 'rb') as file: + header1 = file.read(24) + lh, bc, ec, nw, nh, nd = struct.unpack('!6H', header1[2:14]) + _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d', + lh, bc, ec, nw, nh, nd) + header2 = file.read(4*lh) + self.checksum, self.design_size = struct.unpack('!2I', header2[:8]) + # there is also encoding information etc. + char_info = file.read(4*(ec-bc+1)) + widths = struct.unpack(f'!{nw}i', file.read(4*nw)) + heights = struct.unpack(f'!{nh}i', file.read(4*nh)) + depths = struct.unpack(f'!{nd}i', file.read(4*nd)) + self.width, self.height, self.depth = {}, {}, {} + for idx, char in enumerate(range(bc, ec+1)): + byte0 = char_info[4*idx] + byte1 = char_info[4*idx+1] + self.width[char] = widths[byte0] + self.height[char] = heights[byte1 >> 4] + self.depth[char] = depths[byte1 & 0xf] + + +PsFont = namedtuple('PsFont', 'texname psname effects encoding filename') + + +class PsfontsMap: + """ + A psfonts.map formatted file, mapping TeX fonts to PS fonts. + + Parameters + ---------- + filename : str or path-like + + Notes + ----- + For historical reasons, TeX knows many Type-1 fonts by different + names than the outside world. (For one thing, the names have to + fit in eight characters.) Also, TeX's native fonts are not Type-1 + but Metafont, which is nontrivial to convert to PostScript except + as a bitmap. While high-quality conversions to Type-1 format exist + and are shipped with modern TeX distributions, we need to know + which Type-1 fonts are the counterparts of which native fonts. For + these reasons a mapping is needed from internal font names to font + file names. + + A texmf tree typically includes mapping files called e.g. + :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`. + The file :file:`psfonts.map` is used by :program:`dvips`, + :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map` + by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding + the 35 PostScript fonts (i.e., have no filename for them, as in + the Times-Bold example above), while the pdf-related files perhaps + only avoid the "Base 14" pdf fonts. But the user may have + configured these files differently. + + Examples + -------- + >>> map = PsfontsMap(find_tex_file('pdftex.map')) + >>> entry = map[b'ptmbo8r'] + >>> entry.texname + b'ptmbo8r' + >>> entry.psname + b'Times-Bold' + >>> entry.encoding + '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc' + >>> entry.effects + {'slant': 0.16700000000000001} + >>> entry.filename + """ + __slots__ = ('_filename', '_unparsed', '_parsed') + + # Create a filename -> PsfontsMap cache, so that calling + # `PsfontsMap(filename)` with the same filename a second time immediately + # returns the same object. + @lru_cache() + def __new__(cls, filename): + self = object.__new__(cls) + self._filename = os.fsdecode(filename) + # Some TeX distributions have enormous pdftex.map files which would + # take hundreds of milliseconds to parse, but it is easy enough to just + # store the unparsed lines (keyed by the first word, which is the + # texname) and parse them on-demand. + with open(filename, 'rb') as file: + self._unparsed = {} + for line in file: + tfmname = line.split(b' ', 1)[0] + self._unparsed.setdefault(tfmname, []).append(line) + self._parsed = {} + return self + + def __getitem__(self, texname): + assert isinstance(texname, bytes) + if texname in self._unparsed: + for line in self._unparsed.pop(texname): + if self._parse_and_cache_line(line): + break + try: + return self._parsed[texname] + except KeyError: + raise LookupError( + f"An associated PostScript font (required by Matplotlib) " + f"could not be found for TeX font {texname.decode('ascii')!r} " + f"in {self._filename!r}; this problem can often be solved by " + f"installing a suitable PostScript font package in your TeX " + f"package manager") from None + + def _parse_and_cache_line(self, line): + """ + Parse a line in the font mapping file. + + The format is (partially) documented at + http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf + https://tug.org/texinfohtml/dvips.html#psfonts_002emap + Each line can have the following fields: + + - tfmname (first, only required field), + - psname (defaults to tfmname, must come immediately after tfmname if + present), + - fontflags (integer, must come immediately after psname if present, + ignored by us), + - special (SlantFont and ExtendFont, only field that is double-quoted), + - fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always + precedes a font, <[ always precedes an encoding, < can precede either + but then an encoding file must have extension .enc; < and << also + request different font subsetting behaviors but we ignore that; < can + be separated from the filename by whitespace). + + special, fontfile, and encodingfile can appear in any order. + """ + # If the map file specifies multiple encodings for a font, we + # follow pdfTeX in choosing the last one specified. Such + # entries are probably mistakes but they have occurred. + # https://tex.stackexchange.com/q/10826/ + + if not line or line.startswith((b" ", b"%", b"*", b";", b"#")): + return + tfmname = basename = special = encodingfile = fontfile = None + is_subsetted = is_t1 = is_truetype = False + matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line) + for match in matches: + quoted, unquoted = match.groups() + if unquoted: + if unquoted.startswith(b"<<"): # font + fontfile = unquoted[2:] + elif unquoted.startswith(b"<["): # encoding + encodingfile = unquoted[2:] + elif unquoted.startswith(b"<"): # font or encoding + word = ( + # foo + unquoted[1:] + # < by itself => read the next word + or next(filter(None, next(matches).groups()))) + if word.endswith(b".enc"): + encodingfile = word + else: + fontfile = word + is_subsetted = True + elif tfmname is None: + tfmname = unquoted + elif basename is None: + basename = unquoted + elif quoted: + special = quoted + effects = {} + if special: + words = reversed(special.split()) + for word in words: + if word == b"SlantFont": + effects["slant"] = float(next(words)) + elif word == b"ExtendFont": + effects["extend"] = float(next(words)) + + # Verify some properties of the line that would cause it to be ignored + # otherwise. + if fontfile is not None: + if fontfile.endswith((b".ttf", b".ttc")): + is_truetype = True + elif not fontfile.endswith(b".otf"): + is_t1 = True + elif basename is not None: + is_t1 = True + if is_truetype and is_subsetted and encodingfile is None: + return + if not is_t1 and ("slant" in effects or "extend" in effects): + return + if abs(effects.get("slant", 0)) > 1: + return + if abs(effects.get("extend", 0)) > 2: + return + + if basename is None: + basename = tfmname + if encodingfile is not None: + encodingfile = _find_tex_file(encodingfile) + if fontfile is not None: + fontfile = _find_tex_file(fontfile) + self._parsed[tfmname] = PsFont( + texname=tfmname, psname=basename, effects=effects, + encoding=encodingfile, filename=fontfile) + return True + + +def _parse_enc(path): + r""" + Parse a \*.enc file referenced from a psfonts.map style file. + + The format supported by this function is a tiny subset of PostScript. + + Parameters + ---------- + path : os.PathLike + + Returns + ------- + list + The nth entry of the list is the PostScript glyph name of the nth + glyph. + """ + no_comments = re.sub("%.*", "", Path(path).read_text(encoding="ascii")) + array = re.search(r"(?s)\[(.*)\]", no_comments).group(1) + lines = [line for line in array.split() if line] + if all(line.startswith("/") for line in lines): + return [line[1:] for line in lines] + else: + raise ValueError( + "Failed to parse {} as Postscript encoding".format(path)) + + +class _LuatexKpsewhich: + @lru_cache() # A singleton. + def __new__(cls): + self = object.__new__(cls) + self._proc = self._new_proc() + return self + + def _new_proc(self): + return subprocess.Popen( + ["luatex", "--luaonly", + str(cbook._get_data_path("kpsewhich.lua"))], + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + + def search(self, filename): + if self._proc.poll() is not None: # Dead, restart it. + self._proc = self._new_proc() + self._proc.stdin.write(os.fsencode(filename) + b"\n") + self._proc.stdin.flush() + out = self._proc.stdout.readline().rstrip() + return None if out == b"nil" else os.fsdecode(out) + + +@lru_cache() +@_api.delete_parameter("3.5", "format") +def _find_tex_file(filename, format=None): + """ + Find a file in the texmf tree using kpathsea_. + + The kpathsea library, provided by most existing TeX distributions, both + on Unix-like systems and on Windows (MikTeX), is invoked via a long-lived + luatex process if luatex is installed, or via kpsewhich otherwise. + + .. _kpathsea: https://www.tug.org/kpathsea/ + + Parameters + ---------- + filename : str or path-like + format : str or bytes + Used as the value of the ``--format`` option to :program:`kpsewhich`. + Could be e.g. 'tfm' or 'vf' to limit the search to that type of files. + Deprecated. + + Raises + ------ + FileNotFoundError + If the file is not found. + """ + + # we expect these to always be ascii encoded, but use utf-8 + # out of caution + if isinstance(filename, bytes): + filename = filename.decode('utf-8', errors='replace') + if isinstance(format, bytes): + format = format.decode('utf-8', errors='replace') + + try: + lk = _LuatexKpsewhich() + except FileNotFoundError: + lk = None # Fallback to directly calling kpsewhich, as below. + + if lk and format is None: + path = lk.search(filename) + + else: + if os.name == 'nt': + # On Windows only, kpathsea can use utf-8 for cmd args and output. + # The `command_line_encoding` environment variable is set to force + # it to always use utf-8 encoding. See Matplotlib issue #11848. + kwargs = {'env': {**os.environ, 'command_line_encoding': 'utf-8'}, + 'encoding': 'utf-8'} + else: # On POSIX, run through the equivalent of os.fsdecode(). + kwargs = {'encoding': sys.getfilesystemencoding(), + 'errors': 'surrogateescape'} + + cmd = ['kpsewhich'] + if format is not None: + cmd += ['--format=' + format] + cmd += [filename] + try: + path = (cbook._check_and_log_subprocess(cmd, _log, **kwargs) + .rstrip('\n')) + except (FileNotFoundError, RuntimeError): + path = None + + if path: + return path + else: + raise FileNotFoundError( + f"Matplotlib's TeX implementation searched for a file named " + f"{filename!r} in your texmf tree, but could not find it") + + +# After the deprecation period elapses, delete this shim and rename +# _find_tex_file to find_tex_file everywhere. +@_api.delete_parameter("3.5", "format") +def find_tex_file(filename, format=None): + try: + return (_find_tex_file(filename, format) if format is not None else + _find_tex_file(filename)) + except FileNotFoundError as exc: + _api.warn_deprecated( + "3.6", message=f"{exc.args[0]}; in the future, this will raise a " + f"FileNotFoundError.") + return "" + + +find_tex_file.__doc__ = _find_tex_file.__doc__ + + +@lru_cache() +def _fontfile(cls, suffix, texname): + return cls(_find_tex_file(texname + suffix)) + + +_tfmfile = partial(_fontfile, Tfm, ".tfm") +_vffile = partial(_fontfile, Vf, ".vf") + + +if __name__ == '__main__': + from argparse import ArgumentParser + import itertools + + parser = ArgumentParser() + parser.add_argument("filename") + parser.add_argument("dpi", nargs="?", type=float, default=None) + args = parser.parse_args() + with Dvi(args.filename, args.dpi) as dvi: + fontmap = PsfontsMap(_find_tex_file('pdftex.map')) + for page in dvi: + print(f"=== new page === " + f"(w: {page.width}, h: {page.height}, d: {page.descent})") + for font, group in itertools.groupby( + page.text, lambda text: text.font): + print(f"font: {font.texname.decode('latin-1')!r}\t" + f"scale: {font._scale / 2 ** 20}") + print("x", "y", "glyph", "chr", "w", "(glyphs)", sep="\t") + for text in group: + print(text.x, text.y, text.glyph, + chr(text.glyph) if chr(text.glyph).isprintable() + else ".", + text.width, sep="\t") + if page.boxes: + print("x", "y", "w", "h", "", "(boxes)", sep="\t") + for x, y, w, h in page.boxes: + print(x, y, w, h, sep="\t") From 98962d0a7b0661b8cea1b1a238ca6d1e3e78567a Mon Sep 17 00:00:00 2001 From: Oscar Gustafsson Date: Sat, 8 Jan 2022 12:27:29 +0100 Subject: [PATCH 3/7] Created base for _dviread_vf.py --- lib/matplotlib/{_dviread.py => _dviread_vf.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lib/matplotlib/{_dviread.py => _dviread_vf.py} (100%) diff --git a/lib/matplotlib/_dviread.py b/lib/matplotlib/_dviread_vf.py similarity index 100% rename from lib/matplotlib/_dviread.py rename to lib/matplotlib/_dviread_vf.py From b65e9dcd9e71032111824dd4717b02a3bbacf248 Mon Sep 17 00:00:00 2001 From: Oscar Gustafsson Date: Sat, 8 Jan 2022 12:27:52 +0100 Subject: [PATCH 4/7] Restored _dviread.py --- lib/matplotlib/_dviread.py | 1119 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1119 insertions(+) create mode 100644 lib/matplotlib/_dviread.py diff --git a/lib/matplotlib/_dviread.py b/lib/matplotlib/_dviread.py new file mode 100644 index 000000000000..7f90a13f1086 --- /dev/null +++ b/lib/matplotlib/_dviread.py @@ -0,0 +1,1119 @@ +""" +A module for reading dvi files output by TeX. Several limitations make +this not (currently) useful as a general-purpose dvi preprocessor, but +it is currently used by the pdf backend for processing usetex text. + +Interface:: + + with Dvi(filename, 72) as dvi: + # iterate over pages: + for page in dvi: + w, h, d = page.width, page.height, page.descent + for x, y, font, glyph, width in page.text: + fontname = font.texname + pointsize = font.size + ... + for x, y, height, width in page.boxes: + ... +""" + +from collections import namedtuple +import enum +from functools import lru_cache, partial, wraps +import logging +import os +from pathlib import Path +import re +import struct +import subprocess +import sys + +import numpy as np + +from matplotlib import _api, cbook + +_log = logging.getLogger(__name__) + +# Many dvi related files are looked for by external processes, require +# additional parsing, and are used many times per rendering, which is why they +# are cached using lru_cache(). + +# Dvi is a bytecode format documented in +# https://ctan.org/pkg/dvitype +# https://texdoc.org/serve/dvitype.pdf/0 +# +# The file consists of a preamble, some number of pages, a postamble, +# and a finale. Different opcodes are allowed in different contexts, +# so the Dvi object has a parser state: +# +# pre: expecting the preamble +# outer: between pages (followed by a page or the postamble, +# also e.g. font definitions are allowed) +# page: processing a page +# post_post: state after the postamble (our current implementation +# just stops reading) +# finale: the finale (unimplemented in our current implementation) + +_dvistate = enum.Enum('DviState', 'pre outer inpage post_post finale') + +# The marks on a page consist of text and boxes. A page also has dimensions. +Page = namedtuple('Page', 'text boxes height width descent') +Text = namedtuple('Text', 'x y font glyph width') +Box = namedtuple('Box', 'x y height width') + + +# Opcode argument parsing +# +# Each of the following functions takes a Dvi object and delta, +# which is the difference between the opcode and the minimum opcode +# with the same meaning. Dvi opcodes often encode the number of +# argument bytes in this delta. + +def _arg_raw(dvi, delta): + """Return *delta* without reading anything more from the dvi file.""" + return delta + + +def _arg(nbytes, signed, dvi, _): + """ + Read *nbytes* bytes, returning the bytes interpreted as a signed integer + if *signed* is true, unsigned otherwise. + """ + return dvi._arg(nbytes, signed) + + +def _arg_slen(dvi, delta): + """ + Read *delta* bytes, returning None if *delta* is zero, and the bytes + interpreted as a signed integer otherwise. + """ + if delta == 0: + return None + return dvi._arg(delta, True) + + +def _arg_slen1(dvi, delta): + """ + Read *delta*+1 bytes, returning the bytes interpreted as signed. + """ + return dvi._arg(delta + 1, True) + + +def _arg_ulen1(dvi, delta): + """ + Read *delta*+1 bytes, returning the bytes interpreted as unsigned. + """ + return dvi._arg(delta + 1, False) + + +def _arg_olen1(dvi, delta): + """ + Read *delta*+1 bytes, returning the bytes interpreted as + unsigned integer for 0<=*delta*<3 and signed if *delta*==3. + """ + return dvi._arg(delta + 1, delta == 3) + + +_arg_mapping = dict(raw=_arg_raw, + u1=partial(_arg, 1, False), + u4=partial(_arg, 4, False), + s4=partial(_arg, 4, True), + slen=_arg_slen, + olen1=_arg_olen1, + slen1=_arg_slen1, + ulen1=_arg_ulen1) + + +def _dispatch(table, min, max=None, state=None, args=('raw',)): + """ + Decorator for dispatch by opcode. Sets the values in *table* + from *min* to *max* to this method, adds a check that the Dvi state + matches *state* if not None, reads arguments from the file according + to *args*. + + Parameters + ---------- + table : dict[int, callable] + The dispatch table to be filled in. + + min, max : int + Range of opcodes that calls the registered function; *max* defaults to + *min*. + + state : _dvistate, optional + State of the Dvi object in which these opcodes are allowed. + + args : list[str], default: ['raw'] + Sequence of argument specifications: + + - 'raw': opcode minus minimum + - 'u1': read one unsigned byte + - 'u4': read four bytes, treat as an unsigned number + - 's4': read four bytes, treat as a signed number + - 'slen': read (opcode - minimum) bytes, treat as signed + - 'slen1': read (opcode - minimum + 1) bytes, treat as signed + - 'ulen1': read (opcode - minimum + 1) bytes, treat as unsigned + - 'olen1': read (opcode - minimum + 1) bytes, treat as unsigned + if under four bytes, signed if four bytes + """ + def decorate(method): + get_args = [_arg_mapping[x] for x in args] + + @wraps(method) + def wrapper(self, byte): + if state is not None and self.state != state: + raise ValueError("state precondition failed") + return method(self, *[f(self, byte-min) for f in get_args]) + if max is None: + table[min] = wrapper + else: + for i in range(min, max+1): + assert table[i] is None + table[i] = wrapper + return wrapper + return decorate + + +class Dvi: + """ + A reader for a dvi ("device-independent") file, as produced by TeX. + + The current implementation can only iterate through pages in order, + and does not even attempt to verify the postamble. + + This class can be used as a context manager to close the underlying + file upon exit. Pages can be read via iteration. Here is an overly + simple way to extract text without trying to detect whitespace:: + + >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi: + ... for page in dvi: + ... print(''.join(chr(t.glyph) for t in page.text)) + """ + # dispatch table + _dtable = [None] * 256 + _dispatch = partial(_dispatch, _dtable) + + def __init__(self, filename, dpi): + """ + Read the data from the file named *filename* and convert + TeX's internal units to units of *dpi* per inch. + *dpi* only sets the units and does not limit the resolution. + Use None to return TeX's internal units. + """ + _log.debug('Dvi: %s', filename) + self.file = open(filename, 'rb') + self.dpi = dpi + self.fonts = {} + self.state = _dvistate.pre + + baseline = _api.deprecated("3.5")(property(lambda self: None)) + + def __enter__(self): + """Context manager enter method, does nothing.""" + return self + + def __exit__(self, etype, evalue, etrace): + """ + Context manager exit method, closes the underlying file if it is open. + """ + self.close() + + def __iter__(self): + """ + Iterate through the pages of the file. + + Yields + ------ + Page + Details of all the text and box objects on the page. + The Page tuple contains lists of Text and Box tuples and + the page dimensions, and the Text and Box tuples contain + coordinates transformed into a standard Cartesian + coordinate system at the dpi value given when initializing. + The coordinates are floating point numbers, but otherwise + precision is not lost and coordinate values are not clipped to + integers. + """ + while self._read(): + yield self._output() + + def close(self): + """Close the underlying file if it is open.""" + if not self.file.closed: + self.file.close() + + def _output(self): + """ + Output the text and boxes belonging to the most recent page. + page = dvi._output() + """ + minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf + maxy_pure = -np.inf + for elt in self.text + self.boxes: + if isinstance(elt, Box): + x, y, h, w = elt + e = 0 # zero depth + else: # glyph + x, y, font, g, w = elt + h, e = font._height_depth_of(g) + minx = min(minx, x) + miny = min(miny, y - h) + maxx = max(maxx, x + w) + maxy = max(maxy, y + e) + maxy_pure = max(maxy_pure, y) + if self._baseline_v is not None: + maxy_pure = self._baseline_v # This should normally be the case. + self._baseline_v = None + + if not self.text and not self.boxes: # Avoid infs/nans from inf+/-inf. + return Page(text=[], boxes=[], width=0, height=0, descent=0) + + if self.dpi is None: + # special case for ease of debugging: output raw dvi coordinates + return Page(text=self.text, boxes=self.boxes, + width=maxx-minx, height=maxy_pure-miny, + descent=maxy-maxy_pure) + + # convert from TeX's "scaled points" to dpi units + d = self.dpi / (72.27 * 2**16) + descent = (maxy - maxy_pure) * d + + text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d) + for (x, y, f, g, w) in self.text] + boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d) + for (x, y, h, w) in self.boxes] + + return Page(text=text, boxes=boxes, width=(maxx-minx)*d, + height=(maxy_pure-miny)*d, descent=descent) + + def _read(self): + """ + Read one page from the file. Return True if successful, + False if there were no more pages. + """ + # Pages appear to start with the sequence + # bop (begin of page) + # xxx comment + # # if using chemformula + # down + # push + # down + # # if using xcolor + # down + # push + # down (possibly multiple) + # push <= here, v is the baseline position. + # etc. + # (dviasm is useful to explore this structure.) + # Thus, we use the vertical position at the first time the stack depth + # reaches 3, while at least three "downs" have been executed (excluding + # those popped out (corresponding to the chemformula preamble)), as the + # baseline (the "down" count is necessary to handle xcolor). + down_stack = [0] + self._baseline_v = None + while True: + byte = self.file.read(1)[0] + self._dtable[byte](self, byte) + name = self._dtable[byte].__name__ + if name == "_push": + down_stack.append(down_stack[-1]) + elif name == "_pop": + down_stack.pop() + elif name == "_down": + down_stack[-1] += 1 + if (self._baseline_v is None + and len(getattr(self, "stack", [])) == 3 + and down_stack[-1] >= 4): + self._baseline_v = self.v + if byte == 140: # end of page + return True + if self.state is _dvistate.post_post: # end of file + self.close() + return False + + def _arg(self, nbytes, signed=False): + """ + Read and return an integer argument *nbytes* long. + Signedness is determined by the *signed* keyword. + """ + buf = self.file.read(nbytes) + value = buf[0] + if signed and value >= 0x80: + value = value - 0x100 + for b in buf[1:]: + value = 0x100*value + b + return value + + @_dispatch(min=0, max=127, state=_dvistate.inpage) + def _set_char_immediate(self, char): + self._put_char_real(char) + self.h += self.fonts[self.f]._width_of(char) + + @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',)) + def _set_char(self, char): + self._put_char_real(char) + self.h += self.fonts[self.f]._width_of(char) + + @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4')) + def _set_rule(self, a, b): + self._put_rule_real(a, b) + self.h += b + + @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',)) + def _put_char(self, char): + self._put_char_real(char) + + def _put_char_real(self, char): + font = self.fonts[self.f] + if font._vf is None: + self.text.append(Text(self.h, self.v, font, char, + font._width_of(char))) + else: + scale = font._scale + for x, y, f, g, w in font._vf[char].text: + newf = DviFont(scale=_mul2012(scale, f._scale), + tfm=f._tfm, texname=f.texname, vf=f._vf) + self.text.append(Text(self.h + _mul2012(x, scale), + self.v + _mul2012(y, scale), + newf, g, newf._width_of(g))) + self.boxes.extend([Box(self.h + _mul2012(x, scale), + self.v + _mul2012(y, scale), + _mul2012(a, scale), _mul2012(b, scale)) + for x, y, a, b in font._vf[char].boxes]) + + @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4')) + def _put_rule(self, a, b): + self._put_rule_real(a, b) + + def _put_rule_real(self, a, b): + if a > 0 and b > 0: + self.boxes.append(Box(self.h, self.v, a, b)) + + @_dispatch(138) + def _nop(self, _): + pass + + @_dispatch(139, state=_dvistate.outer, args=('s4',)*11) + def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p): + self.state = _dvistate.inpage + self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0 + self.stack = [] + self.text = [] # list of Text objects + self.boxes = [] # list of Box objects + + @_dispatch(140, state=_dvistate.inpage) + def _eop(self, _): + self.state = _dvistate.outer + del self.h, self.v, self.w, self.x, self.y, self.z, self.stack + + @_dispatch(141, state=_dvistate.inpage) + def _push(self, _): + self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z)) + + @_dispatch(142, state=_dvistate.inpage) + def _pop(self, _): + self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop() + + @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',)) + def _right(self, b): + self.h += b + + @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',)) + def _right_w(self, new_w): + if new_w is not None: + self.w = new_w + self.h += self.w + + @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',)) + def _right_x(self, new_x): + if new_x is not None: + self.x = new_x + self.h += self.x + + @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',)) + def _down(self, a): + self.v += a + + @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',)) + def _down_y(self, new_y): + if new_y is not None: + self.y = new_y + self.v += self.y + + @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',)) + def _down_z(self, new_z): + if new_z is not None: + self.z = new_z + self.v += self.z + + @_dispatch(min=171, max=234, state=_dvistate.inpage) + def _fnt_num_immediate(self, k): + self.f = k + + @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',)) + def _fnt_num(self, new_f): + self.f = new_f + + @_dispatch(min=239, max=242, args=('ulen1',)) + def _xxx(self, datalen): + special = self.file.read(datalen) + _log.debug( + 'Dvi._xxx: encountered special: %s', + ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch + for ch in special])) + + @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1')) + def _fnt_def(self, k, c, s, d, a, l): + self._fnt_def_real(k, c, s, d, a, l) + + def _fnt_def_real(self, k, c, s, d, a, l): + n = self.file.read(a + l) + fontname = n[-l:].decode('ascii') + tfm = _tfmfile(fontname) + if c != 0 and tfm.checksum != 0 and c != tfm.checksum: + raise ValueError('tfm checksum mismatch: %s' % n) + try: + vf = _vffile(fontname) + except FileNotFoundError: + vf = None + self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf) + + @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1')) + def _pre(self, i, num, den, mag, k): + self.file.read(k) # comment in the dvi file + if i != 2: + raise ValueError("Unknown dvi format %d" % i) + if num != 25400000 or den != 7227 * 2**16: + raise ValueError("Nonstandard units in dvi file") + # meaning: TeX always uses those exact values, so it + # should be enough for us to support those + # (There are 72.27 pt to an inch so 7227 pt = + # 7227 * 2**16 sp to 100 in. The numerator is multiplied + # by 10^5 to get units of 10**-7 meters.) + if mag != 1000: + raise ValueError("Nonstandard magnification in dvi file") + # meaning: LaTeX seems to frown on setting \mag, so + # I think we can assume this is constant + self.state = _dvistate.outer + + @_dispatch(248, state=_dvistate.outer) + def _post(self, _): + self.state = _dvistate.post_post + # TODO: actually read the postamble and finale? + # currently post_post just triggers closing the file + + @_dispatch(249) + def _post_post(self, _): + raise NotImplementedError + + @_dispatch(min=250, max=255) + def _malformed(self, offset): + raise ValueError(f"unknown command: byte {250 + offset}") + + +class DviFont: + """ + Encapsulation of a font that a DVI file can refer to. + + This class holds a font's texname and size, supports comparison, + and knows the widths of glyphs in the same units as the AFM file. + There are also internal attributes (for use by dviread.py) that + are *not* used for comparison. + + The size is in Adobe points (converted from TeX points). + + Parameters + ---------- + scale : float + Factor by which the font is scaled from its natural size. + tfm : Tfm + TeX font metrics for this font + texname : bytes + Name of the font as used internally by TeX and friends, as an ASCII + bytestring. This is usually very different from any external font + names; `PsfontsMap` can be used to find the external name of the font. + vf : Vf + A TeX "virtual font" file, or None if this font is not virtual. + + Attributes + ---------- + texname : bytes + size : float + Size of the font in Adobe points, converted from the slightly + smaller TeX points. + widths : list + Widths of glyphs in glyph-space units, typically 1/1000ths of + the point size. + + """ + __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm') + + def __init__(self, scale, tfm, texname, vf): + _api.check_isinstance(bytes, texname=texname) + self._scale = scale + self._tfm = tfm + self.texname = texname + self._vf = vf + self.size = scale * (72.0 / (72.27 * 2**16)) + try: + nchars = max(tfm.width) + 1 + except ValueError: + nchars = 0 + self.widths = [(1000*tfm.width.get(char, 0)) >> 20 + for char in range(nchars)] + + def __eq__(self, other): + return (type(self) == type(other) + and self.texname == other.texname and self.size == other.size) + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + return "<{}: {}>".format(type(self).__name__, self.texname) + + def _width_of(self, char): + """Width of char in dvi units.""" + width = self._tfm.width.get(char, None) + if width is not None: + return _mul2012(width, self._scale) + _log.debug('No width for char %d in font %s.', char, self.texname) + return 0 + + def _height_depth_of(self, char): + """Height and depth of char in dvi units.""" + result = [] + for metric, name in ((self._tfm.height, "height"), + (self._tfm.depth, "depth")): + value = metric.get(char, None) + if value is None: + _log.debug('No %s for char %d in font %s', + name, char, self.texname) + result.append(0) + else: + result.append(_mul2012(value, self._scale)) + # cmsyXX (symbols font) glyph 0 ("minus") has a nonzero descent + # so that TeX aligns equations properly + # (https://tex.stackexchange.com/q/526103/) + # but we actually care about the rasterization depth to align + # the dvipng-generated images. + if re.match(br'^cmsy\d+$', self.texname) and char == 0: + result[-1] = 0 + return result + + +class Vf(Dvi): + r""" + A virtual font (\*.vf file) containing subroutines for dvi files. + + Parameters + ---------- + filename : str or path-like + + Notes + ----- + The virtual font format is a derivative of dvi: + http://mirrors.ctan.org/info/knuth/virtual-fonts + This class reuses some of the machinery of `Dvi` + but replaces the `_read` loop and dispatch mechanism. + + Examples + -------- + :: + + vf = Vf(filename) + glyph = vf[code] + glyph.text, glyph.boxes, glyph.width + """ + + def __init__(self, filename): + super().__init__(filename, 0) + try: + self._first_font = None + self._chars = {} + self._read() + finally: + self.close() + + def __getitem__(self, code): + return self._chars[code] + + def _read(self): + """ + Read one page from the file. Return True if successful, + False if there were no more pages. + """ + packet_char, packet_ends = None, None + packet_len, packet_width = None, None + while True: + byte = self.file.read(1)[0] + # If we are in a packet, execute the dvi instructions + if self.state is _dvistate.inpage: + byte_at = self.file.tell()-1 + if byte_at == packet_ends: + self._finalize_packet(packet_char, packet_width) + packet_len, packet_char, packet_width = None, None, None + # fall through to out-of-packet code + elif byte_at > packet_ends: + raise ValueError("Packet length mismatch in vf file") + else: + if byte in (139, 140) or byte >= 243: + raise ValueError( + "Inappropriate opcode %d in vf file" % byte) + Dvi._dtable[byte](self, byte) + continue + + # We are outside a packet + if byte < 242: # a short packet (length given by byte) + packet_len = byte + packet_char, packet_width = self._arg(1), self._arg(3) + packet_ends = self._init_packet(byte) + self.state = _dvistate.inpage + elif byte == 242: # a long packet + packet_len, packet_char, packet_width = \ + [self._arg(x) for x in (4, 4, 4)] + self._init_packet(packet_len) + elif 243 <= byte <= 246: + k = self._arg(byte - 242, byte == 246) + c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)] + self._fnt_def_real(k, c, s, d, a, l) + if self._first_font is None: + self._first_font = k + elif byte == 247: # preamble + i, k = self._arg(1), self._arg(1) + x = self.file.read(k) + cs, ds = self._arg(4), self._arg(4) + self._pre(i, x, cs, ds) + elif byte == 248: # postamble (just some number of 248s) + break + else: + raise ValueError("Unknown vf opcode %d" % byte) + + def _init_packet(self, pl): + if self.state != _dvistate.outer: + raise ValueError("Misplaced packet in vf file") + self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0 + self.stack, self.text, self.boxes = [], [], [] + self.f = self._first_font + return self.file.tell() + pl + + def _finalize_packet(self, packet_char, packet_width): + self._chars[packet_char] = Page( + text=self.text, boxes=self.boxes, width=packet_width, + height=None, descent=None) + self.state = _dvistate.outer + + def _pre(self, i, x, cs, ds): + if self.state is not _dvistate.pre: + raise ValueError("pre command in middle of vf file") + if i != 202: + raise ValueError("Unknown vf format %d" % i) + if len(x): + _log.debug('vf file comment: %s', x) + self.state = _dvistate.outer + # cs = checksum, ds = design size + + +def _mul2012(num1, num2): + """Multiply two numbers in 20.12 fixed point format.""" + # Separated into a function because >> has surprising precedence + return (num1*num2) >> 20 + + +class Tfm: + """ + A TeX Font Metric file. + + This implementation covers only the bare minimum needed by the Dvi class. + + Parameters + ---------- + filename : str or path-like + + Attributes + ---------- + checksum : int + Used for verifying against the dvi file. + design_size : int + Design size of the font (unknown units) + width, height, depth : dict + Dimensions of each character, need to be scaled by the factor + specified in the dvi file. These are dicts because indexing may + not start from 0. + """ + __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth') + + def __init__(self, filename): + _log.debug('opening tfm file %s', filename) + with open(filename, 'rb') as file: + header1 = file.read(24) + lh, bc, ec, nw, nh, nd = struct.unpack('!6H', header1[2:14]) + _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d', + lh, bc, ec, nw, nh, nd) + header2 = file.read(4*lh) + self.checksum, self.design_size = struct.unpack('!2I', header2[:8]) + # there is also encoding information etc. + char_info = file.read(4*(ec-bc+1)) + widths = struct.unpack(f'!{nw}i', file.read(4*nw)) + heights = struct.unpack(f'!{nh}i', file.read(4*nh)) + depths = struct.unpack(f'!{nd}i', file.read(4*nd)) + self.width, self.height, self.depth = {}, {}, {} + for idx, char in enumerate(range(bc, ec+1)): + byte0 = char_info[4*idx] + byte1 = char_info[4*idx+1] + self.width[char] = widths[byte0] + self.height[char] = heights[byte1 >> 4] + self.depth[char] = depths[byte1 & 0xf] + + +PsFont = namedtuple('PsFont', 'texname psname effects encoding filename') + + +class PsfontsMap: + """ + A psfonts.map formatted file, mapping TeX fonts to PS fonts. + + Parameters + ---------- + filename : str or path-like + + Notes + ----- + For historical reasons, TeX knows many Type-1 fonts by different + names than the outside world. (For one thing, the names have to + fit in eight characters.) Also, TeX's native fonts are not Type-1 + but Metafont, which is nontrivial to convert to PostScript except + as a bitmap. While high-quality conversions to Type-1 format exist + and are shipped with modern TeX distributions, we need to know + which Type-1 fonts are the counterparts of which native fonts. For + these reasons a mapping is needed from internal font names to font + file names. + + A texmf tree typically includes mapping files called e.g. + :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`. + The file :file:`psfonts.map` is used by :program:`dvips`, + :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map` + by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding + the 35 PostScript fonts (i.e., have no filename for them, as in + the Times-Bold example above), while the pdf-related files perhaps + only avoid the "Base 14" pdf fonts. But the user may have + configured these files differently. + + Examples + -------- + >>> map = PsfontsMap(find_tex_file('pdftex.map')) + >>> entry = map[b'ptmbo8r'] + >>> entry.texname + b'ptmbo8r' + >>> entry.psname + b'Times-Bold' + >>> entry.encoding + '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc' + >>> entry.effects + {'slant': 0.16700000000000001} + >>> entry.filename + """ + __slots__ = ('_filename', '_unparsed', '_parsed') + + # Create a filename -> PsfontsMap cache, so that calling + # `PsfontsMap(filename)` with the same filename a second time immediately + # returns the same object. + @lru_cache() + def __new__(cls, filename): + self = object.__new__(cls) + self._filename = os.fsdecode(filename) + # Some TeX distributions have enormous pdftex.map files which would + # take hundreds of milliseconds to parse, but it is easy enough to just + # store the unparsed lines (keyed by the first word, which is the + # texname) and parse them on-demand. + with open(filename, 'rb') as file: + self._unparsed = {} + for line in file: + tfmname = line.split(b' ', 1)[0] + self._unparsed.setdefault(tfmname, []).append(line) + self._parsed = {} + return self + + def __getitem__(self, texname): + assert isinstance(texname, bytes) + if texname in self._unparsed: + for line in self._unparsed.pop(texname): + if self._parse_and_cache_line(line): + break + try: + return self._parsed[texname] + except KeyError: + raise LookupError( + f"An associated PostScript font (required by Matplotlib) " + f"could not be found for TeX font {texname.decode('ascii')!r} " + f"in {self._filename!r}; this problem can often be solved by " + f"installing a suitable PostScript font package in your TeX " + f"package manager") from None + + def _parse_and_cache_line(self, line): + """ + Parse a line in the font mapping file. + + The format is (partially) documented at + http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf + https://tug.org/texinfohtml/dvips.html#psfonts_002emap + Each line can have the following fields: + + - tfmname (first, only required field), + - psname (defaults to tfmname, must come immediately after tfmname if + present), + - fontflags (integer, must come immediately after psname if present, + ignored by us), + - special (SlantFont and ExtendFont, only field that is double-quoted), + - fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always + precedes a font, <[ always precedes an encoding, < can precede either + but then an encoding file must have extension .enc; < and << also + request different font subsetting behaviors but we ignore that; < can + be separated from the filename by whitespace). + + special, fontfile, and encodingfile can appear in any order. + """ + # If the map file specifies multiple encodings for a font, we + # follow pdfTeX in choosing the last one specified. Such + # entries are probably mistakes but they have occurred. + # https://tex.stackexchange.com/q/10826/ + + if not line or line.startswith((b" ", b"%", b"*", b";", b"#")): + return + tfmname = basename = special = encodingfile = fontfile = None + is_subsetted = is_t1 = is_truetype = False + matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line) + for match in matches: + quoted, unquoted = match.groups() + if unquoted: + if unquoted.startswith(b"<<"): # font + fontfile = unquoted[2:] + elif unquoted.startswith(b"<["): # encoding + encodingfile = unquoted[2:] + elif unquoted.startswith(b"<"): # font or encoding + word = ( + # foo + unquoted[1:] + # < by itself => read the next word + or next(filter(None, next(matches).groups()))) + if word.endswith(b".enc"): + encodingfile = word + else: + fontfile = word + is_subsetted = True + elif tfmname is None: + tfmname = unquoted + elif basename is None: + basename = unquoted + elif quoted: + special = quoted + effects = {} + if special: + words = reversed(special.split()) + for word in words: + if word == b"SlantFont": + effects["slant"] = float(next(words)) + elif word == b"ExtendFont": + effects["extend"] = float(next(words)) + + # Verify some properties of the line that would cause it to be ignored + # otherwise. + if fontfile is not None: + if fontfile.endswith((b".ttf", b".ttc")): + is_truetype = True + elif not fontfile.endswith(b".otf"): + is_t1 = True + elif basename is not None: + is_t1 = True + if is_truetype and is_subsetted and encodingfile is None: + return + if not is_t1 and ("slant" in effects or "extend" in effects): + return + if abs(effects.get("slant", 0)) > 1: + return + if abs(effects.get("extend", 0)) > 2: + return + + if basename is None: + basename = tfmname + if encodingfile is not None: + encodingfile = _find_tex_file(encodingfile) + if fontfile is not None: + fontfile = _find_tex_file(fontfile) + self._parsed[tfmname] = PsFont( + texname=tfmname, psname=basename, effects=effects, + encoding=encodingfile, filename=fontfile) + return True + + +def _parse_enc(path): + r""" + Parse a \*.enc file referenced from a psfonts.map style file. + + The format supported by this function is a tiny subset of PostScript. + + Parameters + ---------- + path : os.PathLike + + Returns + ------- + list + The nth entry of the list is the PostScript glyph name of the nth + glyph. + """ + no_comments = re.sub("%.*", "", Path(path).read_text(encoding="ascii")) + array = re.search(r"(?s)\[(.*)\]", no_comments).group(1) + lines = [line for line in array.split() if line] + if all(line.startswith("/") for line in lines): + return [line[1:] for line in lines] + else: + raise ValueError( + "Failed to parse {} as Postscript encoding".format(path)) + + +class _LuatexKpsewhich: + @lru_cache() # A singleton. + def __new__(cls): + self = object.__new__(cls) + self._proc = self._new_proc() + return self + + def _new_proc(self): + return subprocess.Popen( + ["luatex", "--luaonly", + str(cbook._get_data_path("kpsewhich.lua"))], + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + + def search(self, filename): + if self._proc.poll() is not None: # Dead, restart it. + self._proc = self._new_proc() + self._proc.stdin.write(os.fsencode(filename) + b"\n") + self._proc.stdin.flush() + out = self._proc.stdout.readline().rstrip() + return None if out == b"nil" else os.fsdecode(out) + + +@lru_cache() +@_api.delete_parameter("3.5", "format") +def _find_tex_file(filename, format=None): + """ + Find a file in the texmf tree using kpathsea_. + + The kpathsea library, provided by most existing TeX distributions, both + on Unix-like systems and on Windows (MikTeX), is invoked via a long-lived + luatex process if luatex is installed, or via kpsewhich otherwise. + + .. _kpathsea: https://www.tug.org/kpathsea/ + + Parameters + ---------- + filename : str or path-like + format : str or bytes + Used as the value of the ``--format`` option to :program:`kpsewhich`. + Could be e.g. 'tfm' or 'vf' to limit the search to that type of files. + Deprecated. + + Raises + ------ + FileNotFoundError + If the file is not found. + """ + + # we expect these to always be ascii encoded, but use utf-8 + # out of caution + if isinstance(filename, bytes): + filename = filename.decode('utf-8', errors='replace') + if isinstance(format, bytes): + format = format.decode('utf-8', errors='replace') + + try: + lk = _LuatexKpsewhich() + except FileNotFoundError: + lk = None # Fallback to directly calling kpsewhich, as below. + + if lk and format is None: + path = lk.search(filename) + + else: + if os.name == 'nt': + # On Windows only, kpathsea can use utf-8 for cmd args and output. + # The `command_line_encoding` environment variable is set to force + # it to always use utf-8 encoding. See Matplotlib issue #11848. + kwargs = {'env': {**os.environ, 'command_line_encoding': 'utf-8'}, + 'encoding': 'utf-8'} + else: # On POSIX, run through the equivalent of os.fsdecode(). + kwargs = {'encoding': sys.getfilesystemencoding(), + 'errors': 'surrogateescape'} + + cmd = ['kpsewhich'] + if format is not None: + cmd += ['--format=' + format] + cmd += [filename] + try: + path = (cbook._check_and_log_subprocess(cmd, _log, **kwargs) + .rstrip('\n')) + except (FileNotFoundError, RuntimeError): + path = None + + if path: + return path + else: + raise FileNotFoundError( + f"Matplotlib's TeX implementation searched for a file named " + f"{filename!r} in your texmf tree, but could not find it") + + +# After the deprecation period elapses, delete this shim and rename +# _find_tex_file to find_tex_file everywhere. +@_api.delete_parameter("3.5", "format") +def find_tex_file(filename, format=None): + try: + return (_find_tex_file(filename, format) if format is not None else + _find_tex_file(filename)) + except FileNotFoundError as exc: + _api.warn_deprecated( + "3.6", message=f"{exc.args[0]}; in the future, this will raise a " + f"FileNotFoundError.") + return "" + + +find_tex_file.__doc__ = _find_tex_file.__doc__ + + +@lru_cache() +def _fontfile(cls, suffix, texname): + return cls(_find_tex_file(texname + suffix)) + + +_tfmfile = partial(_fontfile, Tfm, ".tfm") +_vffile = partial(_fontfile, Vf, ".vf") + + +if __name__ == '__main__': + from argparse import ArgumentParser + import itertools + + parser = ArgumentParser() + parser.add_argument("filename") + parser.add_argument("dpi", nargs="?", type=float, default=None) + args = parser.parse_args() + with Dvi(args.filename, args.dpi) as dvi: + fontmap = PsfontsMap(_find_tex_file('pdftex.map')) + for page in dvi: + print(f"=== new page === " + f"(w: {page.width}, h: {page.height}, d: {page.descent})") + for font, group in itertools.groupby( + page.text, lambda text: text.font): + print(f"font: {font.texname.decode('latin-1')!r}\t" + f"scale: {font._scale / 2 ** 20}") + print("x", "y", "glyph", "chr", "w", "(glyphs)", sep="\t") + for text in group: + print(text.x, text.y, text.glyph, + chr(text.glyph) if chr(text.glyph).isprintable() + else ".", + text.width, sep="\t") + if page.boxes: + print("x", "y", "w", "h", "", "(boxes)", sep="\t") + for x, y, w, h in page.boxes: + print(x, y, w, h, sep="\t") From 0b8bc1b8c6108ab96d1671ba98258ee30336eb20 Mon Sep 17 00:00:00 2001 From: Oscar Gustafsson Date: Sat, 8 Jan 2022 12:46:45 +0100 Subject: [PATCH 5/7] Organized files and updated references --- lib/matplotlib/_dviread.py | 499 ----------- lib/matplotlib/_dviread_vf.py | 1002 +---------------------- lib/matplotlib/backends/backend_pdf.py | 6 +- lib/matplotlib/backends/backend_ps.py | 2 +- lib/matplotlib/dviread.py | 585 +------------ lib/matplotlib/mpl-data/kpsewhich.lua | 2 +- lib/matplotlib/testing/__init__.py | 2 +- lib/matplotlib/tests/test_dviread.py | 9 +- lib/matplotlib/tests/test_texmanager.py | 2 +- lib/matplotlib/tests/test_usetex.py | 6 +- lib/matplotlib/texmanager.py | 5 + lib/matplotlib/textpath.py | 8 +- 12 files changed, 65 insertions(+), 2063 deletions(-) create mode 100644 lib/matplotlib/texmanager.py diff --git a/lib/matplotlib/_dviread.py b/lib/matplotlib/_dviread.py index 7f90a13f1086..af1de84779f5 100644 --- a/lib/matplotlib/_dviread.py +++ b/lib/matplotlib/_dviread.py @@ -28,8 +28,6 @@ import subprocess import sys -import numpy as np - from matplotlib import _api, cbook _log = logging.getLogger(__name__) @@ -38,24 +36,6 @@ # additional parsing, and are used many times per rendering, which is why they # are cached using lru_cache(). -# Dvi is a bytecode format documented in -# https://ctan.org/pkg/dvitype -# https://texdoc.org/serve/dvitype.pdf/0 -# -# The file consists of a preamble, some number of pages, a postamble, -# and a finale. Different opcodes are allowed in different contexts, -# so the Dvi object has a parser state: -# -# pre: expecting the preamble -# outer: between pages (followed by a page or the postamble, -# also e.g. font definitions are allowed) -# page: processing a page -# post_post: state after the postamble (our current implementation -# just stops reading) -# finale: the finale (unimplemented in our current implementation) - -_dvistate = enum.Enum('DviState', 'pre outer inpage post_post finale') - # The marks on a page consist of text and boxes. A page also has dimensions. Page = namedtuple('Page', 'text boxes height width descent') Text = namedtuple('Text', 'x y font glyph width') @@ -174,343 +154,6 @@ def wrapper(self, byte): return decorate -class Dvi: - """ - A reader for a dvi ("device-independent") file, as produced by TeX. - - The current implementation can only iterate through pages in order, - and does not even attempt to verify the postamble. - - This class can be used as a context manager to close the underlying - file upon exit. Pages can be read via iteration. Here is an overly - simple way to extract text without trying to detect whitespace:: - - >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi: - ... for page in dvi: - ... print(''.join(chr(t.glyph) for t in page.text)) - """ - # dispatch table - _dtable = [None] * 256 - _dispatch = partial(_dispatch, _dtable) - - def __init__(self, filename, dpi): - """ - Read the data from the file named *filename* and convert - TeX's internal units to units of *dpi* per inch. - *dpi* only sets the units and does not limit the resolution. - Use None to return TeX's internal units. - """ - _log.debug('Dvi: %s', filename) - self.file = open(filename, 'rb') - self.dpi = dpi - self.fonts = {} - self.state = _dvistate.pre - - baseline = _api.deprecated("3.5")(property(lambda self: None)) - - def __enter__(self): - """Context manager enter method, does nothing.""" - return self - - def __exit__(self, etype, evalue, etrace): - """ - Context manager exit method, closes the underlying file if it is open. - """ - self.close() - - def __iter__(self): - """ - Iterate through the pages of the file. - - Yields - ------ - Page - Details of all the text and box objects on the page. - The Page tuple contains lists of Text and Box tuples and - the page dimensions, and the Text and Box tuples contain - coordinates transformed into a standard Cartesian - coordinate system at the dpi value given when initializing. - The coordinates are floating point numbers, but otherwise - precision is not lost and coordinate values are not clipped to - integers. - """ - while self._read(): - yield self._output() - - def close(self): - """Close the underlying file if it is open.""" - if not self.file.closed: - self.file.close() - - def _output(self): - """ - Output the text and boxes belonging to the most recent page. - page = dvi._output() - """ - minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf - maxy_pure = -np.inf - for elt in self.text + self.boxes: - if isinstance(elt, Box): - x, y, h, w = elt - e = 0 # zero depth - else: # glyph - x, y, font, g, w = elt - h, e = font._height_depth_of(g) - minx = min(minx, x) - miny = min(miny, y - h) - maxx = max(maxx, x + w) - maxy = max(maxy, y + e) - maxy_pure = max(maxy_pure, y) - if self._baseline_v is not None: - maxy_pure = self._baseline_v # This should normally be the case. - self._baseline_v = None - - if not self.text and not self.boxes: # Avoid infs/nans from inf+/-inf. - return Page(text=[], boxes=[], width=0, height=0, descent=0) - - if self.dpi is None: - # special case for ease of debugging: output raw dvi coordinates - return Page(text=self.text, boxes=self.boxes, - width=maxx-minx, height=maxy_pure-miny, - descent=maxy-maxy_pure) - - # convert from TeX's "scaled points" to dpi units - d = self.dpi / (72.27 * 2**16) - descent = (maxy - maxy_pure) * d - - text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d) - for (x, y, f, g, w) in self.text] - boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d) - for (x, y, h, w) in self.boxes] - - return Page(text=text, boxes=boxes, width=(maxx-minx)*d, - height=(maxy_pure-miny)*d, descent=descent) - - def _read(self): - """ - Read one page from the file. Return True if successful, - False if there were no more pages. - """ - # Pages appear to start with the sequence - # bop (begin of page) - # xxx comment - # # if using chemformula - # down - # push - # down - # # if using xcolor - # down - # push - # down (possibly multiple) - # push <= here, v is the baseline position. - # etc. - # (dviasm is useful to explore this structure.) - # Thus, we use the vertical position at the first time the stack depth - # reaches 3, while at least three "downs" have been executed (excluding - # those popped out (corresponding to the chemformula preamble)), as the - # baseline (the "down" count is necessary to handle xcolor). - down_stack = [0] - self._baseline_v = None - while True: - byte = self.file.read(1)[0] - self._dtable[byte](self, byte) - name = self._dtable[byte].__name__ - if name == "_push": - down_stack.append(down_stack[-1]) - elif name == "_pop": - down_stack.pop() - elif name == "_down": - down_stack[-1] += 1 - if (self._baseline_v is None - and len(getattr(self, "stack", [])) == 3 - and down_stack[-1] >= 4): - self._baseline_v = self.v - if byte == 140: # end of page - return True - if self.state is _dvistate.post_post: # end of file - self.close() - return False - - def _arg(self, nbytes, signed=False): - """ - Read and return an integer argument *nbytes* long. - Signedness is determined by the *signed* keyword. - """ - buf = self.file.read(nbytes) - value = buf[0] - if signed and value >= 0x80: - value = value - 0x100 - for b in buf[1:]: - value = 0x100*value + b - return value - - @_dispatch(min=0, max=127, state=_dvistate.inpage) - def _set_char_immediate(self, char): - self._put_char_real(char) - self.h += self.fonts[self.f]._width_of(char) - - @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',)) - def _set_char(self, char): - self._put_char_real(char) - self.h += self.fonts[self.f]._width_of(char) - - @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4')) - def _set_rule(self, a, b): - self._put_rule_real(a, b) - self.h += b - - @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',)) - def _put_char(self, char): - self._put_char_real(char) - - def _put_char_real(self, char): - font = self.fonts[self.f] - if font._vf is None: - self.text.append(Text(self.h, self.v, font, char, - font._width_of(char))) - else: - scale = font._scale - for x, y, f, g, w in font._vf[char].text: - newf = DviFont(scale=_mul2012(scale, f._scale), - tfm=f._tfm, texname=f.texname, vf=f._vf) - self.text.append(Text(self.h + _mul2012(x, scale), - self.v + _mul2012(y, scale), - newf, g, newf._width_of(g))) - self.boxes.extend([Box(self.h + _mul2012(x, scale), - self.v + _mul2012(y, scale), - _mul2012(a, scale), _mul2012(b, scale)) - for x, y, a, b in font._vf[char].boxes]) - - @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4')) - def _put_rule(self, a, b): - self._put_rule_real(a, b) - - def _put_rule_real(self, a, b): - if a > 0 and b > 0: - self.boxes.append(Box(self.h, self.v, a, b)) - - @_dispatch(138) - def _nop(self, _): - pass - - @_dispatch(139, state=_dvistate.outer, args=('s4',)*11) - def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p): - self.state = _dvistate.inpage - self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0 - self.stack = [] - self.text = [] # list of Text objects - self.boxes = [] # list of Box objects - - @_dispatch(140, state=_dvistate.inpage) - def _eop(self, _): - self.state = _dvistate.outer - del self.h, self.v, self.w, self.x, self.y, self.z, self.stack - - @_dispatch(141, state=_dvistate.inpage) - def _push(self, _): - self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z)) - - @_dispatch(142, state=_dvistate.inpage) - def _pop(self, _): - self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop() - - @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',)) - def _right(self, b): - self.h += b - - @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',)) - def _right_w(self, new_w): - if new_w is not None: - self.w = new_w - self.h += self.w - - @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',)) - def _right_x(self, new_x): - if new_x is not None: - self.x = new_x - self.h += self.x - - @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',)) - def _down(self, a): - self.v += a - - @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',)) - def _down_y(self, new_y): - if new_y is not None: - self.y = new_y - self.v += self.y - - @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',)) - def _down_z(self, new_z): - if new_z is not None: - self.z = new_z - self.v += self.z - - @_dispatch(min=171, max=234, state=_dvistate.inpage) - def _fnt_num_immediate(self, k): - self.f = k - - @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',)) - def _fnt_num(self, new_f): - self.f = new_f - - @_dispatch(min=239, max=242, args=('ulen1',)) - def _xxx(self, datalen): - special = self.file.read(datalen) - _log.debug( - 'Dvi._xxx: encountered special: %s', - ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch - for ch in special])) - - @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1')) - def _fnt_def(self, k, c, s, d, a, l): - self._fnt_def_real(k, c, s, d, a, l) - - def _fnt_def_real(self, k, c, s, d, a, l): - n = self.file.read(a + l) - fontname = n[-l:].decode('ascii') - tfm = _tfmfile(fontname) - if c != 0 and tfm.checksum != 0 and c != tfm.checksum: - raise ValueError('tfm checksum mismatch: %s' % n) - try: - vf = _vffile(fontname) - except FileNotFoundError: - vf = None - self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf) - - @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1')) - def _pre(self, i, num, den, mag, k): - self.file.read(k) # comment in the dvi file - if i != 2: - raise ValueError("Unknown dvi format %d" % i) - if num != 25400000 or den != 7227 * 2**16: - raise ValueError("Nonstandard units in dvi file") - # meaning: TeX always uses those exact values, so it - # should be enough for us to support those - # (There are 72.27 pt to an inch so 7227 pt = - # 7227 * 2**16 sp to 100 in. The numerator is multiplied - # by 10^5 to get units of 10**-7 meters.) - if mag != 1000: - raise ValueError("Nonstandard magnification in dvi file") - # meaning: LaTeX seems to frown on setting \mag, so - # I think we can assume this is constant - self.state = _dvistate.outer - - @_dispatch(248, state=_dvistate.outer) - def _post(self, _): - self.state = _dvistate.post_post - # TODO: actually read the postamble and finale? - # currently post_post just triggers closing the file - - @_dispatch(249) - def _post_post(self, _): - raise NotImplementedError - - @_dispatch(min=250, max=255) - def _malformed(self, offset): - raise ValueError(f"unknown command: byte {250 + offset}") - - class DviFont: """ Encapsulation of a font that a DVI file can refer to. @@ -602,118 +245,6 @@ def _height_depth_of(self, char): return result -class Vf(Dvi): - r""" - A virtual font (\*.vf file) containing subroutines for dvi files. - - Parameters - ---------- - filename : str or path-like - - Notes - ----- - The virtual font format is a derivative of dvi: - http://mirrors.ctan.org/info/knuth/virtual-fonts - This class reuses some of the machinery of `Dvi` - but replaces the `_read` loop and dispatch mechanism. - - Examples - -------- - :: - - vf = Vf(filename) - glyph = vf[code] - glyph.text, glyph.boxes, glyph.width - """ - - def __init__(self, filename): - super().__init__(filename, 0) - try: - self._first_font = None - self._chars = {} - self._read() - finally: - self.close() - - def __getitem__(self, code): - return self._chars[code] - - def _read(self): - """ - Read one page from the file. Return True if successful, - False if there were no more pages. - """ - packet_char, packet_ends = None, None - packet_len, packet_width = None, None - while True: - byte = self.file.read(1)[0] - # If we are in a packet, execute the dvi instructions - if self.state is _dvistate.inpage: - byte_at = self.file.tell()-1 - if byte_at == packet_ends: - self._finalize_packet(packet_char, packet_width) - packet_len, packet_char, packet_width = None, None, None - # fall through to out-of-packet code - elif byte_at > packet_ends: - raise ValueError("Packet length mismatch in vf file") - else: - if byte in (139, 140) or byte >= 243: - raise ValueError( - "Inappropriate opcode %d in vf file" % byte) - Dvi._dtable[byte](self, byte) - continue - - # We are outside a packet - if byte < 242: # a short packet (length given by byte) - packet_len = byte - packet_char, packet_width = self._arg(1), self._arg(3) - packet_ends = self._init_packet(byte) - self.state = _dvistate.inpage - elif byte == 242: # a long packet - packet_len, packet_char, packet_width = \ - [self._arg(x) for x in (4, 4, 4)] - self._init_packet(packet_len) - elif 243 <= byte <= 246: - k = self._arg(byte - 242, byte == 246) - c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)] - self._fnt_def_real(k, c, s, d, a, l) - if self._first_font is None: - self._first_font = k - elif byte == 247: # preamble - i, k = self._arg(1), self._arg(1) - x = self.file.read(k) - cs, ds = self._arg(4), self._arg(4) - self._pre(i, x, cs, ds) - elif byte == 248: # postamble (just some number of 248s) - break - else: - raise ValueError("Unknown vf opcode %d" % byte) - - def _init_packet(self, pl): - if self.state != _dvistate.outer: - raise ValueError("Misplaced packet in vf file") - self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0 - self.stack, self.text, self.boxes = [], [], [] - self.f = self._first_font - return self.file.tell() + pl - - def _finalize_packet(self, packet_char, packet_width): - self._chars[packet_char] = Page( - text=self.text, boxes=self.boxes, width=packet_width, - height=None, descent=None) - self.state = _dvistate.outer - - def _pre(self, i, x, cs, ds): - if self.state is not _dvistate.pre: - raise ValueError("pre command in middle of vf file") - if i != 202: - raise ValueError("Unknown vf format %d" % i) - if len(x): - _log.debug('vf file comment: %s', x) - self.state = _dvistate.outer - # cs = checksum, ds = design size - - def _mul2012(num1, num2): """Multiply two numbers in 20.12 fixed point format.""" # Separated into a function because >> has surprising precedence @@ -1087,33 +618,3 @@ def _fontfile(cls, suffix, texname): _tfmfile = partial(_fontfile, Tfm, ".tfm") -_vffile = partial(_fontfile, Vf, ".vf") - - -if __name__ == '__main__': - from argparse import ArgumentParser - import itertools - - parser = ArgumentParser() - parser.add_argument("filename") - parser.add_argument("dpi", nargs="?", type=float, default=None) - args = parser.parse_args() - with Dvi(args.filename, args.dpi) as dvi: - fontmap = PsfontsMap(_find_tex_file('pdftex.map')) - for page in dvi: - print(f"=== new page === " - f"(w: {page.width}, h: {page.height}, d: {page.descent})") - for font, group in itertools.groupby( - page.text, lambda text: text.font): - print(f"font: {font.texname.decode('latin-1')!r}\t" - f"scale: {font._scale / 2 ** 20}") - print("x", "y", "glyph", "chr", "w", "(glyphs)", sep="\t") - for text in group: - print(text.x, text.y, text.glyph, - chr(text.glyph) if chr(text.glyph).isprintable() - else ".", - text.width, sep="\t") - if page.boxes: - print("x", "y", "w", "h", "", "(boxes)", sep="\t") - for x, y, w, h in page.boxes: - print(x, y, w, h, sep="\t") diff --git a/lib/matplotlib/_dviread_vf.py b/lib/matplotlib/_dviread_vf.py index 7f90a13f1086..7beb013d5464 100644 --- a/lib/matplotlib/_dviread_vf.py +++ b/lib/matplotlib/_dviread_vf.py @@ -1,606 +1,12 @@ -""" -A module for reading dvi files output by TeX. Several limitations make -this not (currently) useful as a general-purpose dvi preprocessor, but -it is currently used by the pdf backend for processing usetex text. - -Interface:: - - with Dvi(filename, 72) as dvi: - # iterate over pages: - for page in dvi: - w, h, d = page.width, page.height, page.descent - for x, y, font, glyph, width in page.text: - fontname = font.texname - pointsize = font.size - ... - for x, y, height, width in page.boxes: - ... -""" - -from collections import namedtuple -import enum -from functools import lru_cache, partial, wraps +from functools import partial import logging -import os -from pathlib import Path -import re -import struct -import subprocess -import sys -import numpy as np +from matplotlib.dviread import Dvi +from matplotlib._dviread import _dvistate, _fontfile, Page -from matplotlib import _api, cbook _log = logging.getLogger(__name__) -# Many dvi related files are looked for by external processes, require -# additional parsing, and are used many times per rendering, which is why they -# are cached using lru_cache(). - -# Dvi is a bytecode format documented in -# https://ctan.org/pkg/dvitype -# https://texdoc.org/serve/dvitype.pdf/0 -# -# The file consists of a preamble, some number of pages, a postamble, -# and a finale. Different opcodes are allowed in different contexts, -# so the Dvi object has a parser state: -# -# pre: expecting the preamble -# outer: between pages (followed by a page or the postamble, -# also e.g. font definitions are allowed) -# page: processing a page -# post_post: state after the postamble (our current implementation -# just stops reading) -# finale: the finale (unimplemented in our current implementation) - -_dvistate = enum.Enum('DviState', 'pre outer inpage post_post finale') - -# The marks on a page consist of text and boxes. A page also has dimensions. -Page = namedtuple('Page', 'text boxes height width descent') -Text = namedtuple('Text', 'x y font glyph width') -Box = namedtuple('Box', 'x y height width') - - -# Opcode argument parsing -# -# Each of the following functions takes a Dvi object and delta, -# which is the difference between the opcode and the minimum opcode -# with the same meaning. Dvi opcodes often encode the number of -# argument bytes in this delta. - -def _arg_raw(dvi, delta): - """Return *delta* without reading anything more from the dvi file.""" - return delta - - -def _arg(nbytes, signed, dvi, _): - """ - Read *nbytes* bytes, returning the bytes interpreted as a signed integer - if *signed* is true, unsigned otherwise. - """ - return dvi._arg(nbytes, signed) - - -def _arg_slen(dvi, delta): - """ - Read *delta* bytes, returning None if *delta* is zero, and the bytes - interpreted as a signed integer otherwise. - """ - if delta == 0: - return None - return dvi._arg(delta, True) - - -def _arg_slen1(dvi, delta): - """ - Read *delta*+1 bytes, returning the bytes interpreted as signed. - """ - return dvi._arg(delta + 1, True) - - -def _arg_ulen1(dvi, delta): - """ - Read *delta*+1 bytes, returning the bytes interpreted as unsigned. - """ - return dvi._arg(delta + 1, False) - - -def _arg_olen1(dvi, delta): - """ - Read *delta*+1 bytes, returning the bytes interpreted as - unsigned integer for 0<=*delta*<3 and signed if *delta*==3. - """ - return dvi._arg(delta + 1, delta == 3) - - -_arg_mapping = dict(raw=_arg_raw, - u1=partial(_arg, 1, False), - u4=partial(_arg, 4, False), - s4=partial(_arg, 4, True), - slen=_arg_slen, - olen1=_arg_olen1, - slen1=_arg_slen1, - ulen1=_arg_ulen1) - - -def _dispatch(table, min, max=None, state=None, args=('raw',)): - """ - Decorator for dispatch by opcode. Sets the values in *table* - from *min* to *max* to this method, adds a check that the Dvi state - matches *state* if not None, reads arguments from the file according - to *args*. - - Parameters - ---------- - table : dict[int, callable] - The dispatch table to be filled in. - - min, max : int - Range of opcodes that calls the registered function; *max* defaults to - *min*. - - state : _dvistate, optional - State of the Dvi object in which these opcodes are allowed. - - args : list[str], default: ['raw'] - Sequence of argument specifications: - - - 'raw': opcode minus minimum - - 'u1': read one unsigned byte - - 'u4': read four bytes, treat as an unsigned number - - 's4': read four bytes, treat as a signed number - - 'slen': read (opcode - minimum) bytes, treat as signed - - 'slen1': read (opcode - minimum + 1) bytes, treat as signed - - 'ulen1': read (opcode - minimum + 1) bytes, treat as unsigned - - 'olen1': read (opcode - minimum + 1) bytes, treat as unsigned - if under four bytes, signed if four bytes - """ - def decorate(method): - get_args = [_arg_mapping[x] for x in args] - - @wraps(method) - def wrapper(self, byte): - if state is not None and self.state != state: - raise ValueError("state precondition failed") - return method(self, *[f(self, byte-min) for f in get_args]) - if max is None: - table[min] = wrapper - else: - for i in range(min, max+1): - assert table[i] is None - table[i] = wrapper - return wrapper - return decorate - - -class Dvi: - """ - A reader for a dvi ("device-independent") file, as produced by TeX. - - The current implementation can only iterate through pages in order, - and does not even attempt to verify the postamble. - - This class can be used as a context manager to close the underlying - file upon exit. Pages can be read via iteration. Here is an overly - simple way to extract text without trying to detect whitespace:: - - >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi: - ... for page in dvi: - ... print(''.join(chr(t.glyph) for t in page.text)) - """ - # dispatch table - _dtable = [None] * 256 - _dispatch = partial(_dispatch, _dtable) - - def __init__(self, filename, dpi): - """ - Read the data from the file named *filename* and convert - TeX's internal units to units of *dpi* per inch. - *dpi* only sets the units and does not limit the resolution. - Use None to return TeX's internal units. - """ - _log.debug('Dvi: %s', filename) - self.file = open(filename, 'rb') - self.dpi = dpi - self.fonts = {} - self.state = _dvistate.pre - - baseline = _api.deprecated("3.5")(property(lambda self: None)) - - def __enter__(self): - """Context manager enter method, does nothing.""" - return self - - def __exit__(self, etype, evalue, etrace): - """ - Context manager exit method, closes the underlying file if it is open. - """ - self.close() - - def __iter__(self): - """ - Iterate through the pages of the file. - - Yields - ------ - Page - Details of all the text and box objects on the page. - The Page tuple contains lists of Text and Box tuples and - the page dimensions, and the Text and Box tuples contain - coordinates transformed into a standard Cartesian - coordinate system at the dpi value given when initializing. - The coordinates are floating point numbers, but otherwise - precision is not lost and coordinate values are not clipped to - integers. - """ - while self._read(): - yield self._output() - - def close(self): - """Close the underlying file if it is open.""" - if not self.file.closed: - self.file.close() - - def _output(self): - """ - Output the text and boxes belonging to the most recent page. - page = dvi._output() - """ - minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf - maxy_pure = -np.inf - for elt in self.text + self.boxes: - if isinstance(elt, Box): - x, y, h, w = elt - e = 0 # zero depth - else: # glyph - x, y, font, g, w = elt - h, e = font._height_depth_of(g) - minx = min(minx, x) - miny = min(miny, y - h) - maxx = max(maxx, x + w) - maxy = max(maxy, y + e) - maxy_pure = max(maxy_pure, y) - if self._baseline_v is not None: - maxy_pure = self._baseline_v # This should normally be the case. - self._baseline_v = None - - if not self.text and not self.boxes: # Avoid infs/nans from inf+/-inf. - return Page(text=[], boxes=[], width=0, height=0, descent=0) - - if self.dpi is None: - # special case for ease of debugging: output raw dvi coordinates - return Page(text=self.text, boxes=self.boxes, - width=maxx-minx, height=maxy_pure-miny, - descent=maxy-maxy_pure) - - # convert from TeX's "scaled points" to dpi units - d = self.dpi / (72.27 * 2**16) - descent = (maxy - maxy_pure) * d - - text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d) - for (x, y, f, g, w) in self.text] - boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d) - for (x, y, h, w) in self.boxes] - - return Page(text=text, boxes=boxes, width=(maxx-minx)*d, - height=(maxy_pure-miny)*d, descent=descent) - - def _read(self): - """ - Read one page from the file. Return True if successful, - False if there were no more pages. - """ - # Pages appear to start with the sequence - # bop (begin of page) - # xxx comment - # # if using chemformula - # down - # push - # down - # # if using xcolor - # down - # push - # down (possibly multiple) - # push <= here, v is the baseline position. - # etc. - # (dviasm is useful to explore this structure.) - # Thus, we use the vertical position at the first time the stack depth - # reaches 3, while at least three "downs" have been executed (excluding - # those popped out (corresponding to the chemformula preamble)), as the - # baseline (the "down" count is necessary to handle xcolor). - down_stack = [0] - self._baseline_v = None - while True: - byte = self.file.read(1)[0] - self._dtable[byte](self, byte) - name = self._dtable[byte].__name__ - if name == "_push": - down_stack.append(down_stack[-1]) - elif name == "_pop": - down_stack.pop() - elif name == "_down": - down_stack[-1] += 1 - if (self._baseline_v is None - and len(getattr(self, "stack", [])) == 3 - and down_stack[-1] >= 4): - self._baseline_v = self.v - if byte == 140: # end of page - return True - if self.state is _dvistate.post_post: # end of file - self.close() - return False - - def _arg(self, nbytes, signed=False): - """ - Read and return an integer argument *nbytes* long. - Signedness is determined by the *signed* keyword. - """ - buf = self.file.read(nbytes) - value = buf[0] - if signed and value >= 0x80: - value = value - 0x100 - for b in buf[1:]: - value = 0x100*value + b - return value - - @_dispatch(min=0, max=127, state=_dvistate.inpage) - def _set_char_immediate(self, char): - self._put_char_real(char) - self.h += self.fonts[self.f]._width_of(char) - - @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',)) - def _set_char(self, char): - self._put_char_real(char) - self.h += self.fonts[self.f]._width_of(char) - - @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4')) - def _set_rule(self, a, b): - self._put_rule_real(a, b) - self.h += b - - @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',)) - def _put_char(self, char): - self._put_char_real(char) - - def _put_char_real(self, char): - font = self.fonts[self.f] - if font._vf is None: - self.text.append(Text(self.h, self.v, font, char, - font._width_of(char))) - else: - scale = font._scale - for x, y, f, g, w in font._vf[char].text: - newf = DviFont(scale=_mul2012(scale, f._scale), - tfm=f._tfm, texname=f.texname, vf=f._vf) - self.text.append(Text(self.h + _mul2012(x, scale), - self.v + _mul2012(y, scale), - newf, g, newf._width_of(g))) - self.boxes.extend([Box(self.h + _mul2012(x, scale), - self.v + _mul2012(y, scale), - _mul2012(a, scale), _mul2012(b, scale)) - for x, y, a, b in font._vf[char].boxes]) - - @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4')) - def _put_rule(self, a, b): - self._put_rule_real(a, b) - - def _put_rule_real(self, a, b): - if a > 0 and b > 0: - self.boxes.append(Box(self.h, self.v, a, b)) - - @_dispatch(138) - def _nop(self, _): - pass - - @_dispatch(139, state=_dvistate.outer, args=('s4',)*11) - def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p): - self.state = _dvistate.inpage - self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0 - self.stack = [] - self.text = [] # list of Text objects - self.boxes = [] # list of Box objects - - @_dispatch(140, state=_dvistate.inpage) - def _eop(self, _): - self.state = _dvistate.outer - del self.h, self.v, self.w, self.x, self.y, self.z, self.stack - - @_dispatch(141, state=_dvistate.inpage) - def _push(self, _): - self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z)) - - @_dispatch(142, state=_dvistate.inpage) - def _pop(self, _): - self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop() - - @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',)) - def _right(self, b): - self.h += b - - @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',)) - def _right_w(self, new_w): - if new_w is not None: - self.w = new_w - self.h += self.w - - @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',)) - def _right_x(self, new_x): - if new_x is not None: - self.x = new_x - self.h += self.x - - @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',)) - def _down(self, a): - self.v += a - - @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',)) - def _down_y(self, new_y): - if new_y is not None: - self.y = new_y - self.v += self.y - - @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',)) - def _down_z(self, new_z): - if new_z is not None: - self.z = new_z - self.v += self.z - - @_dispatch(min=171, max=234, state=_dvistate.inpage) - def _fnt_num_immediate(self, k): - self.f = k - - @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',)) - def _fnt_num(self, new_f): - self.f = new_f - - @_dispatch(min=239, max=242, args=('ulen1',)) - def _xxx(self, datalen): - special = self.file.read(datalen) - _log.debug( - 'Dvi._xxx: encountered special: %s', - ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch - for ch in special])) - - @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1')) - def _fnt_def(self, k, c, s, d, a, l): - self._fnt_def_real(k, c, s, d, a, l) - - def _fnt_def_real(self, k, c, s, d, a, l): - n = self.file.read(a + l) - fontname = n[-l:].decode('ascii') - tfm = _tfmfile(fontname) - if c != 0 and tfm.checksum != 0 and c != tfm.checksum: - raise ValueError('tfm checksum mismatch: %s' % n) - try: - vf = _vffile(fontname) - except FileNotFoundError: - vf = None - self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf) - - @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1')) - def _pre(self, i, num, den, mag, k): - self.file.read(k) # comment in the dvi file - if i != 2: - raise ValueError("Unknown dvi format %d" % i) - if num != 25400000 or den != 7227 * 2**16: - raise ValueError("Nonstandard units in dvi file") - # meaning: TeX always uses those exact values, so it - # should be enough for us to support those - # (There are 72.27 pt to an inch so 7227 pt = - # 7227 * 2**16 sp to 100 in. The numerator is multiplied - # by 10^5 to get units of 10**-7 meters.) - if mag != 1000: - raise ValueError("Nonstandard magnification in dvi file") - # meaning: LaTeX seems to frown on setting \mag, so - # I think we can assume this is constant - self.state = _dvistate.outer - - @_dispatch(248, state=_dvistate.outer) - def _post(self, _): - self.state = _dvistate.post_post - # TODO: actually read the postamble and finale? - # currently post_post just triggers closing the file - - @_dispatch(249) - def _post_post(self, _): - raise NotImplementedError - - @_dispatch(min=250, max=255) - def _malformed(self, offset): - raise ValueError(f"unknown command: byte {250 + offset}") - - -class DviFont: - """ - Encapsulation of a font that a DVI file can refer to. - - This class holds a font's texname and size, supports comparison, - and knows the widths of glyphs in the same units as the AFM file. - There are also internal attributes (for use by dviread.py) that - are *not* used for comparison. - - The size is in Adobe points (converted from TeX points). - - Parameters - ---------- - scale : float - Factor by which the font is scaled from its natural size. - tfm : Tfm - TeX font metrics for this font - texname : bytes - Name of the font as used internally by TeX and friends, as an ASCII - bytestring. This is usually very different from any external font - names; `PsfontsMap` can be used to find the external name of the font. - vf : Vf - A TeX "virtual font" file, or None if this font is not virtual. - - Attributes - ---------- - texname : bytes - size : float - Size of the font in Adobe points, converted from the slightly - smaller TeX points. - widths : list - Widths of glyphs in glyph-space units, typically 1/1000ths of - the point size. - - """ - __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm') - - def __init__(self, scale, tfm, texname, vf): - _api.check_isinstance(bytes, texname=texname) - self._scale = scale - self._tfm = tfm - self.texname = texname - self._vf = vf - self.size = scale * (72.0 / (72.27 * 2**16)) - try: - nchars = max(tfm.width) + 1 - except ValueError: - nchars = 0 - self.widths = [(1000*tfm.width.get(char, 0)) >> 20 - for char in range(nchars)] - - def __eq__(self, other): - return (type(self) == type(other) - and self.texname == other.texname and self.size == other.size) - - def __ne__(self, other): - return not self.__eq__(other) - - def __repr__(self): - return "<{}: {}>".format(type(self).__name__, self.texname) - - def _width_of(self, char): - """Width of char in dvi units.""" - width = self._tfm.width.get(char, None) - if width is not None: - return _mul2012(width, self._scale) - _log.debug('No width for char %d in font %s.', char, self.texname) - return 0 - - def _height_depth_of(self, char): - """Height and depth of char in dvi units.""" - result = [] - for metric, name in ((self._tfm.height, "height"), - (self._tfm.depth, "depth")): - value = metric.get(char, None) - if value is None: - _log.debug('No %s for char %d in font %s', - name, char, self.texname) - result.append(0) - else: - result.append(_mul2012(value, self._scale)) - # cmsyXX (symbols font) glyph 0 ("minus") has a nonzero descent - # so that TeX aligns equations properly - # (https://tex.stackexchange.com/q/526103/) - # but we actually care about the rasterization depth to align - # the dvipng-generated images. - if re.match(br'^cmsy\d+$', self.texname) and char == 0: - result[-1] = 0 - return result - class Vf(Dvi): r""" @@ -714,406 +120,4 @@ def _pre(self, i, x, cs, ds): # cs = checksum, ds = design size -def _mul2012(num1, num2): - """Multiply two numbers in 20.12 fixed point format.""" - # Separated into a function because >> has surprising precedence - return (num1*num2) >> 20 - - -class Tfm: - """ - A TeX Font Metric file. - - This implementation covers only the bare minimum needed by the Dvi class. - - Parameters - ---------- - filename : str or path-like - - Attributes - ---------- - checksum : int - Used for verifying against the dvi file. - design_size : int - Design size of the font (unknown units) - width, height, depth : dict - Dimensions of each character, need to be scaled by the factor - specified in the dvi file. These are dicts because indexing may - not start from 0. - """ - __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth') - - def __init__(self, filename): - _log.debug('opening tfm file %s', filename) - with open(filename, 'rb') as file: - header1 = file.read(24) - lh, bc, ec, nw, nh, nd = struct.unpack('!6H', header1[2:14]) - _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d', - lh, bc, ec, nw, nh, nd) - header2 = file.read(4*lh) - self.checksum, self.design_size = struct.unpack('!2I', header2[:8]) - # there is also encoding information etc. - char_info = file.read(4*(ec-bc+1)) - widths = struct.unpack(f'!{nw}i', file.read(4*nw)) - heights = struct.unpack(f'!{nh}i', file.read(4*nh)) - depths = struct.unpack(f'!{nd}i', file.read(4*nd)) - self.width, self.height, self.depth = {}, {}, {} - for idx, char in enumerate(range(bc, ec+1)): - byte0 = char_info[4*idx] - byte1 = char_info[4*idx+1] - self.width[char] = widths[byte0] - self.height[char] = heights[byte1 >> 4] - self.depth[char] = depths[byte1 & 0xf] - - -PsFont = namedtuple('PsFont', 'texname psname effects encoding filename') - - -class PsfontsMap: - """ - A psfonts.map formatted file, mapping TeX fonts to PS fonts. - - Parameters - ---------- - filename : str or path-like - - Notes - ----- - For historical reasons, TeX knows many Type-1 fonts by different - names than the outside world. (For one thing, the names have to - fit in eight characters.) Also, TeX's native fonts are not Type-1 - but Metafont, which is nontrivial to convert to PostScript except - as a bitmap. While high-quality conversions to Type-1 format exist - and are shipped with modern TeX distributions, we need to know - which Type-1 fonts are the counterparts of which native fonts. For - these reasons a mapping is needed from internal font names to font - file names. - - A texmf tree typically includes mapping files called e.g. - :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`. - The file :file:`psfonts.map` is used by :program:`dvips`, - :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map` - by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding - the 35 PostScript fonts (i.e., have no filename for them, as in - the Times-Bold example above), while the pdf-related files perhaps - only avoid the "Base 14" pdf fonts. But the user may have - configured these files differently. - - Examples - -------- - >>> map = PsfontsMap(find_tex_file('pdftex.map')) - >>> entry = map[b'ptmbo8r'] - >>> entry.texname - b'ptmbo8r' - >>> entry.psname - b'Times-Bold' - >>> entry.encoding - '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc' - >>> entry.effects - {'slant': 0.16700000000000001} - >>> entry.filename - """ - __slots__ = ('_filename', '_unparsed', '_parsed') - - # Create a filename -> PsfontsMap cache, so that calling - # `PsfontsMap(filename)` with the same filename a second time immediately - # returns the same object. - @lru_cache() - def __new__(cls, filename): - self = object.__new__(cls) - self._filename = os.fsdecode(filename) - # Some TeX distributions have enormous pdftex.map files which would - # take hundreds of milliseconds to parse, but it is easy enough to just - # store the unparsed lines (keyed by the first word, which is the - # texname) and parse them on-demand. - with open(filename, 'rb') as file: - self._unparsed = {} - for line in file: - tfmname = line.split(b' ', 1)[0] - self._unparsed.setdefault(tfmname, []).append(line) - self._parsed = {} - return self - - def __getitem__(self, texname): - assert isinstance(texname, bytes) - if texname in self._unparsed: - for line in self._unparsed.pop(texname): - if self._parse_and_cache_line(line): - break - try: - return self._parsed[texname] - except KeyError: - raise LookupError( - f"An associated PostScript font (required by Matplotlib) " - f"could not be found for TeX font {texname.decode('ascii')!r} " - f"in {self._filename!r}; this problem can often be solved by " - f"installing a suitable PostScript font package in your TeX " - f"package manager") from None - - def _parse_and_cache_line(self, line): - """ - Parse a line in the font mapping file. - - The format is (partially) documented at - http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf - https://tug.org/texinfohtml/dvips.html#psfonts_002emap - Each line can have the following fields: - - - tfmname (first, only required field), - - psname (defaults to tfmname, must come immediately after tfmname if - present), - - fontflags (integer, must come immediately after psname if present, - ignored by us), - - special (SlantFont and ExtendFont, only field that is double-quoted), - - fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always - precedes a font, <[ always precedes an encoding, < can precede either - but then an encoding file must have extension .enc; < and << also - request different font subsetting behaviors but we ignore that; < can - be separated from the filename by whitespace). - - special, fontfile, and encodingfile can appear in any order. - """ - # If the map file specifies multiple encodings for a font, we - # follow pdfTeX in choosing the last one specified. Such - # entries are probably mistakes but they have occurred. - # https://tex.stackexchange.com/q/10826/ - - if not line or line.startswith((b" ", b"%", b"*", b";", b"#")): - return - tfmname = basename = special = encodingfile = fontfile = None - is_subsetted = is_t1 = is_truetype = False - matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line) - for match in matches: - quoted, unquoted = match.groups() - if unquoted: - if unquoted.startswith(b"<<"): # font - fontfile = unquoted[2:] - elif unquoted.startswith(b"<["): # encoding - encodingfile = unquoted[2:] - elif unquoted.startswith(b"<"): # font or encoding - word = ( - # foo - unquoted[1:] - # < by itself => read the next word - or next(filter(None, next(matches).groups()))) - if word.endswith(b".enc"): - encodingfile = word - else: - fontfile = word - is_subsetted = True - elif tfmname is None: - tfmname = unquoted - elif basename is None: - basename = unquoted - elif quoted: - special = quoted - effects = {} - if special: - words = reversed(special.split()) - for word in words: - if word == b"SlantFont": - effects["slant"] = float(next(words)) - elif word == b"ExtendFont": - effects["extend"] = float(next(words)) - - # Verify some properties of the line that would cause it to be ignored - # otherwise. - if fontfile is not None: - if fontfile.endswith((b".ttf", b".ttc")): - is_truetype = True - elif not fontfile.endswith(b".otf"): - is_t1 = True - elif basename is not None: - is_t1 = True - if is_truetype and is_subsetted and encodingfile is None: - return - if not is_t1 and ("slant" in effects or "extend" in effects): - return - if abs(effects.get("slant", 0)) > 1: - return - if abs(effects.get("extend", 0)) > 2: - return - - if basename is None: - basename = tfmname - if encodingfile is not None: - encodingfile = _find_tex_file(encodingfile) - if fontfile is not None: - fontfile = _find_tex_file(fontfile) - self._parsed[tfmname] = PsFont( - texname=tfmname, psname=basename, effects=effects, - encoding=encodingfile, filename=fontfile) - return True - - -def _parse_enc(path): - r""" - Parse a \*.enc file referenced from a psfonts.map style file. - - The format supported by this function is a tiny subset of PostScript. - - Parameters - ---------- - path : os.PathLike - - Returns - ------- - list - The nth entry of the list is the PostScript glyph name of the nth - glyph. - """ - no_comments = re.sub("%.*", "", Path(path).read_text(encoding="ascii")) - array = re.search(r"(?s)\[(.*)\]", no_comments).group(1) - lines = [line for line in array.split() if line] - if all(line.startswith("/") for line in lines): - return [line[1:] for line in lines] - else: - raise ValueError( - "Failed to parse {} as Postscript encoding".format(path)) - - -class _LuatexKpsewhich: - @lru_cache() # A singleton. - def __new__(cls): - self = object.__new__(cls) - self._proc = self._new_proc() - return self - - def _new_proc(self): - return subprocess.Popen( - ["luatex", "--luaonly", - str(cbook._get_data_path("kpsewhich.lua"))], - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - - def search(self, filename): - if self._proc.poll() is not None: # Dead, restart it. - self._proc = self._new_proc() - self._proc.stdin.write(os.fsencode(filename) + b"\n") - self._proc.stdin.flush() - out = self._proc.stdout.readline().rstrip() - return None if out == b"nil" else os.fsdecode(out) - - -@lru_cache() -@_api.delete_parameter("3.5", "format") -def _find_tex_file(filename, format=None): - """ - Find a file in the texmf tree using kpathsea_. - - The kpathsea library, provided by most existing TeX distributions, both - on Unix-like systems and on Windows (MikTeX), is invoked via a long-lived - luatex process if luatex is installed, or via kpsewhich otherwise. - - .. _kpathsea: https://www.tug.org/kpathsea/ - - Parameters - ---------- - filename : str or path-like - format : str or bytes - Used as the value of the ``--format`` option to :program:`kpsewhich`. - Could be e.g. 'tfm' or 'vf' to limit the search to that type of files. - Deprecated. - - Raises - ------ - FileNotFoundError - If the file is not found. - """ - - # we expect these to always be ascii encoded, but use utf-8 - # out of caution - if isinstance(filename, bytes): - filename = filename.decode('utf-8', errors='replace') - if isinstance(format, bytes): - format = format.decode('utf-8', errors='replace') - - try: - lk = _LuatexKpsewhich() - except FileNotFoundError: - lk = None # Fallback to directly calling kpsewhich, as below. - - if lk and format is None: - path = lk.search(filename) - - else: - if os.name == 'nt': - # On Windows only, kpathsea can use utf-8 for cmd args and output. - # The `command_line_encoding` environment variable is set to force - # it to always use utf-8 encoding. See Matplotlib issue #11848. - kwargs = {'env': {**os.environ, 'command_line_encoding': 'utf-8'}, - 'encoding': 'utf-8'} - else: # On POSIX, run through the equivalent of os.fsdecode(). - kwargs = {'encoding': sys.getfilesystemencoding(), - 'errors': 'surrogateescape'} - - cmd = ['kpsewhich'] - if format is not None: - cmd += ['--format=' + format] - cmd += [filename] - try: - path = (cbook._check_and_log_subprocess(cmd, _log, **kwargs) - .rstrip('\n')) - except (FileNotFoundError, RuntimeError): - path = None - - if path: - return path - else: - raise FileNotFoundError( - f"Matplotlib's TeX implementation searched for a file named " - f"{filename!r} in your texmf tree, but could not find it") - - -# After the deprecation period elapses, delete this shim and rename -# _find_tex_file to find_tex_file everywhere. -@_api.delete_parameter("3.5", "format") -def find_tex_file(filename, format=None): - try: - return (_find_tex_file(filename, format) if format is not None else - _find_tex_file(filename)) - except FileNotFoundError as exc: - _api.warn_deprecated( - "3.6", message=f"{exc.args[0]}; in the future, this will raise a " - f"FileNotFoundError.") - return "" - - -find_tex_file.__doc__ = _find_tex_file.__doc__ - - -@lru_cache() -def _fontfile(cls, suffix, texname): - return cls(_find_tex_file(texname + suffix)) - - -_tfmfile = partial(_fontfile, Tfm, ".tfm") _vffile = partial(_fontfile, Vf, ".vf") - - -if __name__ == '__main__': - from argparse import ArgumentParser - import itertools - - parser = ArgumentParser() - parser.add_argument("filename") - parser.add_argument("dpi", nargs="?", type=float, default=None) - args = parser.parse_args() - with Dvi(args.filename, args.dpi) as dvi: - fontmap = PsfontsMap(_find_tex_file('pdftex.map')) - for page in dvi: - print(f"=== new page === " - f"(w: {page.width}, h: {page.height}, d: {page.descent})") - for font, group in itertools.groupby( - page.text, lambda text: text.font): - print(f"font: {font.texname.decode('latin-1')!r}\t" - f"scale: {font._scale / 2 ** 20}") - print("x", "y", "glyph", "chr", "w", "(glyphs)", sep="\t") - for text in group: - print(text.x, text.y, text.glyph, - chr(text.glyph) if chr(text.glyph).isprintable() - else ".", - text.width, sep="\t") - if page.boxes: - print("x", "y", "w", "h", "", "(boxes)", sep="\t") - for x, y, w, h in page.boxes: - print(x, y, w, h, sep="\t") diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index 9eb0d76a1e52..dd2f83c28fad 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -36,6 +36,7 @@ from matplotlib.font_manager import findfont, get_font from matplotlib.afm import AFM import matplotlib.type1font as type1font +import matplotlib._dviread as _dviread import matplotlib.dviread as dviread from matplotlib.ft2font import (FIXED_WIDTH, ITALIC, LOAD_NO_SCALE, LOAD_NO_HINTING, KERNING_UNFITTED, FT2Font) @@ -891,7 +892,8 @@ def dviFontName(self, dvifont): if dvi_info is not None: return dvi_info.pdfname - tex_font_map = dviread.PsfontsMap(dviread._find_tex_file('pdftex.map')) + tex_font_map = _dviread.PsfontsMap( + _dviread._find_tex_file('pdftex.map')) psfont = tex_font_map[dvifont.texname] if psfont.filename is None: raise ValueError( @@ -966,7 +968,7 @@ def _embedTeXFont(self, fontinfo): fontdict['Encoding'] = { 'Type': Name('Encoding'), 'Differences': [ - 0, *map(Name, dviread._parse_enc(fontinfo.encodingfile))], + 0, *map(Name, _dviread._parse_enc(fontinfo.encodingfile))], } # If no file is specified, stop short diff --git a/lib/matplotlib/backends/backend_ps.py b/lib/matplotlib/backends/backend_ps.py index 33c41202dbb4..1bdb2ab1b2ab 100644 --- a/lib/matplotlib/backends/backend_ps.py +++ b/lib/matplotlib/backends/backend_ps.py @@ -33,7 +33,7 @@ from matplotlib.mathtext import MathTextParser from matplotlib._mathtext_data import uni2type1 from matplotlib.path import Path -from matplotlib.texmanager import TexManager +from matplotlib._texmanager import TexManager from matplotlib.transforms import Affine2D from matplotlib.backends.backend_mixed import MixedModeRenderer from . import _backend_pdf_ps diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index 7f90a13f1086..9b831acba361 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -18,19 +18,15 @@ """ from collections import namedtuple +from functools import lru_cache, partial import enum -from functools import lru_cache, partial, wraps import logging import os -from pathlib import Path import re -import struct -import subprocess -import sys import numpy as np -from matplotlib import _api, cbook +from matplotlib import _api, cbook, _dviread, _dviread_vf _log = logging.getLogger(__name__) @@ -56,123 +52,6 @@ _dvistate = enum.Enum('DviState', 'pre outer inpage post_post finale') -# The marks on a page consist of text and boxes. A page also has dimensions. -Page = namedtuple('Page', 'text boxes height width descent') -Text = namedtuple('Text', 'x y font glyph width') -Box = namedtuple('Box', 'x y height width') - - -# Opcode argument parsing -# -# Each of the following functions takes a Dvi object and delta, -# which is the difference between the opcode and the minimum opcode -# with the same meaning. Dvi opcodes often encode the number of -# argument bytes in this delta. - -def _arg_raw(dvi, delta): - """Return *delta* without reading anything more from the dvi file.""" - return delta - - -def _arg(nbytes, signed, dvi, _): - """ - Read *nbytes* bytes, returning the bytes interpreted as a signed integer - if *signed* is true, unsigned otherwise. - """ - return dvi._arg(nbytes, signed) - - -def _arg_slen(dvi, delta): - """ - Read *delta* bytes, returning None if *delta* is zero, and the bytes - interpreted as a signed integer otherwise. - """ - if delta == 0: - return None - return dvi._arg(delta, True) - - -def _arg_slen1(dvi, delta): - """ - Read *delta*+1 bytes, returning the bytes interpreted as signed. - """ - return dvi._arg(delta + 1, True) - - -def _arg_ulen1(dvi, delta): - """ - Read *delta*+1 bytes, returning the bytes interpreted as unsigned. - """ - return dvi._arg(delta + 1, False) - - -def _arg_olen1(dvi, delta): - """ - Read *delta*+1 bytes, returning the bytes interpreted as - unsigned integer for 0<=*delta*<3 and signed if *delta*==3. - """ - return dvi._arg(delta + 1, delta == 3) - - -_arg_mapping = dict(raw=_arg_raw, - u1=partial(_arg, 1, False), - u4=partial(_arg, 4, False), - s4=partial(_arg, 4, True), - slen=_arg_slen, - olen1=_arg_olen1, - slen1=_arg_slen1, - ulen1=_arg_ulen1) - - -def _dispatch(table, min, max=None, state=None, args=('raw',)): - """ - Decorator for dispatch by opcode. Sets the values in *table* - from *min* to *max* to this method, adds a check that the Dvi state - matches *state* if not None, reads arguments from the file according - to *args*. - - Parameters - ---------- - table : dict[int, callable] - The dispatch table to be filled in. - - min, max : int - Range of opcodes that calls the registered function; *max* defaults to - *min*. - - state : _dvistate, optional - State of the Dvi object in which these opcodes are allowed. - - args : list[str], default: ['raw'] - Sequence of argument specifications: - - - 'raw': opcode minus minimum - - 'u1': read one unsigned byte - - 'u4': read four bytes, treat as an unsigned number - - 's4': read four bytes, treat as a signed number - - 'slen': read (opcode - minimum) bytes, treat as signed - - 'slen1': read (opcode - minimum + 1) bytes, treat as signed - - 'ulen1': read (opcode - minimum + 1) bytes, treat as unsigned - - 'olen1': read (opcode - minimum + 1) bytes, treat as unsigned - if under four bytes, signed if four bytes - """ - def decorate(method): - get_args = [_arg_mapping[x] for x in args] - - @wraps(method) - def wrapper(self, byte): - if state is not None and self.state != state: - raise ValueError("state precondition failed") - return method(self, *[f(self, byte-min) for f in get_args]) - if max is None: - table[min] = wrapper - else: - for i in range(min, max+1): - assert table[i] is None - table[i] = wrapper - return wrapper - return decorate - class Dvi: """ @@ -191,7 +70,7 @@ class Dvi: """ # dispatch table _dtable = [None] * 256 - _dispatch = partial(_dispatch, _dtable) + _dispatch = partial(_dviread._dispatch, _dtable) def __init__(self, filename, dpi): """ @@ -250,7 +129,7 @@ def _output(self): minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf maxy_pure = -np.inf for elt in self.text + self.boxes: - if isinstance(elt, Box): + if isinstance(elt, _dviread.Box): x, y, h, w = elt e = 0 # zero depth else: # glyph @@ -266,25 +145,26 @@ def _output(self): self._baseline_v = None if not self.text and not self.boxes: # Avoid infs/nans from inf+/-inf. - return Page(text=[], boxes=[], width=0, height=0, descent=0) + return _dviread.Page(text=[], boxes=[], width=0, + height=0, descent=0) if self.dpi is None: # special case for ease of debugging: output raw dvi coordinates - return Page(text=self.text, boxes=self.boxes, - width=maxx-minx, height=maxy_pure-miny, - descent=maxy-maxy_pure) + return _dviread.Page(text=self.text, boxes=self.boxes, + width=maxx-minx, height=maxy_pure-miny, + descent=maxy-maxy_pure) # convert from TeX's "scaled points" to dpi units d = self.dpi / (72.27 * 2**16) descent = (maxy - maxy_pure) * d - text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d) + text = [_dviread.Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d) for (x, y, f, g, w) in self.text] - boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d) + boxes = [_dviread.Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d) for (x, y, h, w) in self.boxes] - return Page(text=text, boxes=boxes, width=(maxx-minx)*d, - height=(maxy_pure-miny)*d, descent=descent) + return _dviread.Page(text=text, boxes=boxes, width=(maxx-minx)*d, + height=(maxy_pure-miny)*d, descent=descent) def _read(self): """ @@ -366,20 +246,24 @@ def _put_char(self, char): def _put_char_real(self, char): font = self.fonts[self.f] if font._vf is None: - self.text.append(Text(self.h, self.v, font, char, - font._width_of(char))) + self.text.append(_dviread.Text(self.h, self.v, font, char, + font._width_of(char))) else: scale = font._scale for x, y, f, g, w in font._vf[char].text: - newf = DviFont(scale=_mul2012(scale, f._scale), - tfm=f._tfm, texname=f.texname, vf=f._vf) - self.text.append(Text(self.h + _mul2012(x, scale), - self.v + _mul2012(y, scale), - newf, g, newf._width_of(g))) - self.boxes.extend([Box(self.h + _mul2012(x, scale), - self.v + _mul2012(y, scale), - _mul2012(a, scale), _mul2012(b, scale)) - for x, y, a, b in font._vf[char].boxes]) + newf = _dviread.DviFont( + scale=_dviread._mul2012(scale, f._scale), + tfm=f._tfm, texname=f.texname, vf=f._vf) + self.text.append(_dviread.Text( + self.h + _dviread._mul2012(x, scale), + self.v + _dviread._mul2012(y, scale), + newf, g, newf._width_of(g))) + self.boxes.extend( + [_dviread.Box(self.h + _dviread._mul2012(x, scale), + self.v + _dviread._mul2012(y, scale), + _dviread._mul2012(a, scale), + _dviread._mul2012(b, scale)) + for x, y, a, b in font._vf[char].boxes]) @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4')) def _put_rule(self, a, b): @@ -387,7 +271,7 @@ def _put_rule(self, a, b): def _put_rule_real(self, a, b): if a > 0 and b > 0: - self.boxes.append(Box(self.h, self.v, a, b)) + self.boxes.append(_dviread.Box(self.h, self.v, a, b)) @_dispatch(138) def _nop(self, _): @@ -469,14 +353,14 @@ def _fnt_def(self, k, c, s, d, a, l): def _fnt_def_real(self, k, c, s, d, a, l): n = self.file.read(a + l) fontname = n[-l:].decode('ascii') - tfm = _tfmfile(fontname) + tfm = _dviread._tfmfile(fontname) if c != 0 and tfm.checksum != 0 and c != tfm.checksum: raise ValueError('tfm checksum mismatch: %s' % n) try: - vf = _vffile(fontname) + vf = _dviread_vf._vffile(fontname) except FileNotFoundError: vf = None - self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf) + self.fonts[k] = _dviread.DviFont(scale=s, tfm=tfm, texname=n, vf=vf) @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1')) def _pre(self, i, num, den, mag, k): @@ -511,261 +395,6 @@ def _malformed(self, offset): raise ValueError(f"unknown command: byte {250 + offset}") -class DviFont: - """ - Encapsulation of a font that a DVI file can refer to. - - This class holds a font's texname and size, supports comparison, - and knows the widths of glyphs in the same units as the AFM file. - There are also internal attributes (for use by dviread.py) that - are *not* used for comparison. - - The size is in Adobe points (converted from TeX points). - - Parameters - ---------- - scale : float - Factor by which the font is scaled from its natural size. - tfm : Tfm - TeX font metrics for this font - texname : bytes - Name of the font as used internally by TeX and friends, as an ASCII - bytestring. This is usually very different from any external font - names; `PsfontsMap` can be used to find the external name of the font. - vf : Vf - A TeX "virtual font" file, or None if this font is not virtual. - - Attributes - ---------- - texname : bytes - size : float - Size of the font in Adobe points, converted from the slightly - smaller TeX points. - widths : list - Widths of glyphs in glyph-space units, typically 1/1000ths of - the point size. - - """ - __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm') - - def __init__(self, scale, tfm, texname, vf): - _api.check_isinstance(bytes, texname=texname) - self._scale = scale - self._tfm = tfm - self.texname = texname - self._vf = vf - self.size = scale * (72.0 / (72.27 * 2**16)) - try: - nchars = max(tfm.width) + 1 - except ValueError: - nchars = 0 - self.widths = [(1000*tfm.width.get(char, 0)) >> 20 - for char in range(nchars)] - - def __eq__(self, other): - return (type(self) == type(other) - and self.texname == other.texname and self.size == other.size) - - def __ne__(self, other): - return not self.__eq__(other) - - def __repr__(self): - return "<{}: {}>".format(type(self).__name__, self.texname) - - def _width_of(self, char): - """Width of char in dvi units.""" - width = self._tfm.width.get(char, None) - if width is not None: - return _mul2012(width, self._scale) - _log.debug('No width for char %d in font %s.', char, self.texname) - return 0 - - def _height_depth_of(self, char): - """Height and depth of char in dvi units.""" - result = [] - for metric, name in ((self._tfm.height, "height"), - (self._tfm.depth, "depth")): - value = metric.get(char, None) - if value is None: - _log.debug('No %s for char %d in font %s', - name, char, self.texname) - result.append(0) - else: - result.append(_mul2012(value, self._scale)) - # cmsyXX (symbols font) glyph 0 ("minus") has a nonzero descent - # so that TeX aligns equations properly - # (https://tex.stackexchange.com/q/526103/) - # but we actually care about the rasterization depth to align - # the dvipng-generated images. - if re.match(br'^cmsy\d+$', self.texname) and char == 0: - result[-1] = 0 - return result - - -class Vf(Dvi): - r""" - A virtual font (\*.vf file) containing subroutines for dvi files. - - Parameters - ---------- - filename : str or path-like - - Notes - ----- - The virtual font format is a derivative of dvi: - http://mirrors.ctan.org/info/knuth/virtual-fonts - This class reuses some of the machinery of `Dvi` - but replaces the `_read` loop and dispatch mechanism. - - Examples - -------- - :: - - vf = Vf(filename) - glyph = vf[code] - glyph.text, glyph.boxes, glyph.width - """ - - def __init__(self, filename): - super().__init__(filename, 0) - try: - self._first_font = None - self._chars = {} - self._read() - finally: - self.close() - - def __getitem__(self, code): - return self._chars[code] - - def _read(self): - """ - Read one page from the file. Return True if successful, - False if there were no more pages. - """ - packet_char, packet_ends = None, None - packet_len, packet_width = None, None - while True: - byte = self.file.read(1)[0] - # If we are in a packet, execute the dvi instructions - if self.state is _dvistate.inpage: - byte_at = self.file.tell()-1 - if byte_at == packet_ends: - self._finalize_packet(packet_char, packet_width) - packet_len, packet_char, packet_width = None, None, None - # fall through to out-of-packet code - elif byte_at > packet_ends: - raise ValueError("Packet length mismatch in vf file") - else: - if byte in (139, 140) or byte >= 243: - raise ValueError( - "Inappropriate opcode %d in vf file" % byte) - Dvi._dtable[byte](self, byte) - continue - - # We are outside a packet - if byte < 242: # a short packet (length given by byte) - packet_len = byte - packet_char, packet_width = self._arg(1), self._arg(3) - packet_ends = self._init_packet(byte) - self.state = _dvistate.inpage - elif byte == 242: # a long packet - packet_len, packet_char, packet_width = \ - [self._arg(x) for x in (4, 4, 4)] - self._init_packet(packet_len) - elif 243 <= byte <= 246: - k = self._arg(byte - 242, byte == 246) - c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)] - self._fnt_def_real(k, c, s, d, a, l) - if self._first_font is None: - self._first_font = k - elif byte == 247: # preamble - i, k = self._arg(1), self._arg(1) - x = self.file.read(k) - cs, ds = self._arg(4), self._arg(4) - self._pre(i, x, cs, ds) - elif byte == 248: # postamble (just some number of 248s) - break - else: - raise ValueError("Unknown vf opcode %d" % byte) - - def _init_packet(self, pl): - if self.state != _dvistate.outer: - raise ValueError("Misplaced packet in vf file") - self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0 - self.stack, self.text, self.boxes = [], [], [] - self.f = self._first_font - return self.file.tell() + pl - - def _finalize_packet(self, packet_char, packet_width): - self._chars[packet_char] = Page( - text=self.text, boxes=self.boxes, width=packet_width, - height=None, descent=None) - self.state = _dvistate.outer - - def _pre(self, i, x, cs, ds): - if self.state is not _dvistate.pre: - raise ValueError("pre command in middle of vf file") - if i != 202: - raise ValueError("Unknown vf format %d" % i) - if len(x): - _log.debug('vf file comment: %s', x) - self.state = _dvistate.outer - # cs = checksum, ds = design size - - -def _mul2012(num1, num2): - """Multiply two numbers in 20.12 fixed point format.""" - # Separated into a function because >> has surprising precedence - return (num1*num2) >> 20 - - -class Tfm: - """ - A TeX Font Metric file. - - This implementation covers only the bare minimum needed by the Dvi class. - - Parameters - ---------- - filename : str or path-like - - Attributes - ---------- - checksum : int - Used for verifying against the dvi file. - design_size : int - Design size of the font (unknown units) - width, height, depth : dict - Dimensions of each character, need to be scaled by the factor - specified in the dvi file. These are dicts because indexing may - not start from 0. - """ - __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth') - - def __init__(self, filename): - _log.debug('opening tfm file %s', filename) - with open(filename, 'rb') as file: - header1 = file.read(24) - lh, bc, ec, nw, nh, nd = struct.unpack('!6H', header1[2:14]) - _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d', - lh, bc, ec, nw, nh, nd) - header2 = file.read(4*lh) - self.checksum, self.design_size = struct.unpack('!2I', header2[:8]) - # there is also encoding information etc. - char_info = file.read(4*(ec-bc+1)) - widths = struct.unpack(f'!{nw}i', file.read(4*nw)) - heights = struct.unpack(f'!{nh}i', file.read(4*nh)) - depths = struct.unpack(f'!{nd}i', file.read(4*nd)) - self.width, self.height, self.depth = {}, {}, {} - for idx, char in enumerate(range(bc, ec+1)): - byte0 = char_info[4*idx] - byte1 = char_info[4*idx+1] - self.width[char] = widths[byte0] - self.height[char] = heights[byte1 >> 4] - self.depth[char] = depths[byte1 & 0xf] - - PsFont = namedtuple('PsFont', 'texname psname effects encoding filename') @@ -937,159 +566,15 @@ def _parse_and_cache_line(self, line): if basename is None: basename = tfmname if encodingfile is not None: - encodingfile = _find_tex_file(encodingfile) + encodingfile = _dviread._find_tex_file(encodingfile) if fontfile is not None: - fontfile = _find_tex_file(fontfile) + fontfile = _dviread._find_tex_file(fontfile) self._parsed[tfmname] = PsFont( texname=tfmname, psname=basename, effects=effects, encoding=encodingfile, filename=fontfile) return True -def _parse_enc(path): - r""" - Parse a \*.enc file referenced from a psfonts.map style file. - - The format supported by this function is a tiny subset of PostScript. - - Parameters - ---------- - path : os.PathLike - - Returns - ------- - list - The nth entry of the list is the PostScript glyph name of the nth - glyph. - """ - no_comments = re.sub("%.*", "", Path(path).read_text(encoding="ascii")) - array = re.search(r"(?s)\[(.*)\]", no_comments).group(1) - lines = [line for line in array.split() if line] - if all(line.startswith("/") for line in lines): - return [line[1:] for line in lines] - else: - raise ValueError( - "Failed to parse {} as Postscript encoding".format(path)) - - -class _LuatexKpsewhich: - @lru_cache() # A singleton. - def __new__(cls): - self = object.__new__(cls) - self._proc = self._new_proc() - return self - - def _new_proc(self): - return subprocess.Popen( - ["luatex", "--luaonly", - str(cbook._get_data_path("kpsewhich.lua"))], - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - - def search(self, filename): - if self._proc.poll() is not None: # Dead, restart it. - self._proc = self._new_proc() - self._proc.stdin.write(os.fsencode(filename) + b"\n") - self._proc.stdin.flush() - out = self._proc.stdout.readline().rstrip() - return None if out == b"nil" else os.fsdecode(out) - - -@lru_cache() -@_api.delete_parameter("3.5", "format") -def _find_tex_file(filename, format=None): - """ - Find a file in the texmf tree using kpathsea_. - - The kpathsea library, provided by most existing TeX distributions, both - on Unix-like systems and on Windows (MikTeX), is invoked via a long-lived - luatex process if luatex is installed, or via kpsewhich otherwise. - - .. _kpathsea: https://www.tug.org/kpathsea/ - - Parameters - ---------- - filename : str or path-like - format : str or bytes - Used as the value of the ``--format`` option to :program:`kpsewhich`. - Could be e.g. 'tfm' or 'vf' to limit the search to that type of files. - Deprecated. - - Raises - ------ - FileNotFoundError - If the file is not found. - """ - - # we expect these to always be ascii encoded, but use utf-8 - # out of caution - if isinstance(filename, bytes): - filename = filename.decode('utf-8', errors='replace') - if isinstance(format, bytes): - format = format.decode('utf-8', errors='replace') - - try: - lk = _LuatexKpsewhich() - except FileNotFoundError: - lk = None # Fallback to directly calling kpsewhich, as below. - - if lk and format is None: - path = lk.search(filename) - - else: - if os.name == 'nt': - # On Windows only, kpathsea can use utf-8 for cmd args and output. - # The `command_line_encoding` environment variable is set to force - # it to always use utf-8 encoding. See Matplotlib issue #11848. - kwargs = {'env': {**os.environ, 'command_line_encoding': 'utf-8'}, - 'encoding': 'utf-8'} - else: # On POSIX, run through the equivalent of os.fsdecode(). - kwargs = {'encoding': sys.getfilesystemencoding(), - 'errors': 'surrogateescape'} - - cmd = ['kpsewhich'] - if format is not None: - cmd += ['--format=' + format] - cmd += [filename] - try: - path = (cbook._check_and_log_subprocess(cmd, _log, **kwargs) - .rstrip('\n')) - except (FileNotFoundError, RuntimeError): - path = None - - if path: - return path - else: - raise FileNotFoundError( - f"Matplotlib's TeX implementation searched for a file named " - f"{filename!r} in your texmf tree, but could not find it") - - -# After the deprecation period elapses, delete this shim and rename -# _find_tex_file to find_tex_file everywhere. -@_api.delete_parameter("3.5", "format") -def find_tex_file(filename, format=None): - try: - return (_find_tex_file(filename, format) if format is not None else - _find_tex_file(filename)) - except FileNotFoundError as exc: - _api.warn_deprecated( - "3.6", message=f"{exc.args[0]}; in the future, this will raise a " - f"FileNotFoundError.") - return "" - - -find_tex_file.__doc__ = _find_tex_file.__doc__ - - -@lru_cache() -def _fontfile(cls, suffix, texname): - return cls(_find_tex_file(texname + suffix)) - - -_tfmfile = partial(_fontfile, Tfm, ".tfm") -_vffile = partial(_fontfile, Vf, ".vf") - - if __name__ == '__main__': from argparse import ArgumentParser import itertools @@ -1099,7 +584,7 @@ def _fontfile(cls, suffix, texname): parser.add_argument("dpi", nargs="?", type=float, default=None) args = parser.parse_args() with Dvi(args.filename, args.dpi) as dvi: - fontmap = PsfontsMap(_find_tex_file('pdftex.map')) + fontmap = PsfontsMap(_dviread._find_tex_file('pdftex.map')) for page in dvi: print(f"=== new page === " f"(w: {page.width}, h: {page.height}, d: {page.descent})") diff --git a/lib/matplotlib/mpl-data/kpsewhich.lua b/lib/matplotlib/mpl-data/kpsewhich.lua index be07b5858ae8..4ea2eb1d9995 100644 --- a/lib/matplotlib/mpl-data/kpsewhich.lua +++ b/lib/matplotlib/mpl-data/kpsewhich.lua @@ -1,3 +1,3 @@ --- see dviread._LuatexKpsewhich +-- see _dviread._LuatexKpsewhich kpse.set_program_name("tex") while true do print(kpse.lookup(io.read():gsub("\r", ""))); io.flush(); end diff --git a/lib/matplotlib/testing/__init__.py b/lib/matplotlib/testing/__init__.py index 754277c41f43..9f279cd62d5c 100644 --- a/lib/matplotlib/testing/__init__.py +++ b/lib/matplotlib/testing/__init__.py @@ -79,7 +79,7 @@ def _check_for_pgf(texsystem): def _has_tex_package(package): try: - mpl.dviread._find_tex_file(f"{package}.sty") + mpl._dviread._find_tex_file(f"{package}.sty") return True except FileNotFoundError: return False diff --git a/lib/matplotlib/tests/test_dviread.py b/lib/matplotlib/tests/test_dviread.py index 7e10975f44d5..f4e54007e9b6 100644 --- a/lib/matplotlib/tests/test_dviread.py +++ b/lib/matplotlib/tests/test_dviread.py @@ -3,14 +3,19 @@ import shutil import matplotlib.dviread as dr +import matplotlib._dviread as _dr import pytest def test_PsfontsMap(monkeypatch): - monkeypatch.setattr(dr, '_find_tex_file', lambda x: x) + monkeypatch.setattr(_dr, '_find_tex_file', lambda x: x) filename = str(Path(__file__).parent / 'baseline_images/dviread/test.map') - fontmap = dr.PsfontsMap(filename) + fontmap = _dr.PsfontsMap(filename) + fontmap2 = dr.get_tex_font_map(filename) + + assert fontmap == fontmap2 + # Check all properties of a few fonts for n in [1, 2, 3, 4, 5]: key = b'TeXfont%d' % n diff --git a/lib/matplotlib/tests/test_texmanager.py b/lib/matplotlib/tests/test_texmanager.py index 5f98112043bb..503ccf0b6d05 100644 --- a/lib/matplotlib/tests/test_texmanager.py +++ b/lib/matplotlib/tests/test_texmanager.py @@ -2,7 +2,7 @@ import re import matplotlib.pyplot as plt -from matplotlib.texmanager import TexManager +from matplotlib._texmanager import TexManager import pytest diff --git a/lib/matplotlib/tests/test_usetex.py b/lib/matplotlib/tests/test_usetex.py index b726f9c26cfb..770599a1a571 100644 --- a/lib/matplotlib/tests/test_usetex.py +++ b/lib/matplotlib/tests/test_usetex.py @@ -4,7 +4,7 @@ import pytest import matplotlib as mpl -from matplotlib import dviread +from matplotlib import _dviread from matplotlib.testing import _has_tex_package from matplotlib.testing.decorators import check_figures_equal, image_comparison import matplotlib.pyplot as plt @@ -129,8 +129,8 @@ def test_usetex_with_underscore(): def test_missing_psfont(fmt, monkeypatch): """An error is raised if a TeX font lacks a Type-1 equivalent""" monkeypatch.setattr( - dviread.PsfontsMap, '__getitem__', - lambda self, k: dviread.PsFont( + _dviread.PsfontsMap, '__getitem__', + lambda self, k: _dviread.PsFont( texname='texfont', psname='Some Font', effects=None, encoding=None, filename=None)) mpl.rcParams['text.usetex'] = True diff --git a/lib/matplotlib/texmanager.py b/lib/matplotlib/texmanager.py new file mode 100644 index 000000000000..e9c910192470 --- /dev/null +++ b/lib/matplotlib/texmanager.py @@ -0,0 +1,5 @@ +from matplotlib._texmanager import * # noqa: F401, F403 +from matplotlib import _api +_api.warn_deprecated( + "3.6", message="The module %(name)s is deprecated since %(since)s.", + name=f"{__name__}") diff --git a/lib/matplotlib/textpath.py b/lib/matplotlib/textpath.py index 5ef56e4be885..2e1493015047 100644 --- a/lib/matplotlib/textpath.py +++ b/lib/matplotlib/textpath.py @@ -5,7 +5,7 @@ import numpy as np -from matplotlib import _text_helpers, dviread, font_manager +from matplotlib import _text_helpers, dviread, font_manager, _dviread from matplotlib.font_manager import FontProperties, get_font from matplotlib.ft2font import LOAD_NO_HINTING, LOAD_TARGET_LIGHT from matplotlib.mathtext import MathTextParser @@ -218,7 +218,7 @@ def get_glyphs_mathtext(self, prop, s, glyph_map=None, def get_texmanager(self): """Return the cached `~.texmanager.TexManager` instance.""" if self._texmanager is None: - from matplotlib.texmanager import TexManager + from matplotlib._texmanager import TexManager self._texmanager = TexManager() return self._texmanager @@ -279,7 +279,7 @@ def get_glyphs_tex(self, prop, s, glyph_map=None, @staticmethod @functools.lru_cache(50) def _get_ps_font_and_encoding(texname): - tex_font_map = dviread.PsfontsMap(dviread._find_tex_file('pdftex.map')) + tex_font_map = dviread.get_tex_font_map('pdftex.map') psfont = tex_font_map[texname] if psfont.filename is None: raise ValueError( @@ -296,7 +296,7 @@ def _get_ps_font_and_encoding(texname): # FT_Get_Name_Index/get_name_index), and load the glyph using # FT_Load_Glyph/load_glyph. (That charmap has a coverage at least # as good as, and possibly better than, the native charmaps.) - enc = dviread._parse_enc(psfont.encoding) + enc = _dviread._parse_enc(psfont.encoding) else: # If psfonts.map specifies no encoding, the indices directly # map to the font's "native" charmap; so don't use the From 478baf91f16c39925a8117bd455b87917c5c2b8c Mon Sep 17 00:00:00 2001 From: Oscar Gustafsson Date: Sat, 8 Jan 2022 12:55:59 +0100 Subject: [PATCH 6/7] Renamed _dviread_vf to _vf --- lib/matplotlib/{_dviread_vf.py => _vf.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lib/matplotlib/{_dviread_vf.py => _vf.py} (100%) diff --git a/lib/matplotlib/_dviread_vf.py b/lib/matplotlib/_vf.py similarity index 100% rename from lib/matplotlib/_dviread_vf.py rename to lib/matplotlib/_vf.py From 92ac8f67516f522dbb8eaac13a07d3c78209717f Mon Sep 17 00:00:00 2001 From: Oscar Gustafsson Date: Sat, 8 Jan 2022 13:29:53 +0100 Subject: [PATCH 7/7] Added keyword argument to PsfontsMap to allow find_tex_file --- .../deprecations/22127-OG.rst | 22 +++ lib/matplotlib/_dviread.py | 181 ------------------ lib/matplotlib/backends/backend_pdf.py | 7 +- lib/matplotlib/dviread.py | 38 +++- lib/matplotlib/textpath.py | 2 +- 5 files changed, 57 insertions(+), 193 deletions(-) create mode 100644 doc/api/next_api_changes/deprecations/22127-OG.rst diff --git a/doc/api/next_api_changes/deprecations/22127-OG.rst b/doc/api/next_api_changes/deprecations/22127-OG.rst new file mode 100644 index 000000000000..0abcb2882800 --- /dev/null +++ b/doc/api/next_api_changes/deprecations/22127-OG.rst @@ -0,0 +1,22 @@ +Classes, functions, and named tuples in ``dviread`` deprecated +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The classes +- ``DviFont``, +- ``Vf``, +- ``Tfm``, +the function +- ``find_tex_file`` +and the named tuples +- ``Box``, +- ``Page``, +- ``Text`` +from the module ``matplotlib.dviread`` are considered internal and public +access is deprecated. + + +Module ``texmanager`` deprecated +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The module ``matplotlib.texmanager`` is considered internal and public access +is deprecated. diff --git a/lib/matplotlib/_dviread.py b/lib/matplotlib/_dviread.py index af1de84779f5..655515da4e79 100644 --- a/lib/matplotlib/_dviread.py +++ b/lib/matplotlib/_dviread.py @@ -18,7 +18,6 @@ """ from collections import namedtuple -import enum from functools import lru_cache, partial, wraps import logging import os @@ -297,186 +296,6 @@ def __init__(self, filename): self.depth[char] = depths[byte1 & 0xf] -PsFont = namedtuple('PsFont', 'texname psname effects encoding filename') - - -class PsfontsMap: - """ - A psfonts.map formatted file, mapping TeX fonts to PS fonts. - - Parameters - ---------- - filename : str or path-like - - Notes - ----- - For historical reasons, TeX knows many Type-1 fonts by different - names than the outside world. (For one thing, the names have to - fit in eight characters.) Also, TeX's native fonts are not Type-1 - but Metafont, which is nontrivial to convert to PostScript except - as a bitmap. While high-quality conversions to Type-1 format exist - and are shipped with modern TeX distributions, we need to know - which Type-1 fonts are the counterparts of which native fonts. For - these reasons a mapping is needed from internal font names to font - file names. - - A texmf tree typically includes mapping files called e.g. - :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`. - The file :file:`psfonts.map` is used by :program:`dvips`, - :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map` - by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding - the 35 PostScript fonts (i.e., have no filename for them, as in - the Times-Bold example above), while the pdf-related files perhaps - only avoid the "Base 14" pdf fonts. But the user may have - configured these files differently. - - Examples - -------- - >>> map = PsfontsMap(find_tex_file('pdftex.map')) - >>> entry = map[b'ptmbo8r'] - >>> entry.texname - b'ptmbo8r' - >>> entry.psname - b'Times-Bold' - >>> entry.encoding - '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc' - >>> entry.effects - {'slant': 0.16700000000000001} - >>> entry.filename - """ - __slots__ = ('_filename', '_unparsed', '_parsed') - - # Create a filename -> PsfontsMap cache, so that calling - # `PsfontsMap(filename)` with the same filename a second time immediately - # returns the same object. - @lru_cache() - def __new__(cls, filename): - self = object.__new__(cls) - self._filename = os.fsdecode(filename) - # Some TeX distributions have enormous pdftex.map files which would - # take hundreds of milliseconds to parse, but it is easy enough to just - # store the unparsed lines (keyed by the first word, which is the - # texname) and parse them on-demand. - with open(filename, 'rb') as file: - self._unparsed = {} - for line in file: - tfmname = line.split(b' ', 1)[0] - self._unparsed.setdefault(tfmname, []).append(line) - self._parsed = {} - return self - - def __getitem__(self, texname): - assert isinstance(texname, bytes) - if texname in self._unparsed: - for line in self._unparsed.pop(texname): - if self._parse_and_cache_line(line): - break - try: - return self._parsed[texname] - except KeyError: - raise LookupError( - f"An associated PostScript font (required by Matplotlib) " - f"could not be found for TeX font {texname.decode('ascii')!r} " - f"in {self._filename!r}; this problem can often be solved by " - f"installing a suitable PostScript font package in your TeX " - f"package manager") from None - - def _parse_and_cache_line(self, line): - """ - Parse a line in the font mapping file. - - The format is (partially) documented at - http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf - https://tug.org/texinfohtml/dvips.html#psfonts_002emap - Each line can have the following fields: - - - tfmname (first, only required field), - - psname (defaults to tfmname, must come immediately after tfmname if - present), - - fontflags (integer, must come immediately after psname if present, - ignored by us), - - special (SlantFont and ExtendFont, only field that is double-quoted), - - fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always - precedes a font, <[ always precedes an encoding, < can precede either - but then an encoding file must have extension .enc; < and << also - request different font subsetting behaviors but we ignore that; < can - be separated from the filename by whitespace). - - special, fontfile, and encodingfile can appear in any order. - """ - # If the map file specifies multiple encodings for a font, we - # follow pdfTeX in choosing the last one specified. Such - # entries are probably mistakes but they have occurred. - # https://tex.stackexchange.com/q/10826/ - - if not line or line.startswith((b" ", b"%", b"*", b";", b"#")): - return - tfmname = basename = special = encodingfile = fontfile = None - is_subsetted = is_t1 = is_truetype = False - matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line) - for match in matches: - quoted, unquoted = match.groups() - if unquoted: - if unquoted.startswith(b"<<"): # font - fontfile = unquoted[2:] - elif unquoted.startswith(b"<["): # encoding - encodingfile = unquoted[2:] - elif unquoted.startswith(b"<"): # font or encoding - word = ( - # foo - unquoted[1:] - # < by itself => read the next word - or next(filter(None, next(matches).groups()))) - if word.endswith(b".enc"): - encodingfile = word - else: - fontfile = word - is_subsetted = True - elif tfmname is None: - tfmname = unquoted - elif basename is None: - basename = unquoted - elif quoted: - special = quoted - effects = {} - if special: - words = reversed(special.split()) - for word in words: - if word == b"SlantFont": - effects["slant"] = float(next(words)) - elif word == b"ExtendFont": - effects["extend"] = float(next(words)) - - # Verify some properties of the line that would cause it to be ignored - # otherwise. - if fontfile is not None: - if fontfile.endswith((b".ttf", b".ttc")): - is_truetype = True - elif not fontfile.endswith(b".otf"): - is_t1 = True - elif basename is not None: - is_t1 = True - if is_truetype and is_subsetted and encodingfile is None: - return - if not is_t1 and ("slant" in effects or "extend" in effects): - return - if abs(effects.get("slant", 0)) > 1: - return - if abs(effects.get("extend", 0)) > 2: - return - - if basename is None: - basename = tfmname - if encodingfile is not None: - encodingfile = _find_tex_file(encodingfile) - if fontfile is not None: - fontfile = _find_tex_file(fontfile) - self._parsed[tfmname] = PsFont( - texname=tfmname, psname=basename, effects=effects, - encoding=encodingfile, filename=fontfile) - return True - - def _parse_enc(path): r""" Parse a \*.enc file referenced from a psfonts.map style file. diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index dd2f83c28fad..b1a10e055dba 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -26,7 +26,7 @@ from PIL import Image import matplotlib as mpl -from matplotlib import _api, _text_helpers, cbook +from matplotlib import _api, _dviread, _text_helpers, cbook, dviread from matplotlib._pylab_helpers import Gcf from matplotlib.backend_bases import ( _Backend, FigureCanvasBase, FigureManagerBase, GraphicsContextBase, @@ -36,8 +36,6 @@ from matplotlib.font_manager import findfont, get_font from matplotlib.afm import AFM import matplotlib.type1font as type1font -import matplotlib._dviread as _dviread -import matplotlib.dviread as dviread from matplotlib.ft2font import (FIXED_WIDTH, ITALIC, LOAD_NO_SCALE, LOAD_NO_HINTING, KERNING_UNFITTED, FT2Font) from matplotlib.mathtext import MathTextParser @@ -892,8 +890,7 @@ def dviFontName(self, dvifont): if dvi_info is not None: return dvi_info.pdfname - tex_font_map = _dviread.PsfontsMap( - _dviread._find_tex_file('pdftex.map')) + tex_font_map = dviread.PsfontsMap() psfont = tex_font_map[dvifont.texname] if psfont.filename is None: raise ValueError( diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index 9b831acba361..0cc584bf6198 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -26,10 +26,28 @@ import numpy as np -from matplotlib import _api, cbook, _dviread, _dviread_vf +from matplotlib import _api, _dviread _log = logging.getLogger(__name__) + +@_api.caching_module_getattr +class __getattr__: + locals().update({ + name: _api.deprecated("3.6")( + property(lambda self, _mod=_dviread, + _name=name: getattr(_mod, _name))) + for name in ["Book", "Page", "Text", "DviFont", "Tfm", + "find_tex_file"]}) + + +@_api.deprecated("3.6") +class Vf: + def __init__(self, filename): + from matplotlib import _vf + return _vf.Vf(filename) + + # Many dvi related files are looked for by external processes, require # additional parsing, and are used many times per rendering, which is why they # are cached using lru_cache(). @@ -357,7 +375,8 @@ def _fnt_def_real(self, k, c, s, d, a, l): if c != 0 and tfm.checksum != 0 and c != tfm.checksum: raise ValueError('tfm checksum mismatch: %s' % n) try: - vf = _dviread_vf._vffile(fontname) + from matplotlib._vf import _vffile + vf = _vffile(fontname) except FileNotFoundError: vf = None self.fonts[k] = _dviread.DviFont(scale=s, tfm=tfm, texname=n, vf=vf) @@ -406,6 +425,10 @@ class PsfontsMap: ---------- filename : str or path-like + find_tex_file : bool (default False) + If ``True``, *filename* is looked up in the tex build directory. + If ``False`` (default), *filename* must be a fully qualified path. + Notes ----- For historical reasons, TeX knows many Type-1 fonts by different @@ -430,7 +453,7 @@ class PsfontsMap: Examples -------- - >>> map = PsfontsMap(find_tex_file('pdftex.map')) + >>> map = PsfontsMap('pdftex.map', find_tex_file=True) >>> entry = map[b'ptmbo8r'] >>> entry.texname b'ptmbo8r' @@ -448,9 +471,12 @@ class PsfontsMap: # `PsfontsMap(filename)` with the same filename a second time immediately # returns the same object. @lru_cache() - def __new__(cls, filename): + def __new__(cls, filename, *, find_tex_file=False): self = object.__new__(cls) - self._filename = os.fsdecode(filename) + if find_tex_file: + self._filename = os.fsdecode(_dviread.find_tex_file(filename)) + else: + self._filename = os.fsdecode(filename) # Some TeX distributions have enormous pdftex.map files which would # take hundreds of milliseconds to parse, but it is easy enough to just # store the unparsed lines (keyed by the first word, which is the @@ -584,7 +610,7 @@ def _parse_and_cache_line(self, line): parser.add_argument("dpi", nargs="?", type=float, default=None) args = parser.parse_args() with Dvi(args.filename, args.dpi) as dvi: - fontmap = PsfontsMap(_dviread._find_tex_file('pdftex.map')) + fontmap = PsfontsMap("pdftex.map", find_tex_file=True) for page in dvi: print(f"=== new page === " f"(w: {page.width}, h: {page.height}, d: {page.descent})") diff --git a/lib/matplotlib/textpath.py b/lib/matplotlib/textpath.py index 2e1493015047..88fc64ed910a 100644 --- a/lib/matplotlib/textpath.py +++ b/lib/matplotlib/textpath.py @@ -279,7 +279,7 @@ def get_glyphs_tex(self, prop, s, glyph_map=None, @staticmethod @functools.lru_cache(50) def _get_ps_font_and_encoding(texname): - tex_font_map = dviread.get_tex_font_map('pdftex.map') + tex_font_map = dviread.PsfontsMap('pdftex.map', find_tex_file=True) psfont = tex_font_map[texname] if psfont.filename is None: raise ValueError(