From 645c8da7506ef235ba8ca4f17ca13463a4ca6284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Fri, 12 Jan 2018 16:02:36 +0200 Subject: [PATCH 1/7] Cache kpsewhich results persistently And allow batching them. This commit does not yet use the batching but makes it possible. --- doc/api/next_api_changes/2018-02-16-JKS.rst | 8 + doc/users/next_whats_new/texsupport_cache.rst | 22 ++ lib/matplotlib/dviread.py | 251 ++++++++++++++++-- lib/matplotlib/tests/test_dviread.py | 84 ++++++ 4 files changed, 347 insertions(+), 18 deletions(-) create mode 100644 doc/api/next_api_changes/2018-02-16-JKS.rst create mode 100644 doc/users/next_whats_new/texsupport_cache.rst diff --git a/doc/api/next_api_changes/2018-02-16-JKS.rst b/doc/api/next_api_changes/2018-02-16-JKS.rst new file mode 100644 index 000000000000..f38ad6d50932 --- /dev/null +++ b/doc/api/next_api_changes/2018-02-16-JKS.rst @@ -0,0 +1,8 @@ +dviread changes +--------------- + +The ``format`` keyword argument to ``dviread.find_tex_file`` has been +deprecated. The function without the ``format`` argument, as well as +the new ``dviread.find_tex_files`` function, cache their results in +``texsupport.N.db`` in the cache directory to speed up dvi file +processing. diff --git a/doc/users/next_whats_new/texsupport_cache.rst b/doc/users/next_whats_new/texsupport_cache.rst new file mode 100644 index 000000000000..b823e962a1d9 --- /dev/null +++ b/doc/users/next_whats_new/texsupport_cache.rst @@ -0,0 +1,22 @@ +TeX support cache +----------------- + +The `usetex` feature sends snippets of TeX code to LaTeX and related +external tools for processing. This causes a nontrivial number of +helper processes to be spawned, which can be slow on some platforms. +A new cache database helps reduce the need to spawn these helper +processes, which should improve `usetex` processing speed. 
+ +The new cache files +~~~~~~~~~~~~~~~~~~~ + +The cache database is stored in a file named `texsupport.N.db` in the +standard cache directory (traditionally `$HOME/.matplotlib` but +possibly `$HOME/.cache/matplotlib`), where `N` stands for a version +number. The version number is incremented when new kinds of items are +added to the caching code, in order to avoid version clashes when +using multiple different versions of Matplotlib. The auxiliary files +`texsupport.N.db-wal` and `texsupport.N.db-shm` help coordinate usage +of the cache between concurrently running instances. All of these +cache files may be deleted when Matplotlib is not running, and +subsequent calls to the `usetex` code will recompute the TeX results. diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index e0048d8b8c3f..52a66d276e86 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -24,12 +24,13 @@ import os import re import struct +import sqlite3 import sys import textwrap import numpy as np -from matplotlib import cbook, rcParams +from matplotlib import cbook, get_cachedir, rcParams from matplotlib.compat import subprocess _log = logging.getLogger(__name__) @@ -980,45 +981,259 @@ def _parse(self, file): return re.findall(br'/([^][{}<>\s]+)', data) -def find_tex_file(filename, format=None): +class TeXSupportCacheError(Exception): + pass + + +class TeXSupportCache: + """A persistent cache of data related to support files related to dvi + files produced by TeX. Currently holds results from :program:`kpsewhich`, + in future versions could hold pre-parsed font data etc. 
+ + Usage:: + + # create or get the singleton instance + cache = TeXSupportCache.get_cache() + with cache.connection as transaction: + cache.update_pathnames( + {"pdftex.map": "/usr/local/pdftex.map", + "cmsy10.pfb": "/usr/local/fonts/cmsy10.pfb"}, + transaction) + pathnames = cache.get_pathnames(["pdftex.map", "cmr10.pfb"]) + # now pathnames = {"pdftex.map": "/usr/local/pdftex.map"} + + # optional after inserting new data, may improve query performance: + cache.optimize() + + Parameters + ---------- + + filename : str, optional + File in which to store the cache. Defaults to `texsupport.N.db` in + the standard cache directory where N is the current schema version. + + Attributes + ---------- + + connection + This database connection object has a context manager to set up + a transaction. Transactions are passed into methods that write to + the database. """ - Find a file in the texmf tree. + + __slots__ = ('connection') + schema_version = 1 # should match PRAGMA user_version in _create + instance = None + + @classmethod + def get_cache(cls): + "Return the singleton instance of the cache, at the default location" + if cls.instance is None: + cls.instance = cls() + return cls.instance + + def __init__(self, filename=None): + if filename is None: + filename = os.path.join(get_cachedir(), 'texsupport.%d.db' + % self.schema_version) + + self.connection = sqlite3.connect( + filename, isolation_level="DEFERRED") + with self.connection as conn: + conn.execute("PRAGMA journal_mode=WAL;") + version, = conn.execute("PRAGMA user_version;").fetchone() + + if version == 0: + self._create() + elif version != self.schema_version: + raise TeXSupportCacheError( + "support database %s has version %d, expected %d" + % (filename, version, self.schema_version)) + + def _create(self): + """Create the database.""" + with self.connection as conn: + conn.executescript( + """ + PRAGMA page_size=4096; + CREATE TABLE file_path( + filename TEXT PRIMARY KEY NOT NULL, + pathname TEXT + ) WITHOUT 
ROWID; + PRAGMA user_version=1; + """) + + def optimize(self): + """Optional optimization phase after updating data. + Executes sqlite's `PRAGMA optimize` statement, which can call + `ANALYZE` or other functions that can improve future query performance + by spending some time up-front.""" + with self.connection as conn: + conn.execute("PRAGMA optimize;") + + def get_pathnames(self, filenames): + """Query the cache for pathnames related to `filenames`. + + Parameters + ---------- + filenames : iterable of str + + Returns + ------- + mapping from str to (str or None) + For those filenames that exist in the cache, the mapping + includes either the related pathname or None to indicate that + the named file does not exist. + """ + rows = self.connection.execute( + "SELECT filename, pathname FROM file_path WHERE filename IN " + "(%s)" + % ','.join('?' for _ in filenames), + filenames).fetchall() + return {filename: pathname for (filename, pathname) in rows} + + def update_pathnames(self, mapping, transaction): + """Update the cache with the given filename-to-pathname mapping + + Parameters + ---------- + mapping : mapping from str to (str or None) + Mapping from filenames to the corresponding full pathnames + or None to indicate that the named file does not exist. + transaction : obtained via the context manager of self.connection + """ + transaction.executemany( + "INSERT OR REPLACE INTO file_path (filename, pathname) " + "VALUES (?, ?)", + mapping.items()) + + +def find_tex_files(filenames, cache=None): + """Find multiple files in the texmf tree. This can be more efficient + than `find_tex_file` because it makes only one call to `kpsewhich`. Calls :program:`kpsewhich` which is an interface to the kpathsea library [1]_. Most existing TeX distributions on Unix-like systems use kpathsea. It is also available as part of MikTeX, a popular distribution on Windows. + The results are cached into the TeX support database. 
In case of + mistaken results, deleting the database resets the cache. + Parameters ---------- filename : string or bytestring - format : string or bytestring - Used as the value of the `--format` option to :program:`kpsewhich`. - Could be e.g. 'tfm' or 'vf' to limit the search to that type of files. + cache : TeXSupportCache, optional + Cache instance to use, defaults to the singleton instance of the class. References ---------- .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_ The library that :program:`kpsewhich` is part of. + """ # we expect these to always be ascii encoded, but use utf-8 # out of caution - if isinstance(filename, bytes): - filename = filename.decode('utf-8', errors='replace') - if isinstance(format, bytes): - format = format.decode('utf-8', errors='replace') + filenames = [f.decode('utf-8', errors='replace') + if isinstance(f, bytes) else f + for f in filenames] + if cache is None: + cache = TeXSupportCache.get_cache() + result = cache.get_pathnames(filenames) + + filenames = [f for f in filenames if f not in result] + if not filenames: + return result - cmd = ['kpsewhich'] - if format is not None: - cmd += ['--format=' + format] - cmd += [filename] - _log.debug('find_tex_file(%s): %s', filename, cmd) + cmd = ['kpsewhich'] + list(filenames) + _log.debug('find_tex_files: %s', cmd) pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE) - result = pipe.communicate()[0].rstrip() - _log.debug('find_tex_file result: %s', result) - return result.decode('ascii') + output = pipe.communicate()[0].decode('ascii').splitlines() + _log.debug('find_tex_files result: %s', output) + mapping = _match(filenames, output) + with cache.connection as transaction: + cache.update_pathnames(mapping, transaction) + result.update(mapping) + + return result + + +def _match(filenames, pathnames): + """ + Match filenames to pathnames in lists that are in matching order, + except that some filenames may lack pathnames. 
+ """ + result = {f: None for f in filenames} + filenames, pathnames = iter(filenames), iter(pathnames) + try: + filename, pathname = next(filenames), next(pathnames) + while True: + if pathname.endswith(os.path.sep + filename): + result[filename] = pathname + pathname = next(pathnames) + filename = next(filenames) + except StopIteration: + return result + + +def find_tex_file(filename, format=None, cache=None): + """ + Find a file in the texmf tree. + + Calls :program:`kpsewhich` which is an interface to the kpathsea + library [1]_. Most existing TeX distributions on Unix-like systems use + kpathsea. It is also available as part of MikTeX, a popular + distribution on Windows. + + The results are cached into a database whose location defaults to + :file:`~/.matplotlib/texsupport.db`. In case of mistaken results, + deleting this file resets the cache. + + Parameters + ---------- + filename : string or bytestring + format : string or bytestring, DEPRECATED + Used as the value of the `--format` option to :program:`kpsewhich`. + Could be e.g. 'tfm' or 'vf' to limit the search to that type of files. + Deprecated to allow batching multiple filenames into one kpsewhich + call, since any format option would apply to all filenames at once. + cache : TeXSupportCache, optional + Cache instance to use, defaults to the singleton instance of the class. + + References + ---------- + + .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_ + The library that :program:`kpsewhich` is part of. + """ + + if format is not None: + cbook.warn_deprecated( + "3.0", + "The format option to find_tex_file is deprecated " + "to allow batching multiple filenames into one call. 
" + "Omitting the option should not change the result, as " + "kpsewhich uses the filename extension to choose the path.") + # we expect these to always be ascii encoded, but use utf-8 + # out of caution + if isinstance(filename, bytes): + filename = filename.decode('utf-8', errors='replace') + if isinstance(format, bytes): + format = format.decode('utf-8', errors='replace') + + cmd = ['kpsewhich'] + if format is not None: + cmd += ['--format=' + format] + cmd += [filename] + _log.debug('find_tex_file(%s): %s', filename, cmd) + pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE) + result = pipe.communicate()[0].rstrip() + _log.debug('find_tex_file result: %s', result) + return result.decode('ascii') + + return list(find_tex_files([filename], cache).values())[0] # With multiple text objects per figure (e.g., tick labels) we may end diff --git a/lib/matplotlib/tests/test_dviread.py b/lib/matplotlib/tests/test_dviread.py index 6b005fd34170..4a5b924a2312 100644 --- a/lib/matplotlib/tests/test_dviread.py +++ b/lib/matplotlib/tests/test_dviread.py @@ -1,9 +1,16 @@ from matplotlib.testing.decorators import skip_if_command_unavailable +try: + from unittest import mock +except ImportError: + import mock + import matplotlib.dviread as dr import os.path import json import pytest +import sqlite3 +import warnings def test_PsfontsMap(monkeypatch): @@ -68,3 +75,80 @@ def test_dviread(): 'boxes': [[b.x, b.y, b.height, b.width] for b in page.boxes]} for page in dvi] assert data == correct + + +def test_TeXSupportCache(tmpdir): + dbfile = str(tmpdir / "test.db") + cache = dr.TeXSupportCache(filename=dbfile) + assert cache.get_pathnames(['foo', 'bar']) == {} + with cache.connection as transaction: + cache.update_pathnames({'foo': '/tmp/foo', + 'xyzzy': '/xyzzy.dat', + 'fontfile': None}, transaction) + assert cache.get_pathnames(['foo', 'bar']) == {'foo': '/tmp/foo'} + assert cache.get_pathnames(['xyzzy', 'fontfile']) == \ + {'xyzzy': '/xyzzy.dat', 'fontfile': None} + + +def 
test_TeXSupportCache_versioning(tmpdir): + dbfile = str(tmpdir / "test.db") + cache1 = dr.TeXSupportCache(dbfile) + with cache1.connection as transaction: + cache1.update_pathnames({'foo': '/tmp/foo'}, transaction) + + with sqlite3.connect(dbfile, isolation_level="DEFERRED") as conn: + conn.executescript('PRAGMA user_version=1000000000;') + + with pytest.raises(dr.TeXSupportCacheError): + cache2 = dr.TeXSupportCache(dbfile) + + +def test_find_tex_files(tmpdir): + with mock.patch('matplotlib.dviread.subprocess.Popen') as mock_popen: + mock_proc = mock.Mock() + stdout = '{s}tmp{s}foo.pfb\n{s}tmp{s}bar.map\n'.\ + format(s=os.path.sep).encode('ascii') + mock_proc.configure_mock(**{'communicate.return_value': (stdout, b'')}) + mock_popen.return_value = mock_proc + + # first call uses the results from kpsewhich + cache = dr.TeXSupportCache(filename=str(tmpdir / "test.db")) + assert dr.find_tex_files( + ['foo.pfb', 'cmsy10.pfb', 'bar.tmp', 'bar.map'], cache) \ + == {'foo.pfb': '{s}tmp{s}foo.pfb'.format(s=os.path.sep), + 'bar.map': '{s}tmp{s}bar.map'.format(s=os.path.sep), + 'cmsy10.pfb': None, 'bar.tmp': None} + assert mock_popen.called + + # second call (subset of the first one) uses only the cache + mock_popen.reset_mock() + assert dr.find_tex_files(['foo.pfb', 'cmsy10.pfb'], cache) \ + == {'foo.pfb': '{s}tmp{s}foo.pfb'.format(s=os.path.sep), + 'cmsy10.pfb': None} + assert not mock_popen.called + + # third call (includes more than the first one) uses kpsewhich again + mock_popen.reset_mock() + stdout = '{s}usr{s}local{s}cmr10.tfm\n'.\ + format(s=os.path.sep).encode('ascii') + mock_proc.configure_mock(**{'communicate.return_value': (stdout, b'')}) + mock_popen.return_value = mock_proc + assert dr.find_tex_files(['foo.pfb', 'cmr10.tfm'], cache) == \ + {'foo.pfb': '{s}tmp{s}foo.pfb'.format(s=os.path.sep), + 'cmr10.tfm': '{s}usr{s}local{s}cmr10.tfm'.format(s=os.path.sep)} + assert mock_popen.called + + +def test_find_tex_file_format(): + with 
mock.patch('matplotlib.dviread.subprocess.Popen') as mock_popen: + mock_proc = mock.Mock() + stdout = b'/foo/bar/baz\n' + mock_proc.configure_mock(**{'communicate.return_value': (stdout, b'')}) + mock_popen.return_value = mock_proc + + warnings.filterwarnings( + 'ignore', + 'The format option to find_tex_file is deprecated.*', + UserWarning) + assert dr.find_tex_file('foobar', format='tfm') == '/foo/bar/baz' + assert mock_popen.called From 2124ac8bcfe9a9d0d096cc21da40415301b6f653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Fri, 16 Feb 2018 18:00:39 +0200 Subject: [PATCH 2/7] Include next_whats_new/* again --- doc/users/whats_new.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/users/whats_new.rst b/doc/users/whats_new.rst index 648daabc7c5b..3c5fdc23af73 100644 --- a/doc/users/whats_new.rst +++ b/doc/users/whats_new.rst @@ -14,12 +14,12 @@ revision, see the :ref:`github-stats`. .. For a release, add a new section after this, then comment out the include and toctree below by indenting them. Uncomment them after the release. - .. include:: next_whats_new/README.rst - .. toctree:: - :glob: - :maxdepth: 1 +.. include:: next_whats_new/README.rst +.. 
toctree:: + :glob: + :maxdepth: 1 - next_whats_new/* + next_whats_new/* New in Matplotlib 2.2 From 3ce3061a018a622ff9a7b2e113a53c8d00a0a548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Sun, 18 Feb 2018 14:47:20 +0200 Subject: [PATCH 3/7] Enable some sqlite and pysqlite options - synchronous=normal (fewer disk writes, still safe in WAL mode) - foreign key enforcement - log sql statements at debug level - use sqlite3.Row (enables accessing columns by name) --- lib/matplotlib/dviread.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index 52a66d276e86..77a930de990f 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -1039,8 +1039,17 @@ def __init__(self, filename=None): self.connection = sqlite3.connect( filename, isolation_level="DEFERRED") + if _log.isEnabledFor(logging.DEBUG): + def debug_sql(sql): + _log.debug(' '.join(sql.splitlines()).strip()) + self.connection.set_trace_callback(debug_sql) + self.connection.row_factory = sqlite3.Row with self.connection as conn: - conn.execute("PRAGMA journal_mode=WAL;") + conn.executescript(""" + PRAGMA journal_mode=WAL; + PRAGMA synchronous=NORMAL; + PRAGMA foreign_keys=ON; + """) version, = conn.execute("PRAGMA user_version;").fetchone() if version == 0: From ba47418796e5b2a8a84bbe8799f1ca01b92c1c73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Fri, 12 Jan 2018 20:29:10 +0200 Subject: [PATCH 4/7] Call kpsewhich with more arguments at one time This should improve performance if there is a significant startup cost to running kpsewhich, as reported by some users in #4880. 
--- lib/matplotlib/dviread.py | 121 ++++++++++++++++-- .../tests/baseline_images/dviread/broken1.dvi | Bin 0 -> 851 bytes .../tests/baseline_images/dviread/broken2.dvi | Bin 0 -> 856 bytes .../tests/baseline_images/dviread/broken3.dvi | Bin 0 -> 856 bytes .../tests/baseline_images/dviread/broken4.dvi | Bin 0 -> 856 bytes .../tests/baseline_images/dviread/broken5.dvi | Bin 0 -> 856 bytes .../tests/baseline_images/dviread/virtual.vf | Bin 0 -> 56 bytes .../tests/baseline_images/dviread/virtual.vpl | 23 ++++ lib/matplotlib/tests/test_dviread.py | 22 ++++ 9 files changed, 156 insertions(+), 10 deletions(-) create mode 100644 lib/matplotlib/tests/baseline_images/dviread/broken1.dvi create mode 100644 lib/matplotlib/tests/baseline_images/dviread/broken2.dvi create mode 100644 lib/matplotlib/tests/baseline_images/dviread/broken3.dvi create mode 100644 lib/matplotlib/tests/baseline_images/dviread/broken4.dvi create mode 100644 lib/matplotlib/tests/baseline_images/dviread/broken5.dvi create mode 100644 lib/matplotlib/tests/baseline_images/dviread/virtual.vf create mode 100644 lib/matplotlib/tests/baseline_images/dviread/virtual.vpl diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index 77a930de990f..7d464209d06c 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -171,8 +171,7 @@ def wrapper(self, byte): class Dvi(object): """ A reader for a dvi ("device-independent") file, as produced by TeX. - The current implementation can only iterate through pages in order, - and does not even attempt to verify the postamble. + The current implementation can only iterate through pages in order. This class can be used as a context manager to close the underlying file upon exit. Pages can be read via iteration. 
Here is an overly @@ -180,13 +179,26 @@ class Dvi(object): >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi: >>> for page in dvi: - >>> print(''.join(unichr(t.glyph) for t in page.text)) + >>> print(''.join(chr(t.glyph) for t in page.text)) + + Parameters + ---------- + + filename : str + dvi file to read + dpi : number or None + Dots per inch, can be floating-point; this affects the + coordinates returned. Use None to get TeX's internal units + which are likely only useful for debugging. + cache : TeXSupportCache instance, optional + Support file cache instance, defaults to the TeXSupportCache + singleton. """ # dispatch table _dtable = [None] * 256 _dispatch = partial(_dispatch, _dtable) - def __init__(self, filename, dpi): + def __init__(self, filename, dpi, cache=None): """ Read the data from the file named *filename* and convert TeX's internal units to units of *dpi* per inch. @@ -194,11 +206,20 @@ def __init__(self, filename, dpi): Use None to return TeX's internal units. """ _log.debug('Dvi: %s', filename) + if cache is None: + cache = TeXSupportCache.get_cache() + self.cache = cache self.file = open(filename, 'rb') self.dpi = dpi self.fonts = {} self.state = _dvistate.pre self.baseline = self._get_baseline(filename) + self.fontnames = sorted(set(self._read_fonts())) + # populate kpsewhich cache with font pathnames + find_tex_files([x + suffix for x in self.fontnames + for suffix in ('.tfm', '.vf', '.pfb')], + cache) + cache.optimize() def _get_baseline(self, filename): if rcParams['text.latex.preview']: @@ -206,8 +227,8 @@ def _get_baseline(self, filename): baseline_filename = base + ".baseline" if os.path.exists(baseline_filename): with open(baseline_filename, 'rb') as fd: - l = fd.read().split() - height, depth, width = l + line = fd.read().split() + height, depth, width = line return float(depth) return None @@ -294,6 +315,61 @@ def _output(self): return Page(text=text, boxes=boxes, width=(maxx-minx)*d, height=(maxy_pure-miny)*d, descent=descent) 
+ def _read_fonts(self): + """Read the postamble of the file and return a list of fonts used.""" + + file = self.file + offset = -1 + while offset > -100: + file.seek(offset, 2) + byte = file.read(1)[0] + if byte != 223: + break + offset -= 1 + if offset >= -4: + raise ValueError( + "malformed dvi file %s: too few 223 bytes" % file.name) + if byte != 2: + raise ValueError( + ("malformed dvi file %s: post-postamble " + "identification byte not 2") % file.name) + file.seek(offset - 4, 2) + offset = struct.unpack('!I', file.read(4))[0] + file.seek(offset, 0) + try: + byte = file.read(1)[0] + except IndexError: + raise ValueError( + "malformed dvi file %s: postamble offset %d out of range" + % (file.name, offset)) + if byte != 248: + raise ValueError( + "malformed dvi file %s: postamble not found at offset %d" + % (file.name, offset)) + + fonts = [] + file.seek(28, 1) + while True: + byte = file.read(1)[0] + if 243 <= byte <= 246: + _, _, _, _, a, length = ( + _arg_olen1(self, byte-243), + _arg(4, False, self, None), + _arg(4, False, self, None), + _arg(4, False, self, None), + _arg(1, False, self, None), + _arg(1, False, self, None)) + fontname = file.read(a + length)[-length:].decode('ascii') + fonts.append(fontname) + elif byte == 249: + break + else: + raise ValueError( + "malformed dvi file %s: opcode %d in postamble" + % (file.name, byte)) + file.seek(0, 0) + return fonts + def _read(self): """ Read one page from the file. Return True if successful, @@ -593,6 +669,10 @@ class Vf(Dvi): ---------- filename : string or bytestring + vf file to read + cache : TeXSupportCache instance, optional + Support file cache instance, defaults to the TeXSupportCache + singleton. Notes ----- @@ -603,8 +683,8 @@ class Vf(Dvi): but replaces the `_read` loop and dispatch mechanism. 
""" - def __init__(self, filename): - Dvi.__init__(self, filename, 0) + def __init__(self, filename, cache=None): + Dvi.__init__(self, filename, dpi=0, cache=cache) try: self._first_font = None self._chars = {} @@ -615,6 +695,27 @@ def __init__(self, filename): def __getitem__(self, code): return self._chars[code] + def _read_fonts(self): + """Read through the font-definition section of the vf file + and return the list of font names.""" + fonts = [] + self.file.seek(0, 0) + while True: + byte = self.file.read(1)[0] + if byte <= 242 or byte >= 248: + break + elif 243 <= byte <= 246: + _ = self._arg(byte - 242) + _, _, _, a, length = [self._arg(x) for x in (4, 4, 4, 1, 1)] + fontname = self.file.read(a + length)[-length:].decode('ascii') + fonts.append(fontname) + elif byte == 247: + _, k = self._arg(1), self._arg(1) + _ = self.file.read(k) + _, _ = self._arg(4), self._arg(4) + self.file.seek(0, 0) + return fonts + def _read(self): """ Read one page from the file. Return True if successful, @@ -652,8 +753,8 @@ def _read(self): self._init_packet(packet_len) elif 243 <= byte <= 246: k = self._arg(byte - 242, byte == 246) - c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)] - self._fnt_def_real(k, c, s, d, a, l) + c, s, d, a, length = [self._arg(x) for x in (4, 4, 4, 1, 1)] + self._fnt_def_real(k, c, s, d, a, length) if self._first_font is None: self._first_font = k elif byte == 247: # preamble diff --git a/lib/matplotlib/tests/baseline_images/dviread/broken1.dvi b/lib/matplotlib/tests/baseline_images/dviread/broken1.dvi new file mode 100644 index 0000000000000000000000000000000000000000..6e960f435de970a10bc9dd9644db1c94cf0cb0f1 GIT binary patch literal 851 zcmey)#MnIPfQ&T*5HP=xRtQOrP{=PWDJU&bFfuSS)iX5EGcd6-G%&U32C85LDI)~_ z1Hl5ON(P4B1%DSaFf3rQ2Q!}l$%(!U44>J(KPFT%Z~=`0Vb z8P~pv%#SM~CYPiZmrPE{pWrT=T$-DjH(|%)lD;K8Ae-$N7}D~KKvsa%Wagz$&P^;S z$jL9s$xKo&o}5yaS(KWX(zmb|th5E>RmMmLhQ7Y}ia##&_RfEI&jCpO4dCgW#Bkxd 
z{bvq;-diyYtRNpie36@Jx>>RhEIFT1S*;UjUn&Dj=&AY4Ul;XGC=dP2+QvLP3a$g> z6VnYweLxEY6%X_RO+9_me*UNA_RkEzHkJy(!-p+7H?_jhV09EwA>YYAKy_y(DQz_9 z>+1#Sxq8xJ{+DM7s|A4O00rj%v{3}xYpadsW>yQZzAu`f^^rJ!(~FM0CMdye3O)a1;>9L2uA9#B$Y0%Br-7M`TCDKRgl@as+ literal 0 HcmV?d00001 diff --git a/lib/matplotlib/tests/baseline_images/dviread/broken2.dvi b/lib/matplotlib/tests/baseline_images/dviread/broken2.dvi new file mode 100644 index 0000000000000000000000000000000000000000..bd2b7479534633dc526652e7cb7f5bbe3a4bfaf3 GIT binary patch literal 856 zcmey)#MnIPfQ&T*5HP=xRtQOrP{=PWDJU&bFfuSS)iX5EGcd6-G%&U32C85LDI)~_ z1Hl5ON(P4B1%DSaFf3rQ2Q!}l$%(!U44>J(KPFT%Z~=`0Vb z8P~pv%#SM~CYPiZmrPE{pWrT=T$-DjH(|%)lD;K8Ae-$N7}D~KKvsa%Wagz$&P^;S z$jL9s$xKo&o}5yaS(KWX(zmb|th5E>RmMmLhQ7Y}ia##&_RfEI&jCpO4dCgW#Bkxd z{bvq;-diyYtRNpie36@Jx>>RhEIFT1S*;UjUn&Dj=&AY4Ul;XGC=dP2+QvLP3a$g> z6VnYweLxEY6%X_RO+9_me*UNA_RkEzHkJy(!-p+7H?_jhV09EwA>YYAKy_y(DQz_9 z>+1#Sxq8xJ{+DM7s|A4O00rj%v{3}xYpadsW>yQZzAu`f^^rJ!(~FM0CMdye3O)a1;>9L2uA9#B$Y0%Br-7M`TCDKRgJ(KPFT%Z~=`0Vb z8P~pv%#SM~CYPiZmrPE{pWrT=T$-DjH(|%)lD;K8Ae-$N7}D~KKvsa%Wagz$&P^;S z$jL9s$xKo&o}5yaS(KWX(zmb|th5E>RmMmLhQ7Y}ia##&_RfEI&jCpO4dCgW#Bkxd z{bvq;-diyYtRNpie36@Jx>>RhEIFT1S*;UjUn&Dj=&AY4Ul;XGC=dP2+QvLP3a$g> z6VnYweLxEY6%X_RO+9_me*UNA_RkEzHkJy(!-p+7H?_jhV09EwA>YYAKy_y(DQz_9 z>+1#Sxq8xJ{+DM7s|A4O00rj%v{3}xYpadsW>yQZzAu`f^^rJ!(~FM0CMdye3O)a1;>9L2uA9#B$Y0%Br-7M`TCDKRgJ(KPFT%Z~=`0Vb z8P~pv%#SM~CYPiZmrPE{pWrT=T$-DjH(|%)lD;K8Ae-$N7}D~KKvsa%Wagz$&P^;S z$jL9s$xKo&o}5yaS(KWX(zmb|th5E>RmMmLhQ7Y}ia##&_RfEI&jCpO4dCgW#Bkxd z{bvq;-diyYtRNpie36@Jx>>RhEIFT1S*;UjUn&Dj=&AY4Ul;XGC=dP2+QvLP3a$g> z6VnYweLxEY6%X_RO+9_me*UNA_RkEzHkJy(!-p+7H?_jhV09EwA>YYAKy_y(DQz_9 z>+1#Sxq8xJ{+DM7s|A4O00rj%v{3}xYpadsW>yQZzAu`f^^rJ!(~FM0CMdye3O)a1;>9L2uA9#B$Y0%Br-7M`TCDKRgJ(KPFT%Z~=`0Vb z8P~pv%#SM~CYPiZmrPE{pWrT=T$-DjH(|%)lD;K8Ae-$N7}D~KKvsa%Wagz$&P^;S z$jL9s$xKo&o}5yaS(KWX(zmb|th5E>RmMmLhQ7Y}ia##&_RfEI&jCpO4dCgW#Bkxd z{bvq;-diyYtRNpie36@Jx>>RhEIFT1S*;UjUn&Dj=&AY4Ul;XGC=dP2+QvLP3a$g> 
z6VnYweLxEY6%X_RO+9_me*UNA_RkEzHkJy(!-p+7H?_jhV09EwA>YYAKy_y(DQz_9 z>+1#Sxq8xJ{+DM7s|A4O00rj%v{3}xYpadsW>yQZzAu`f^^rJ!(~FM0CMdye3O)a1;>9L2uA9#B$Y0%Br-7M`TCDKRg Date: Fri, 19 Jan 2018 11:08:20 +0200 Subject: [PATCH 5/7] Small improvements to the Vf class Expose the scale attribute, allow overriding the widths, add some convenience methods. --- lib/matplotlib/dviread.py | 52 +++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index 7d464209d06c..fb69369fc248 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -422,9 +422,9 @@ def _put_char_real(self, char): self.text.append(Text(self.h, self.v, font, char, font._width_of(char))) else: - scale = font._scale + scale = font.scale for x, y, f, g, w in font._vf[char].text: - newf = DviFont(scale=_mul2012(scale, f._scale), + newf = DviFont(scale=_mul2012(scale, f.scale), tfm=f._tfm, texname=f.texname, vf=f._vf) self.text.append(Text(self.h + _mul2012(x, scale), self.v + _mul2012(y, scale), @@ -580,16 +580,19 @@ class DviFont(object): ---------- scale : float - Factor by which the font is scaled from its natural size. - tfm : Tfm - TeX font metrics for this font + Factor by which the font is scaled from its natural size, + represented as an integer in 20.12 fixed-point format. + tfm : Tfm, may be None if widths given + TeX Font Metrics file for this font texname : bytes Name of the font as used internally by TeX and friends, as an ASCII bytestring. This is usually very different from any external font names, and :class:`dviread.PsfontsMap` can be used to find the external name of the font. - vf : Vf + vf : Vf or None A TeX "virtual font" file, or None if this font is not virtual. + widths : list of integers, optional + Widths for this font. Overrides the widths read from the tfm file. 
Attributes ---------- @@ -598,26 +601,37 @@ class DviFont(object): size : float Size of the font in Adobe points, converted from the slightly smaller TeX points. + scale : int + Factor by which the font is scaled from its natural size, + represented as an integer in 20.12 fixed-point format. widths : list Widths of glyphs in glyph-space units, typically 1/1000ths of the point size. """ - __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm') + __slots__ = ('texname', 'size', 'widths', 'scale', '_vf', '_tfm') - def __init__(self, scale, tfm, texname, vf): + def __init__(self, scale, tfm, texname, vf, widths=None): if not isinstance(texname, bytes): raise ValueError("texname must be a bytestring, got %s" % type(texname)) - self._scale, self._tfm, self.texname, self._vf = \ - scale, tfm, texname, vf + self.scale, self._tfm, self.texname, self._vf, self.widths = \ + scale, tfm, texname, vf, widths self.size = scale * (72.0 / (72.27 * 2**16)) - try: - nchars = max(tfm.width) + 1 - except ValueError: - nchars = 0 - self.widths = [(1000*tfm.width.get(char, 0)) >> 20 - for char in range(nchars)] + + if self.widths is None: + try: + nchars = max(tfm.width) + 1 + except ValueError: + nchars = 0 + self.widths = [(1000*tfm.width.get(char, 0)) >> 20 + for char in range(nchars)] + + def __repr__(self): + return '<DviFont %s *%f>' % (self.texname, self.scale / 2**20) + + def __hash__(self): + return 1001 * hash(self.texname) + hash(self.size) def __eq__(self, other): return self.__class__ == other.__class__ and \ @@ -633,7 +647,7 @@ def _width_of(self, char): width = self._tfm.width.get(char, None) if width is not None: - return _mul2012(width, self._scale) + return _mul2012(width, self.scale) _log.debug('No width for char %d in font %s.', char, self.texname) return 0 @@ -651,7 +665,7 @@ def _height_depth_of(self, char): name, char, self.texname) result.append(0) else: - result.append(_mul2012(value, self._scale)) + result.append(_mul2012(value, self.scale)) return result @@ -1374,7 
+1388,7 @@ def _fontfile(cls, suffix, texname): fPrev = None for x, y, f, c, w in page.text: if f != fPrev: - print('font', f.texname, 'scaled', f._scale/pow(2.0, 20)) + print('font', f.texname, 'scaled', f.scale/pow(2.0, 20)) fPrev = f print(x, y, c, 32 <= c < 128 and chr(c) or '.', w) for x, y, w, h in page.boxes: From 545b4077f46a9fabb283e9e202dd4bed956695a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Fri, 19 Jan 2018 11:39:25 +0200 Subject: [PATCH 6/7] Add a way to represent dvi files in the cache Along with methods to add and query the tables. --- lib/matplotlib/dviread.py | 390 ++++++++++++++++++++++++++- lib/matplotlib/tests/test_dviread.py | 17 ++ 2 files changed, 402 insertions(+), 5 deletions(-) diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index fb69369fc248..cfd0afe647a0 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -27,8 +27,8 @@ import sqlite3 import sys import textwrap - import numpy as np +import zlib from matplotlib import cbook, get_cachedir, rcParams from matplotlib.compat import subprocess @@ -1102,13 +1102,16 @@ class TeXSupportCacheError(Exception): class TeXSupportCache: """A persistent cache of data related to support files related to dvi - files produced by TeX. Currently holds results from :program:`kpsewhich`, - in future versions could hold pre-parsed font data etc. + files produced by TeX. Currently holds results from :program:`kpsewhich` + and the contents of parsed dvi files, in future versions could include + pre-parsed font data etc. 
Usage:: # create or get the singleton instance cache = TeXSupportCache.get_cache() + + # insert and query some pathnames with cache.connection as transaction: cache.update_pathnames( {"pdftex.map": "/usr/local/pdftex.map", @@ -1120,6 +1123,25 @@ class TeXSupportCache: # optional after inserting new data, may improve query performance: cache.optimize() + # insert and query some dvi file contents + with cache.connection as transaction: + id = cache.dvi_new_file("/path/to/foobar.dvi", transaction) + font_ids = cache.dvi_font_sync_ids(['font1', 'font2'], transaction) + cache.dvi_font_sync_metrics(DviFont1, transaction) + cache.dvi_font_sync_metrics(DviFont2, transaction) + for i, box in enumerate(boxes): + cache.dvi_add_box(box, id, 0, i, transaction) + for i, text in enumerate(texts, start=len(boxes)): + cache.dvi_add_text(text, id, 0, i, font_ids['font1'], + transaction) + fonts = cache.dvi_fonts(id) + assert cache.dvi_page_exists(id, 0) + bbox = cache.dvi_page_boundingbox(id, 0) + for box in cache.dvi_page_boxes(id, 0): + handle_box(box) + for text in cache.dvi_page_text(id, 0): + handle_text(text) + Parameters ---------- @@ -1137,7 +1159,7 @@ class TeXSupportCache: """ __slots__ = ('connection') - schema_version = 1 # should match PRAGMA user_version in _create + schema_version = 2 # should match PRAGMA user_version in _create instance = None @classmethod @@ -1177,6 +1199,7 @@ def debug_sql(sql): def _create(self): """Create the database.""" with self.connection as conn: + # kpsewhich results conn.executescript( """ PRAGMA page_size=4096; @@ -1184,7 +1207,50 @@ def _create(self): filename TEXT PRIMARY KEY NOT NULL, pathname TEXT ) WITHOUT ROWID; - PRAGMA user_version=1; + """) + # dvi files + conn.executescript( + """ + CREATE TABLE dvi_file( + id INTEGER PRIMARY KEY, + name UNIQUE NOT NULL, + mtime INTEGER, + size INTEGER + ); + CREATE TABLE dvi_font( + id INTEGER PRIMARY KEY, + texname UNIQUE NOT NULL + ); + CREATE TABLE dvi_font_metrics( + id INTEGER NOT NULL + REFERENCES dvi_font(id) ON
DELETE CASCADE, + scale INTEGER NOT NULL, + widths BLOB NOT NULL, + PRIMARY KEY (id, scale) + ); + CREATE TABLE dvi( + fileid INTEGER NOT NULL + REFERENCES dvi_file(id) ON DELETE CASCADE, + pageno INTEGER NOT NULL, + seq INTEGER NOT NULL, + x INTEGER NOT NULL, + y INTEGER NOT NULL, + height INTEGER NOT NULL, + width INTEGER NOT NULL, + depth INTEGER NOT NULL, + fontid INTEGER, + fontscale INTEGER, + glyph INTEGER, + PRIMARY KEY (fileid, pageno, seq) + ) WITHOUT ROWID; + CREATE TABLE dvi_baseline( + fileid INTEGER NOT NULL + REFERENCES dvi_file(id) ON DELETE CASCADE, + pageno INTEGER NOT NULL, + baseline REAL NOT NULL, + PRIMARY KEY (fileid, pageno) + ) WITHOUT ROWID; + PRAGMA user_version=2; """) def optimize(self): @@ -1231,6 +1297,320 @@ def update_pathnames(self, mapping, transaction): "VALUES (?, ?)", mapping.items()) + # Dvi files + + def dvi_new_file(self, name, transaction): + """Record a dvi file in the cache. + + Parameters + ---------- + name : str + Name of the file to add. + transaction : obtained via the context manager of self.connection + """ + + stat = os.stat(name) + transaction.execute("DELETE FROM dvi_file WHERE name=?", (name,)) + transaction.execute( + "INSERT INTO dvi_file (name, mtime, size) VALUES (?, ?, ?)", + (name, int(stat.st_mtime), int(stat.st_size))) + return transaction.execute("SELECT last_insert_rowid()").fetchone()[0] + + def dvi_id(self, name): + """Query the database identifier of a given dvi file. + + Parameters + ---------- + name : str + Name of the file to query. + + Returns + ------- + int or None + """ + + rows = self.connection.execute( + "SELECT id, mtime, size FROM dvi_file WHERE name=? LIMIT 1", + (name,)).fetchall() + if rows: + id, mtime, size = rows[0] + stat = os.stat(name) + if mtime == int(stat.st_mtime) and size == stat.st_size: + return id + + def dvi_font_sync_ids(self, fontnames, transaction): + """Record dvi fonts in the cache and return their database + identifiers. 
+ + Parameters + ---------- + fontnames : list of str + TeX names of fonts + transaction : obtained via the context manager of self.connection + + Returns + ------- + mapping from texname to int + """ + + transaction.executemany( + "INSERT OR IGNORE INTO dvi_font (texname) VALUES (?)", + ((name,) for name in fontnames)) + fontid = {} + for name in fontnames: + fontid[name], = transaction.execute( + "SELECT id FROM dvi_font WHERE texname=?", + (name,)).fetchone() + return fontid + + def dvi_font_sync_metrics(self, dvifont, transaction): + """Record dvi font metrics in the cache. + + Parameters + ---------- + dvifont : DviFont + transaction : obtained via the context manager of self.connection + """ + + exists = bool(transaction.execute(""" + SELECT 1 FROM dvi_font_metrics m, dvi_font f + WHERE m.id=f.id AND f.texname=:texname + AND m.scale=:scale LIMIT 1 + """, { + "texname": dvifont.texname.decode('ascii'), + "scale": dvifont.scale + }).fetchall()) + + if not exists: + # Widths are given in 32-bit words in tfm, although the normal + # range is around 1000 units. This and the repetition of values + # make the width data very compressible. + widths = struct.pack('<{}I'.format(len(dvifont.widths)), + *dvifont.widths) + widths = zlib.compress(widths, 9) + transaction.execute(""" + INSERT INTO dvi_font_metrics (id, scale, widths) + SELECT id, :scale, :widths FROM dvi_font WHERE texname=:texname + """, { + "texname": dvifont.texname.decode('ascii'), + "scale": dvifont.scale, + "widths": widths + }) + + def dvi_fonts(self, fileid): + """Query the dvi fonts of a given dvi file. + + Parameters + ---------- + fileid : int + File identifier as returned by dvi_id + + Returns + ------- + mapping from (str, int) to DviFont + Maps from (TeX name, scale) to DviFont objects. + """ + + rows = self.connection.execute(""" + SELECT texname, fontscale, widths FROM + (SELECT DISTINCT fontid, fontscale FROM dvi WHERE fileid=?)
d + JOIN dvi_font f ON (d.fontid=f.id) + JOIN dvi_font_metrics m ON (d.fontid=m.id AND d.fontscale=m.scale) + """, (fileid,)).fetchall() + + def decode(widths): + data = zlib.decompress(widths) + n = len(data) // 4 + return struct.unpack('<{}I'.format(n), data) + + return {(row['texname'], row['fontscale']): + DviFont(texname=row['texname'].encode('ascii'), + scale=row['fontscale'], + widths=decode(row['widths']), + tfm=None, vf=None) + for row in rows} + + def dvi_add_box(self, box, fileid, pageno, seq, transaction): + """Record a box object of a dvi file. + + Parameters + ---------- + box : Box + fileid : int + As returned by dvi_id + pageno : int + Page number + seq : int + Used to order the boxes + transaction : obtained via the context manager of self.connection + """ + + transaction.execute(""" + INSERT INTO dvi ( + fileid, pageno, seq, x, y, height, width, depth + ) VALUES (:fileid, :pageno, :seq, :x, :y, :height, :width, 0) + """, { + "fileid": fileid, "pageno": pageno, "seq": seq, + "x": box.x, "y": box.y, "height": box.height, "width": box.width + }) + + def dvi_add_text(self, text, fileid, pageno, seq, fontid, transaction): + """Record a text object of a dvi file.
+ + Parameters + ---------- + text : Text + fileid : int + As returned by dvi_id + pageno : int + Page number + seq : int + Used to order the boxes + fontid : int + As returned by dvi_font_sync_ids + transaction : obtained via the context manager of self.connection + """ + + height, depth = text.font._height_depth_of(text.glyph) + transaction.execute(""" + INSERT INTO dvi ( + fileid, pageno, seq, + x, y, height, width, depth, fontid, fontscale, glyph + ) VALUES ( + :fileid, :pageno, :seq, + :x, :y, :height, :width, :depth, :fontid, :fontscale, :glyph + ) + """, { + "fileid": fileid, "pageno": pageno, "seq": seq, + "x": text.x, "y": text.y, "width": text.width, + "height": height, "depth": depth, + "fontid": fontid, "fontscale": text.font.scale, "glyph": text.glyph + }) + + def dvi_page_exists(self, fileid, pageno): + """Query if a page exists in the dvi file. + + Parameters + ---------- + fileid : int + As returned by dvi_id + pageno : int + Page number + + Returns + ------- + boolean + """ + return bool(self.connection.execute( + "SELECT 1 FROM dvi WHERE fileid=? AND pageno=? LIMIT 1", + (fileid, pageno)).fetchall()) + + def dvi_page_boundingbox(self, fileid, pageno): + """Query the bounding box of a page + + Parameters + ---------- + fileid : int + As returned by dvi_id + pageno + Page number + + Returns + ------- + A namedtuple-like object with fields min_x, min_y, max_x, + max_y and max_y_pure (like max_y but ignores depth). + """ + + return self.connection.execute(""" + SELECT min(x) min_x, + min(y - height) min_y, + max(x + width) max_x, + max(y + depth) max_y, + max(y) max_y_pure + FROM dvi WHERE fileid=? AND pageno=?
+ """, (fileid, pageno)).fetchone() + + def dvi_page_boxes(self, fileid, pageno): + """Query the boxes of a page + + Parameters + ---------- + fileid : int + As returned by dvi_id + pageno + Page number + + Returns + ------- + A list of (x, y, height, width) tuples of boxes + """ + + return self.connection.execute(""" + SELECT x, y, height, width FROM dvi + WHERE fileid=? AND pageno=? AND fontid IS NULL ORDER BY seq + """, (fileid, pageno)).fetchall() + + def dvi_page_text(self, fileid, pageno): + """Query the text of a page + + Parameters + ---------- + fileid : int + As returned by dvi_id + pageno + Page number + + Returns + ------- + A list of (x, y, height, width, depth, texname, fontscale, glyph) + tuples of text + """ + + return self.connection.execute(""" + SELECT x, y, height, width, depth, f.texname, fontscale, glyph + FROM dvi JOIN dvi_font f ON (dvi.fontid=f.id) + WHERE fileid=? AND pageno=? AND fontid IS NOT NULL ORDER BY seq + """, (fileid, pageno)).fetchall() + + def dvi_add_baseline(self, fileid, pageno, baseline, transaction): + """Record the baseline of a dvi page + + Parameters + ---------- + fileid : int + As returned by dvi_id + pageno : int + Page number + baseline : float + transaction : obtained via the context manager of self.connection + """ + + transaction.execute(""" + INSERT INTO dvi_baseline (fileid, pageno, baseline) + VALUES (:fileid, :pageno, :baseline) + """, {"fileid": fileid, "pageno": pageno, "baseline": baseline}) + + def dvi_get_baseline(self, fileid, pageno): + """Query the baseline of a dvi page + + Parameters + ---------- + fileid : int + As returned by dvi_id + pageno : int + Page number + + Returns + ------- + float or None + """ + + rows = self.connection.execute( + "SELECT baseline FROM dvi_baseline WHERE fileid=? AND pageno=?", + (fileid, pageno)).fetchall() + if rows: + return rows[0][0] + def find_tex_files(filenames, cache=None): """Find multiple files in the texmf tree.
This can be more efficient diff --git a/lib/matplotlib/tests/test_dviread.py b/lib/matplotlib/tests/test_dviread.py index 6091c106db22..3d61a4140eb3 100644 --- a/lib/matplotlib/tests/test_dviread.py +++ b/lib/matplotlib/tests/test_dviread.py @@ -111,6 +111,23 @@ def test_TeXSupportCache(tmpdir): assert cache.get_pathnames(['xyzzy', 'fontfile']) == \ {'xyzzy': '/xyzzy.dat', 'fontfile': None} + # check that modifying a dvi file invalidates the cache + filename = str(tmpdir / "file.dvi") + with open(filename, "wb") as f: + f.write(b'qwerty') + os.utime(filename, (0, 0)) + with cache.connection as t: + id1 = cache.dvi_new_file(filename, t) + assert cache.dvi_id(filename) == id1 + + with open(filename, "wb") as f: + f.write(b'asfdg') + os.utime(filename, (0, 0)) + assert cache.dvi_id(filename) is None + with cache.connection as t: + id2 = cache.dvi_new_file(filename, t) + assert cache.dvi_id(filename) == id2 + def test_TeXSupportCache_versioning(tmpdir): dbfile = str(tmpdir / "test.db") From 2418413fcef3619ec1ce300cb2bad31d586c5285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Fri, 19 Jan 2018 11:44:10 +0200 Subject: [PATCH 7/7] Implement reading dvi files into the cache Rename the Dvi class to _DviReader and use it only for storing the files into the cache. The new Dvi class reads from the cache, after calling _DviReader to insert the file into it. 
--- lib/matplotlib/dviread.py | 253 +++++++++++++++------------ lib/matplotlib/tests/test_dviread.py | 10 +- 2 files changed, 144 insertions(+), 119 deletions(-) diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index cfd0afe647a0..d40d9ac244a3 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -20,6 +20,7 @@ from collections import namedtuple import enum from functools import lru_cache, partial, wraps +from itertools import chain import logging import os import re @@ -168,28 +169,23 @@ def wrapper(self, byte): return decorate -class Dvi(object): - """ - A reader for a dvi ("device-independent") file, as produced by TeX. - The current implementation can only iterate through pages in order. +def _keep(func, keys): + """Return mapping from each k in keys to func(k) + such that func(k) is not None""" + return dict((k, v) for k, v in zip(keys, map(func, keys)) if v is not None) - This class can be used as a context manager to close the underlying - file upon exit. Pages can be read via iteration. Here is an overly - simple way to extract text without trying to detect whitespace:: - >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi: - >>> for page in dvi: - >>> print(''.join(chr(t.glyph) for t in page.text)) +class _DviReader(object): + """ + A reader for a dvi ("device-independent") file, as produced by TeX. + This implementation is only used to store the file in a cache, from + which it is read by Dvi. Parameters ---------- filename : str dvi file to read - dpi : number or None - Dots per inch, can be floating-point; this affects the - coordinates returned. Use None to get TeX's internal units - which are likely only useful for debugging. cache : TeXSupportCache instance, optional Support file cache instance, defaults to the TeXSupportCache singleton. 
@@ -198,28 +194,28 @@ class Dvi(object): _dtable = [None] * 256 _dispatch = partial(_dispatch, _dtable) - def __init__(self, filename, dpi, cache=None): - """ - Read the data from the file named *filename* and convert - TeX's internal units to units of *dpi* per inch. - *dpi* only sets the units and does not limit the resolution. - Use None to return TeX's internal units. - """ + def __init__(self, filename, cache=None): _log.debug('Dvi: %s', filename) if cache is None: cache = TeXSupportCache.get_cache() self.cache = cache self.file = open(filename, 'rb') - self.dpi = dpi self.fonts = {} + self.recursive_fonts = set() self.state = _dvistate.pre self.baseline = self._get_baseline(filename) - self.fontnames = sorted(set(self._read_fonts())) + self.fontnames = set(self._read_fonts()) # populate kpsewhich cache with font pathnames find_tex_files([x + suffix for x in self.fontnames for suffix in ('.tfm', '.vf', '.pfb')], cache) - cache.optimize() + self._tfm = _keep(_tfmfile, self.fontnames) + self._vf = _keep(_vffile, self.fontnames) + for vf in self._vf.values(): + self.fontnames.update(vf.fontnames) + + def close(self): + self.file.close() def _get_baseline(self, filename): if rcParams['text.latex.preview']: @@ -232,88 +228,32 @@ def _get_baseline(self, filename): return float(depth) return None - def __enter__(self): - """ - Context manager enter method, does nothing. - """ - return self - - def __exit__(self, etype, evalue, etrace): - """ - Context manager exit method, closes the underlying file if it is open. - """ - self.close() - - def __iter__(self): - """ - Iterate through the pages of the file. - - Yields - ------ - Page - Details of all the text and box objects on the page. - The Page tuple contains lists of Text and Box tuples and - the page dimensions, and the Text and Box tuples contain - coordinates transformed into a standard Cartesian - coordinate system at the dpi value given when initializing. 
- The coordinates are floating point numbers, but otherwise - precision is not lost and coordinate values are not clipped to - integers. - """ - while True: - have_page = self._read() - if have_page: - yield self._output() - else: - break - - def close(self): - """ - Close the underlying file if it is open. - """ - if not self.file.closed: - self.file.close() - - def _output(self): - """ - Output the text and boxes belonging to the most recent page. - page = dvi._output() - """ - minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf - maxy_pure = -np.inf - for elt in self.text + self.boxes: - if isinstance(elt, Box): - x, y, h, w = elt - e = 0 # zero depth - else: # glyph - x, y, font, g, w = elt - h, e = font._height_depth_of(g) - minx = min(minx, x) - miny = min(miny, y - h) - maxx = max(maxx, x + w) - maxy = max(maxy, y + e) - maxy_pure = max(maxy_pure, y) - - if self.dpi is None: - # special case for ease of debugging: output raw dvi coordinates - return Page(text=self.text, boxes=self.boxes, - width=maxx-minx, height=maxy_pure-miny, - descent=maxy-maxy_pure) - - # convert from TeX's "scaled points" to dpi units - d = self.dpi / (72.27 * 2**16) - if self.baseline is None: - descent = (maxy - maxy_pure) * d - else: - descent = self.baseline - - text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d) - for (x, y, f, g, w) in self.text] - boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d) - for (x, y, h, w) in self.boxes] - - return Page(text=text, boxes=boxes, width=(maxx-minx)*d, - height=(maxy_pure-miny)*d, descent=descent) + def store(self): + c = self.cache + with c.connection as t: + fileid = c.dvi_new_file(self.file.name, t) + _log.debug('fontnames is %s', self.fontnames) + fontid = c.dvi_font_sync_ids(self.fontnames, t) + + pageno = 0 + while True: + if not self._read(): + break + for seq, elt in enumerate(self.text + self.boxes): + if isinstance(elt, Box): + c.dvi_add_box(elt, fileid, pageno, seq, t) + else: + texname = 
elt.font.texname.decode('ascii') + c.dvi_add_text(elt, fileid, pageno, seq, + fontid[texname], t) + pageno += 1 + + for dvifont in chain(self.recursive_fonts, self.fonts.values()): + c.dvi_font_sync_metrics(dvifont, t) + if self.baseline is not None: + c.dvi_add_baseline(fileid, 0, self.baseline, t) + c.optimize() + return fileid def _read_fonts(self): """Read the postamble of the file and return a list of fonts used.""" @@ -360,6 +300,8 @@ def _read_fonts(self): _arg(1, False, self, None), _arg(1, False, self, None)) fontname = file.read(a + length)[-length:].decode('ascii') + _log.debug('dvi._read_fonts(%s): encountered %s', + self.file.name, fontname) fonts.append(fontname) elif byte == 249: break @@ -426,6 +368,7 @@ def _put_char_real(self, char): for x, y, f, g, w in font._vf[char].text: newf = DviFont(scale=_mul2012(scale, f.scale), tfm=f._tfm, texname=f.texname, vf=f._vf) + self.recursive_fonts.add(newf) self.text.append(Text(self.h + _mul2012(x, scale), self.v + _mul2012(y, scale), newf, g, newf._width_of(g))) @@ -522,14 +465,12 @@ def _fnt_def(self, k, c, s, d, a, l): def _fnt_def_real(self, k, c, s, d, a, l): n = self.file.read(a + l) fontname = n[-l:].decode('ascii') - tfm = _tfmfile(fontname) + tfm = self._tfm.get(fontname) if tfm is None: raise FileNotFoundError("missing font metrics file: %s" % fontname) if c != 0 and tfm.checksum != 0 and c != tfm.checksum: raise ValueError('tfm checksum mismatch: %s' % n) - - vf = _vffile(fontname) - + vf = self._vf.get(fontname) self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf) @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1')) @@ -669,7 +610,89 @@ def _height_depth_of(self, char): return result -class Vf(Dvi): +class Dvi(object): + """ + A representation of a dvi ("device-independent") file, as produced by TeX. 
+ + Parameters + ---------- + + filename : str + dpi : float or None + cache : TeXSupportCache, optional + + Attributes + ---------- + + filename : str + dpi : float or None + cache : TeXSupportCache + + + """ + def __init__(self, filename, dpi, cache=None): + if cache is None: + cache = TeXSupportCache.get_cache() + self.cache = cache + self.filename = filename + self.dpi = dpi + self._filename_id = cache.dvi_id(filename) + if self._filename_id is None: + self._filename_id = _DviReader(filename, cache).store() + self._fonts = cache.dvi_fonts(self._filename_id) + + def __enter__(self): + return self + + def __exit__(self, etype, evalue, etrace): + pass + + def __getitem__(self, pageno): + if self.cache.dvi_page_exists(self._filename_id, pageno): + return self._output(pageno) + raise IndexError + + def _output(self, page): + extrema = self.cache.dvi_page_boundingbox(self._filename_id, page) + min_x, min_y, max_x, max_y, max_y_pure = ( + extrema[n] for n in ('min_x', 'min_y', 'max_x', + 'max_y', 'max_y_pure')) + boxes = self.cache.dvi_page_boxes(self._filename_id, page) + text = self.cache.dvi_page_text(self._filename_id, page) + baseline = self.cache.dvi_get_baseline(self._filename_id, page) + if self.dpi is None: + return Page(text=[Text(x=row['x'], y=row['y'], + font=self._fonts[(row['texname'], + row['fontscale'])], + glyph=row['glyph'], width=row['width']) + for row in text], + boxes=[Box(x=row['x'], y=row['y'], + height=row['height'], width=row['width']) + for row in boxes], + width=max_x-min_x, + height=max_y_pure-min_y, + descent=max_y-max_y_pure) + d = self.dpi / (72.27 * 2**16) + descent = \ + baseline if baseline is not None else (max_y - max_y_pure) * d + + return Page(text=[Text((row['x'] - min_x) * d, + (max_y - row['y']) * d - descent, + self._fonts[(row['texname'], row['fontscale'])], + row['glyph'], + row['width'] * d) + for row in text], + boxes=[Box((row['x'] - min_x) * d, + (max_y - row['y']) * d - descent, + row['height'] * d, + row['width'] * d) 
+ for row in boxes], + width=(max_x - min_x) * d, + height=(max_y_pure - min_y) * d, + descent=descent) + + +class Vf(_DviReader): """ A virtual font (\\*.vf file) containing subroutines for dvi files. @@ -693,12 +716,12 @@ class Vf(Dvi): The virtual font format is a derivative of dvi: http://mirrors.ctan.org/info/knuth/virtual-fonts - This class reuses some of the machinery of `Dvi` + This class reuses some of the machinery of `_DviReader` but replaces the `_read` loop and dispatch mechanism. """ def __init__(self, filename, cache=None): - Dvi.__init__(self, filename, dpi=0, cache=cache) + _DviReader.__init__(self, filename, cache=cache) try: self._first_font = None self._chars = {} @@ -723,6 +746,8 @@ def _read_fonts(self): _, _, _, a, length = [self._arg(x) for x in (4, 4, 4, 1, 1)] fontname = self.file.read(a + length)[-length:].decode('ascii') fonts.append(fontname) + _log.debug('Vf._read_fonts(%s): encountered %s', + self.file.name, fontname) elif byte == 247: _, k = self._arg(1), self._arg(1) _ = self.file.read(k) @@ -752,7 +777,7 @@ def _read(self): if byte in (139, 140) or byte >= 243: raise ValueError( "Inappropriate opcode %d in vf file" % byte) - Dvi._dtable[byte](self, byte) + _DviReader._dtable[byte](self, byte) continue # We are outside a packet diff --git a/lib/matplotlib/tests/test_dviread.py b/lib/matplotlib/tests/test_dviread.py index 3d61a4140eb3..e3986ebb1c5f 100644 --- a/lib/matplotlib/tests/test_dviread.py +++ b/lib/matplotlib/tests/test_dviread.py @@ -80,11 +80,11 @@ def test_dviread(): @skip_if_command_unavailable(["kpsewhich", "-version"]) def test_dviread_get_fonts(): dir = os.path.join(os.path.dirname(__file__), 'baseline_images', 'dviread') - with dr.Dvi(os.path.join(dir, 'test.dvi'), None) as dvi: - assert dvi.fontnames == \ - ['cmex10', 'cmmi10', 'cmmi5', 'cmr10', 'cmr5', 'cmr7'] - with dr.Vf(os.path.join(dir, 'virtual.vf')) as vf: - assert vf.fontnames == ['cmex10', 'cmr10'] + dvi = dr._DviReader(os.path.join(dir, 'test.dvi'), None) 
+ assert dvi.fontnames == \ + {'cmex10', 'cmmi10', 'cmmi5', 'cmr10', 'cmr5', 'cmr7'} + vf = dr.Vf(os.path.join(dir, 'virtual.vf')) + assert vf.fontnames == {'cmex10', 'cmr10'} def test_dviread_get_fonts_error_handling():