From 645c8da7506ef235ba8ca4f17ca13463a4ca6284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Fri, 12 Jan 2018 16:02:36 +0200 Subject: [PATCH 1/3] Cache kpsewhich results persistently And allow batching them. This commit does not yet use the batching but makes it possible. --- doc/api/next_api_changes/2018-02-16-JKS.rst | 8 + doc/users/next_whats_new/texsupport_cache.rst | 22 ++ lib/matplotlib/dviread.py | 251 ++++++++++++++++-- lib/matplotlib/tests/test_dviread.py | 84 ++++++ 4 files changed, 347 insertions(+), 18 deletions(-) create mode 100644 doc/api/next_api_changes/2018-02-16-JKS.rst create mode 100644 doc/users/next_whats_new/texsupport_cache.rst diff --git a/doc/api/next_api_changes/2018-02-16-JKS.rst b/doc/api/next_api_changes/2018-02-16-JKS.rst new file mode 100644 index 000000000000..f38ad6d50932 --- /dev/null +++ b/doc/api/next_api_changes/2018-02-16-JKS.rst @@ -0,0 +1,8 @@ +dviread changes +--------------- + +The ``format`` keyword argument to ``dviread.find_tex_file`` has been +deprecated. The function without the ``format`` argument, as well as +the new ``dviread.find_tex_files`` function, cache their results in +``texsupport.N.db`` in the cache directory to speed up dvi file +processing. diff --git a/doc/users/next_whats_new/texsupport_cache.rst b/doc/users/next_whats_new/texsupport_cache.rst new file mode 100644 index 000000000000..b823e962a1d9 --- /dev/null +++ b/doc/users/next_whats_new/texsupport_cache.rst @@ -0,0 +1,22 @@ +TeX support cache +----------------- + +The `usetex` feature sends snippets of TeX code to LaTeX and related +external tools for processing. This causes a nontrivial number of +helper processes to be spawned, which can be slow on some platforms. +A new cache database helps reduce the need to spawn these helper +processes, which should improve `usetex` processing speed. 
+ +The new cache files +~~~~~~~~~~~~~~~~~~~ + +The cache database is stored in a file named `texsupport.N.db` in the +standard cache directory (traditionally `$HOME/.matplotlib` but +possibly `$HOME/.cache/matplotlib`), where `N` stands for a version +number. The version number is incremented when new kinds of items are +added to the caching code, in order to avoid version clashes when +using multiple different versions of Matplotlib. The auxiliary files +`texsupport.N.db-wal` and `texsupport.N.db-shm` help coordinate usage +of the cache between concurrently running instances. All of these +cache files may be deleted when Matplotlib is not running, and +subsequent calls to the `usetex` code will recompute the TeX results. diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index e0048d8b8c3f..52a66d276e86 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -24,12 +24,13 @@ import os import re import struct +import sqlite3 import sys import textwrap import numpy as np -from matplotlib import cbook, rcParams +from matplotlib import cbook, get_cachedir, rcParams from matplotlib.compat import subprocess _log = logging.getLogger(__name__) @@ -980,45 +981,259 @@ def _parse(self, file): return re.findall(br'/([^][{}<>\s]+)', data) -def find_tex_file(filename, format=None): +class TeXSupportCacheError(Exception): + pass + + +class TeXSupportCache: + """A persistent cache of data related to support files related to dvi + files produced by TeX. Currently holds results from :program:`kpsewhich`, + in future versions could hold pre-parsed font data etc. 
+ + Usage:: + + # create or get the singleton instance + cache = TeXSupportCache.get_cache() + with cache.connection as transaction: + cache.update_pathnames( + {"pdftex.map": "/usr/local/pdftex.map", + "cmsy10.pfb": "/usr/local/fonts/cmsy10.pfb"}, + transaction) + pathnames = cache.get_pathnames(["pdftex.map", "cmr10.pfb"]) + # now pathnames = {"pdftex.map": "/usr/local/pdftex.map"} + + # optional after inserting new data, may improve query performance: + cache.optimize() + + Parameters + ---------- + + filename : str, optional + File in which to store the cache. Defaults to `texsupport.N.db` in + the standard cache directory where N is the current schema version. + + Attributes + ---------- + + connection + This database connection object has a context manager to set up + a transaction. Transactions are passed into methods that write to + the database. """ - Find a file in the texmf tree. + + __slots__ = ('connection') + schema_version = 1 # should match PRAGMA user_version in _create + instance = None + + @classmethod + def get_cache(cls): + "Return the singleton instance of the cache, at the default location" + if cls.instance is None: + cls.instance = cls() + return cls.instance + + def __init__(self, filename=None): + if filename is None: + filename = os.path.join(get_cachedir(), 'texsupport.%d.db' + % self.schema_version) + + self.connection = sqlite3.connect( + filename, isolation_level="DEFERRED") + with self.connection as conn: + conn.execute("PRAGMA journal_mode=WAL;") + version, = conn.execute("PRAGMA user_version;").fetchone() + + if version == 0: + self._create() + elif version != self.schema_version: + raise TeXSupportCacheError( + "support database %s has version %d, expected %d" + % (filename, version, self.schema_version)) + + def _create(self): + """Create the database.""" + with self.connection as conn: + conn.executescript( + """ + PRAGMA page_size=4096; + CREATE TABLE file_path( + filename TEXT PRIMARY KEY NOT NULL, + pathname TEXT + ) WITHOUT 
ROWID; + PRAGMA user_version=1; + """) + + def optimize(self): + """Optional optimization phase after updating data. + Executes sqlite's `PRAGMA optimize` statement, which can call + `ANALYZE` or other functions that can improve future query performance + by spending some time up-front.""" + with self.connection as conn: + conn.execute("PRAGMA optimize;") + + def get_pathnames(self, filenames): + """Query the cache for pathnames related to `filenames`. + + Parameters + ---------- + filenames : iterable of str + + Returns + ------- + mapping from str to (str or None) + For those filenames that exist in the cache, the mapping + includes either the related pathname or None to indicate that + the named file does not exist. + """ + rows = self.connection.execute( + "SELECT filename, pathname FROM file_path WHERE filename IN " + "(%s)" + % ','.join('?' for _ in filenames), + filenames).fetchall() + return {filename: pathname for (filename, pathname) in rows} + + def update_pathnames(self, mapping, transaction): + """Update the cache with the given filename-to-pathname mapping + + Parameters + ---------- + mapping : mapping from str to (str or None) + Mapping from filenames to the corresponding full pathnames + or None to indicate that the named file does not exist. + transaction : obtained via the context manager of self.connection + """ + transaction.executemany( + "INSERT OR REPLACE INTO file_path (filename, pathname) " + "VALUES (?, ?)", + mapping.items()) + + +def find_tex_files(filenames, cache=None): + """Find multiple files in the texmf tree. This can be more efficient + than `find_tex_file` because it makes only one call to `kpsewhich`. Calls :program:`kpsewhich` which is an interface to the kpathsea library [1]_. Most existing TeX distributions on Unix-like systems use kpathsea. It is also available as part of MikTeX, a popular distribution on Windows. + The results are cached into the TeX support database. 
In case of + mistaken results, deleting the database resets the cache. + Parameters ---------- filename : string or bytestring - format : string or bytestring - Used as the value of the `--format` option to :program:`kpsewhich`. - Could be e.g. 'tfm' or 'vf' to limit the search to that type of files. + cache : TeXSupportCache, optional + Cache instance to use, defaults to the singleton instance of the class. References ---------- .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_ The library that :program:`kpsewhich` is part of. + """ # we expect these to always be ascii encoded, but use utf-8 # out of caution - if isinstance(filename, bytes): - filename = filename.decode('utf-8', errors='replace') - if isinstance(format, bytes): - format = format.decode('utf-8', errors='replace') + filenames = [f.decode('utf-8', errors='replace') + if isinstance(f, bytes) else f + for f in filenames] + if cache is None: + cache = TeXSupportCache.get_cache() + result = cache.get_pathnames(filenames) + + filenames = [f for f in filenames if f not in result] + if not filenames: + return result - cmd = ['kpsewhich'] - if format is not None: - cmd += ['--format=' + format] - cmd += [filename] - _log.debug('find_tex_file(%s): %s', filename, cmd) + cmd = ['kpsewhich'] + list(filenames) + _log.debug('find_tex_files: %s', cmd) pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE) - result = pipe.communicate()[0].rstrip() - _log.debug('find_tex_file result: %s', result) - return result.decode('ascii') + output = pipe.communicate()[0].decode('ascii').splitlines() + _log.debug('find_tex_files result: %s', output) + mapping = _match(filenames, output) + with cache.connection as transaction: + cache.update_pathnames(mapping, transaction) + result.update(mapping) + + return result + + +def _match(filenames, pathnames): + """ + Match filenames to pathnames in lists that are in matching order, + except that some filenames may lack pathnames. 
+ """ + result = {f: None for f in filenames} + filenames, pathnames = iter(filenames), iter(pathnames) + try: + filename, pathname = next(filenames), next(pathnames) + while True: + if pathname.endswith(os.path.sep + filename): + result[filename] = pathname + pathname = next(pathnames) + filename = next(filenames) + except StopIteration: + return result + + +def find_tex_file(filename, format=None, cache=None): + """ + Find a file in the texmf tree. + + Calls :program:`kpsewhich` which is an interface to the kpathsea + library [1]_. Most existing TeX distributions on Unix-like systems use + kpathsea. It is also available as part of MikTeX, a popular + distribution on Windows. + + The results are cached into a database whose location defaults to + :file:`texsupport.N.db` (N is the schema version) in the cache directory. + In case of mistaken results, deleting this file resets the cache. + + Parameters + ---------- + filename : string or bytestring + format : string or bytestring, DEPRECATED + Used as the value of the `--format` option to :program:`kpsewhich`. + Could be e.g. 'tfm' or 'vf' to limit the search to that type of files. + Deprecated to allow batching multiple filenames into one kpsewhich + call, since any format option would apply to all filenames at once. + cache : TeXSupportCache, optional + Cache instance to use, defaults to the singleton instance of the class. + + References + ---------- + + .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_ + The library that :program:`kpsewhich` is part of. + """ + + if format is not None: + cbook.warn_deprecated( + "3.0", + "The format option to find_tex_file is deprecated " + "to allow batching multiple filenames into one call. 
" + "Omitting the option should not change the result, as " + "kpsewhich uses the filename extension to choose the path.") + # we expect these to always be ascii encoded, but use utf-8 + # out of caution + if isinstance(filename, bytes): + filename = filename.decode('utf-8', errors='replace') + if isinstance(format, bytes): + format = format.decode('utf-8', errors='replace') + + cmd = ['kpsewhich'] + if format is not None: + cmd += ['--format=' + format] + cmd += [filename] + _log.debug('find_tex_file(%s): %s', filename, cmd) + pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE) + result = pipe.communicate()[0].rstrip() + _log.debug('find_tex_file result: %s', result) + return result.decode('ascii') + + return list(find_tex_files([filename], cache).values())[0] # With multiple text objects per figure (e.g., tick labels) we may end diff --git a/lib/matplotlib/tests/test_dviread.py b/lib/matplotlib/tests/test_dviread.py index 6b005fd34170..4a5b924a2312 100644 --- a/lib/matplotlib/tests/test_dviread.py +++ b/lib/matplotlib/tests/test_dviread.py @@ -1,9 +1,16 @@ from matplotlib.testing.decorators import skip_if_command_unavailable +try: + from unittest import mock +except ImportError: + import mock + import matplotlib.dviread as dr import os.path import json import pytest +import sqlite3 +import warnings def test_PsfontsMap(monkeypatch): @@ -68,3 +75,80 @@ def test_dviread(): 'boxes': [[b.x, b.y, b.height, b.width] for b in page.boxes]} for page in dvi] assert data == correct + + +def test_TeXSupportCache(tmpdir): + dbfile = str(tmpdir / "test.db") + cache = dr.TeXSupportCache(filename=dbfile) + assert cache.get_pathnames(['foo', 'bar']) == {} + with cache.connection as transaction: + cache.update_pathnames({'foo': '/tmp/foo', + 'xyzzy': '/xyzzy.dat', + 'fontfile': None}, transaction) + assert cache.get_pathnames(['foo', 'bar']) == {'foo': '/tmp/foo'} + assert cache.get_pathnames(['xyzzy', 'fontfile']) == \ + {'xyzzy': '/xyzzy.dat', 'fontfile': None} + + +def 
test_TeXSupportCache_versioning(tmpdir): + dbfile = str(tmpdir / "test.db") + cache1 = dr.TeXSupportCache(dbfile) + with cache1.connection as transaction: + cache1.update_pathnames({'foo': '/tmp/foo'}, transaction) + + with sqlite3.connect(dbfile, isolation_level="DEFERRED") as conn: + conn.executescript('PRAGMA user_version=1000000000;') + + with pytest.raises(dr.TeXSupportCacheError): + cache2 = dr.TeXSupportCache(dbfile) + + +def test_find_tex_files(tmpdir): + with mock.patch('matplotlib.dviread.subprocess.Popen') as mock_popen: + mock_proc = mock.Mock() + stdout = '{s}tmp{s}foo.pfb\n{s}tmp{s}bar.map\n'.\ + format(s=os.path.sep).encode('ascii') + mock_proc.configure_mock(**{'communicate.return_value': (stdout, b'')}) + mock_popen.return_value = mock_proc + + # first call uses the results from kpsewhich + cache = dr.TeXSupportCache(filename=str(tmpdir / "test.db")) + assert dr.find_tex_files( + ['foo.pfb', 'cmsy10.pfb', 'bar.tmp', 'bar.map'], cache) \ + == {'foo.pfb': '{s}tmp{s}foo.pfb'.format(s=os.path.sep), + 'bar.map': '{s}tmp{s}bar.map'.format(s=os.path.sep), + 'cmsy10.pfb': None, 'bar.tmp': None} + assert mock_popen.called + + # second call (subset of the first one) uses only the cache + mock_popen.reset_mock() + assert dr.find_tex_files(['foo.pfb', 'cmsy10.pfb'], cache) \ + == {'foo.pfb': '{s}tmp{s}foo.pfb'.format(s=os.path.sep), + 'cmsy10.pfb': None} + assert not mock_popen.called + + # third call (includes more than the first one) uses kpsewhich again + mock_popen.reset_mock() + stdout = '{s}usr{s}local{s}cmr10.tfm\n'.\ + format(s=os.path.sep).encode('ascii') + mock_proc.configure_mock(**{'communicate.return_value': (stdout, b'')}) + mock_popen.return_value = mock_proc + assert dr.find_tex_files(['foo.pfb', 'cmr10.tfm'], cache) == \ + {'foo.pfb': '{s}tmp{s}foo.pfb'.format(s=os.path.sep), + 'cmr10.tfm': '{s}usr{s}local{s}cmr10.tfm'.format(s=os.path.sep)} + assert mock_popen.called + + +def test_find_tex_file_format(): + with 
mock.patch('matplotlib.dviread.subprocess.Popen') as mock_popen: + mock_proc = mock.Mock() + stdout = b'/foo/bar/baz\n' + mock_proc.configure_mock(**{'communicate.return_value': (stdout, b'')}) + mock_popen.return_value = mock_proc + + warnings.filterwarnings( + 'ignore', + 'The format option to find_tex_file is deprecated.*', + UserWarning) + assert dr.find_tex_file('foobar', format='tfm') == '/foo/bar/baz' + assert mock_popen.called From 2124ac8bcfe9a9d0d096cc21da40415301b6f653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Fri, 16 Feb 2018 18:00:39 +0200 Subject: [PATCH 2/3] Include next_whats_new/* again --- doc/users/whats_new.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/users/whats_new.rst b/doc/users/whats_new.rst index 648daabc7c5b..3c5fdc23af73 100644 --- a/doc/users/whats_new.rst +++ b/doc/users/whats_new.rst @@ -14,12 +14,12 @@ revision, see the :ref:`github-stats`. .. For a release, add a new section after this, then comment out the include and toctree below by indenting them. Uncomment them after the release. - .. include:: next_whats_new/README.rst - .. toctree:: - :glob: - :maxdepth: 1 +.. include:: next_whats_new/README.rst +.. 
toctree:: + :glob: + :maxdepth: 1 - next_whats_new/* + next_whats_new/* New in Matplotlib 2.2 From 3ce3061a018a622ff9a7b2e113a53c8d00a0a548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Sun, 18 Feb 2018 14:47:20 +0200 Subject: [PATCH 3/3] Enable some sqlite and pysqlite options - synchronous=normal (fewer disk writes, still safe in WAL mode) - foreign key enforcement - log sql statements at debug level - use sqlite3.Row (enables accessing columns by name) --- lib/matplotlib/dviread.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index 52a66d276e86..77a930de990f 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -1039,8 +1039,17 @@ def __init__(self, filename=None): self.connection = sqlite3.connect( filename, isolation_level="DEFERRED") + if _log.isEnabledFor(logging.DEBUG): + def debug_sql(sql): + _log.debug(' '.join(sql.splitlines()).strip()) + self.connection.set_trace_callback(debug_sql) + self.connection.row_factory = sqlite3.Row with self.connection as conn: - conn.execute("PRAGMA journal_mode=WAL;") + conn.executescript(""" + PRAGMA journal_mode=WAL; + PRAGMA synchronous=NORMAL; + PRAGMA foreign_keys=ON; + """) version, = conn.execute("PRAGMA user_version;").fetchone() if version == 0: