From 0cac4142bb41474294ea7f3346edd0bbbfec0481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Thu, 30 Jul 2020 17:38:44 +0300 Subject: [PATCH 1/4] Proof of concept: Type42 subsetting in pdf --- lib/matplotlib/backends/backend_pdf.py | 47 +++++++++++++++++++------- lib/matplotlib/testing/conftest.py | 2 +- setup.py | 1 + 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index e8e8ece1310c..f6875564ecad 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -15,11 +15,13 @@ import os import re import struct +import tempfile import time import types import warnings import zlib +from fontTools import subset import numpy as np from PIL import Image @@ -36,7 +38,7 @@ import matplotlib.type1font as type1font import matplotlib.dviread as dviread from matplotlib.ft2font import (FIXED_WIDTH, ITALIC, LOAD_NO_SCALE, - LOAD_NO_HINTING, KERNING_UNFITTED) + LOAD_NO_HINTING, KERNING_UNFITTED, FT2Font) from matplotlib.mathtext import MathTextParser from matplotlib.transforms import Affine2D, BboxBase from matplotlib.path import Path @@ -1146,6 +1148,17 @@ def embedTTFType42(font, characters, descriptor): wObject = self.reserveObject('Type 0 widths') toUnicodeMapObject = self.reserveObject('ToUnicode map') + print(f"SUBSET {filename} characters: {''.join(chr(c) for c in characters)}") + fontdata = self.getSubset(filename, ''.join(chr(c) for c in characters)) + print(f'SUBSET {filename} {os.stat(filename).st_size} -> {len(fontdata)}') + + # reload the font object from the subset + # (all the necessary data could probably be obtained directly using fontLib.ttLib) + with tempfile.NamedTemporaryFile(suffix='.ttf') as tmp: + tmp.write(fontdata) + tmp.seek(0,0) + font = FT2Font(tmp.name) + cidFontDict = { 'Type': Name('Font'), 'Subtype': Name('CIDFontType2'), @@ -1170,21 +1183,12 @@ def embedTTFType42(font, characters, 
descriptor): # Make fontfile stream descriptor['FontFile2'] = fontfileObject - length1Object = self.reserveObject('decoded length of a font') self.beginStream( fontfileObject.id, self.reserveObject('length of font stream'), - {'Length1': length1Object}) - with open(filename, 'rb') as fontfile: - length1 = 0 - while True: - data = fontfile.read(4096) - if not data: - break - length1 += len(data) - self.currentstream.write(data) + {'Length1': len(fontdata)}) + self.currentstream.write(fontdata) self.endStream() - self.writeObject(length1Object, length1) # Make the 'W' (Widths) array, CidToGidMap and ToUnicode CMap # at the same time @@ -1307,6 +1311,25 @@ def embedTTFType42(font, characters, descriptor): elif fonttype == 42: return embedTTFType42(font, characters, descriptor) + @classmethod + def getSubset(self, fontfile, characters): + """Read TTF font from the given file and subset it for the given characters. + + Returns a serialization of the subset font as bytes.""" + + options = subset.Options(glyph_names=True, recommended_glyphs=True) + options.drop_tables += ['FFTM'] + font = subset.load_font(fontfile, options) + try: + subsetter = subset.Subsetter(options=options) + subsetter.populate(text=characters) + subsetter.subset(font) + fh = BytesIO() + font.save(fh, reorderTables=False) + return fh.getvalue() + finally: + font.close() + def alphaState(self, alpha): """Return name of an ExtGState that sets alpha to the given value.""" diff --git a/lib/matplotlib/testing/conftest.py b/lib/matplotlib/testing/conftest.py index 391dd5d49d38..a328f2dbb9cc 100644 --- a/lib/matplotlib/testing/conftest.py +++ b/lib/matplotlib/testing/conftest.py @@ -16,7 +16,7 @@ def pytest_configure(config): ("markers", "style: Set alternate Matplotlib style temporarily."), ("markers", "baseline_images: Compare output against references."), ("markers", "pytz: Tests that require pytz to be installed."), - ("filterwarnings", "error"), + #("filterwarnings", "error"), # fontTools.subset raises 
a pointless DeprecationWarning ]: config.addinivalue_line(key, value) diff --git a/setup.py b/setup.py index 7f08fa09d6eb..b6556174d0e6 100644 --- a/setup.py +++ b/setup.py @@ -279,6 +279,7 @@ def build_extensions(self): ], install_requires=[ "cycler>=0.10", + "fonttools>=4.13.0,<5.0", "kiwisolver>=1.0.1", "numpy>=1.16", "pillow>=6.2.0", From 468c52c38898b22872229450393c831e93194597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Sat, 1 Aug 2020 18:46:50 +0300 Subject: [PATCH 2/4] flake8 --- lib/matplotlib/backends/backend_pdf.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index f6875564ecad..ee00d09404a2 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -1148,15 +1148,21 @@ def embedTTFType42(font, characters, descriptor): wObject = self.reserveObject('Type 0 widths') toUnicodeMapObject = self.reserveObject('ToUnicode map') - print(f"SUBSET {filename} characters: {''.join(chr(c) for c in characters)}") - fontdata = self.getSubset(filename, ''.join(chr(c) for c in characters)) - print(f'SUBSET {filename} {os.stat(filename).st_size} -> {len(fontdata)}') + print(f"SUBSET {filename} characters: " + f"{''.join(chr(c) for c in characters)}") + fontdata = self.getSubset( + filename, + ''.join(chr(c) for c in characters) + ) + print(f'SUBSET {filename} {os.stat(filename).st_size}' + f' ↦ {len(fontdata)}') # reload the font object from the subset - # (all the necessary data could probably be obtained directly using fontLib.ttLib) + # (all the necessary data could probably be obtained directly + # using fontTools.ttLib) with tempfile.NamedTemporaryFile(suffix='.ttf') as tmp: tmp.write(fontdata) - tmp.seek(0,0) + tmp.seek(0, 0) font = FT2Font(tmp.name) cidFontDict = { @@ -1313,9 +1319,11 @@ def embedTTFType42(font, characters, descriptor): @classmethod def getSubset(self, 
fontfile, characters): - """Read TTF font from the given file and subset it for the given characters. + """ + Read TTF font from the given file and subset it for the given characters. - Returns a serialization of the subset font as bytes.""" + Returns a serialization of the subset font as bytes. + """ options = subset.Options(glyph_names=True, recommended_glyphs=True) options.drop_tables += ['FFTM'] From 9e01acaa7afc8339ae06d0bd313b509aebec7a8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Sat, 1 Aug 2020 18:55:54 +0300 Subject: [PATCH 3/4] Filter out just the py23 warning --- lib/matplotlib/testing/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/matplotlib/testing/conftest.py b/lib/matplotlib/testing/conftest.py index a328f2dbb9cc..8b8e3a8a3e9a 100644 --- a/lib/matplotlib/testing/conftest.py +++ b/lib/matplotlib/testing/conftest.py @@ -16,7 +16,9 @@ def pytest_configure(config): ("markers", "style: Set alternate Matplotlib style temporarily."), ("markers", "baseline_images: Compare output against references."), ("markers", "pytz: Tests that require pytz to be installed."), - #("filterwarnings", "error"), # fontTools.subset raises a pointless DeprecationWarning + ("filterwarnings", "error"), + ("filterwarnings", + "ignore:.*The py23 module has been deprecated:DeprecationWarning"), ]: config.addinivalue_line(key, value) From 591f9a89954daf405cecdcdb72cb4eee6ddc0153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Sat, 1 Aug 2020 19:06:34 +0300 Subject: [PATCH 4/4] More flake8 --- lib/matplotlib/backends/backend_pdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index ee00d09404a2..a3e30926a6f4 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -1320,8 +1320,9 @@ def embedTTFType42(font, characters, descriptor): 
@classmethod def getSubset(self, fontfile, characters): """ - Read TTF font from the given file and subset it for the given characters. + Subset a TTF font + Reads the named fontfile and restricts the font to the characters. Returns a serialization of the subset font as bytes. """