1313import logging
1414import math
1515import os
16- import re
1716import string
1817import struct
1918import sys
@@ -119,25 +118,6 @@ def _fill(strings, linelen=75):
119118 result .append (b' ' .join (strings [lasti :]))
120119 return b'\n ' .join (result )
121120
122- # PDF strings are supposed to be able to include any eight-bit data,
123- # except that unbalanced parens and backslashes must be escaped by a
124- # backslash. However, sf bug #2708559 shows that the carriage return
125- # character may get read as a newline; these characters correspond to
126- # \gamma and \Omega in TeX's math font encoding. Escaping them fixes
127- # the bug.
128- _string_escape_regex = re .compile (br'([\\()\r\n])' )
129-
130-
131- def _string_escape (match ):
132- m = match .group (0 )
133- if m in br'\()' :
134- return b'\\ ' + m
135- elif m == b'\n ' :
136- return br'\n'
137- elif m == b'\r ' :
138- return br'\r'
139- assert False
140-
141121
142122def _create_pdf_info_dict (backend , metadata ):
143123 """
@@ -267,6 +247,15 @@ def _get_link_annotation(gc, x, y, width, height):
267247 return link_annotation
268248
269249
250+ # PDF strings are supposed to be able to include any eight-bit data, except
251+ # that unbalanced parens and backslashes must be escaped by a backslash.
252+ # However, sf bug #2708559 shows that the carriage return character may get
253+ # read as a newline; CR and LF correspond to \gamma and \Omega in TeX's
254+ # math font encoding. Escaping both of them as well fixes the bug.
255+ _str_escapes = str .maketrans ({
256+ '\\ ' : '\\ \\ ' , '(' : '\\ (' , ')' : '\\ )' , '\n ' : '\\ n' , '\r ' : '\\ r' })
257+
258+
270259def pdfRepr (obj ):
271260 """Map Python objects to PDF syntax."""
272261
@@ -292,22 +281,21 @@ def pdfRepr(obj):
292281 elif isinstance (obj , (int , np .integer )):
293282 return b"%d" % obj
294283
295- # Unicode strings are encoded in UTF-16BE with byte-order mark.
284+ # Non-ASCII Unicode strings are encoded in UTF-16BE with byte-order mark.
296285 elif isinstance (obj , str ):
297- try :
298- # But maybe it's really ASCII?
299- s = obj .encode ('ASCII' )
300- return pdfRepr (s )
301- except UnicodeEncodeError :
302- s = codecs .BOM_UTF16_BE + obj .encode ('UTF-16BE' )
303- return pdfRepr (s )
286+ return pdfRepr (obj .encode ('ascii' ) if obj .isascii ()
287+ else codecs .BOM_UTF16_BE + obj .encode ('UTF-16BE' ))
304288
305289 # Strings are written in parentheses, with backslashes and parens
306290 # escaped. Actually balanced parens are allowed, but it is
307291 # simpler to escape them all. TODO: cut long strings into lines;
308292 # I believe there is some maximum line length in PDF.
293+ # Despite the extra decode/encode, translate is faster than regex.
309294 elif isinstance (obj , bytes ):
310- return b'(' + _string_escape_regex .sub (_string_escape , obj ) + b')'
295+ return (
296+ b'(' +
297+ obj .decode ('latin-1' ).translate (_str_escapes ).encode ('latin-1' )
298+ + b')' )
311299
312300 # Dictionaries. The keys must be PDF names, so if we find strings
313301 # there, we make Name objects from them. The values may be
0 commit comments