13
13
import logging
14
14
import math
15
15
import os
16
- import re
17
16
import string
18
17
import struct
19
18
import sys
@@ -119,25 +118,6 @@ def _fill(strings, linelen=75):
119
118
result .append (b' ' .join (strings [lasti :]))
120
119
return b'\n ' .join (result )
121
120
122
# PDF strings are supposed to be able to include any eight-bit data,
# except that unbalanced parens and backslashes must be escaped by a
# backslash.  However, sf bug #2708559 shows that the carriage return
# character may get read as a newline; these characters correspond to
# \gamma and \Omega in TeX's math font encoding.  Escaping them fixes
# the bug.
_string_escape_regex = re.compile(br'([\\()\r\n])')


def _string_escape(match):
    """
    Return the escaped replacement for one `_string_escape_regex` match.

    Intended as the *repl* callback of ``_string_escape_regex.sub``.

    Parameters
    ----------
    match : re.Match
        A match over bytes whose ``group(0)`` is a single backslash,
        parenthesis, newline or carriage-return byte.

    Returns
    -------
    bytes
        A backslash followed by the matched byte for backslashes and
        parentheses; the two-byte sequences backslash-n and backslash-r
        for newline and carriage return respectively.

    Raises
    ------
    AssertionError
        If the matched text is none of the bytes listed above, i.e. the
        regex and this callback have gotten out of sync.
    """
    m = match.group(0)
    if m in br'\()':
        # Backslash and parentheses keep their byte, prefixed by a backslash.
        return b'\\' + m
    elif m == b'\n':
        return br'\n'
    elif m == b'\r':
        return br'\r'
    # Raised explicitly (rather than ``assert False``) so that the check
    # survives ``python -O`` instead of silently returning None.
    raise AssertionError(f"unexpected match {m!r}")
140
-
141
121
142
122
def _create_pdf_info_dict (backend , metadata ):
143
123
"""
@@ -250,6 +230,15 @@ def _datetime_to_pdf(d):
250
230
return r
251
231
252
232
233
# PDF strings are supposed to be able to include any eight-bit data, except
# that unbalanced parens and backslashes must be escaped by a backslash.
# However, sf bug #2708559 shows that the carriage return character may get
# read as a newline; these characters correspond to \gamma and \Omega in TeX's
# math font encoding.  Escaping them fixes the bug.
#
# Translation table for `str.translate`; every key must be exactly one
# character.  It is applied to byte strings decoded as latin-1, which maps
# each byte to the code point of the same value.
_str_escapes = str.maketrans({
    '\\': '\\\\',
    '(': '\\(',
    ')': '\\)',
    '\n': '\\n',
    '\r': '\\r',
})
240
+
241
+
253
242
def pdfRepr (obj ):
254
243
"""Map Python objects to PDF syntax."""
255
244
@@ -275,22 +264,21 @@ def pdfRepr(obj):
275
264
elif isinstance (obj , (int , np .integer )):
276
265
return b"%d" % obj
277
266
278
- # Unicode strings are encoded in UTF-16BE with byte-order mark.
267
+ # Non-ASCII Unicode strings are encoded in UTF-16BE with byte-order mark.
279
268
elif isinstance (obj , str ):
280
- try :
281
- # But maybe it's really ASCII?
282
- s = obj .encode ('ASCII' )
283
- return pdfRepr (s )
284
- except UnicodeEncodeError :
285
- s = codecs .BOM_UTF16_BE + obj .encode ('UTF-16BE' )
286
- return pdfRepr (s )
269
+ return pdfRepr (obj .encode ('ascii' ) if obj .isascii ()
270
+ else codecs .BOM_UTF16_BE + obj .encode ('UTF-16BE' ))
287
271
288
272
# Strings are written in parentheses, with backslashes and parens
289
273
# escaped. Actually balanced parens are allowed, but it is
290
274
# simpler to escape them all. TODO: cut long strings into lines;
291
275
# I believe there is some maximum line length in PDF.
276
+ # Despite the extra decode/encode, translate is faster than regex.
292
277
elif isinstance (obj , bytes ):
293
- return b'(' + _string_escape_regex .sub (_string_escape , obj ) + b')'
278
+ return (
279
+ b'(' +
280
+ obj .decode ('latin-1' ).translate (_str_escapes ).encode ('latin-1' )
281
+ + b')' )
294
282
295
283
# Dictionaries. The keys must be PDF names, so if we find strings
296
284
# there, we make Name objects from them. The values may be
0 commit comments