Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b6ed173

Browse files
Issue #17156: pygettext.py now uses the encoding of the source file and correctly
writes and escapes non-ASCII characters.
1 parent 041d553 commit b6ed173

2 files changed

Lines changed: 36 additions & 33 deletions

File tree

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,9 @@ Core and Builtins
215215
Library
216216
-------
217217

218+
- Issue #17156: pygettext.py now uses the encoding of the source file and correctly
219+
writes and escapes non-ascii characters.
220+
218221
- Issue #16564: Fixed regression relative to Python2 in the operation of
219222
email.encoders.encode_noop when used with binary data.
220223

Tools/i18n/pygettext.py

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,8 @@
189189
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
190190
"Language-Team: LANGUAGE <LL@li.org>\\n"
191191
"MIME-Version: 1.0\\n"
192-
"Content-Type: text/plain; charset=CHARSET\\n"
193-
"Content-Transfer-Encoding: ENCODING\\n"
192+
"Content-Type: text/plain; charset=%(charset)s\\n"
193+
"Content-Transfer-Encoding: %(encoding)s\\n"
194194
"Generated-By: pygettext.py %(version)s\\n"
195195
196196
''')
@@ -204,54 +204,51 @@ def usage(code, msg=''):
204204

205205

206206

207-
escapes = []
208-
209-
def make_escapes(pass_iso8859):
210-
global escapes
211-
if pass_iso8859:
212-
# Allow iso-8859 characters to pass through so that e.g. 'msgid
207+
def make_escapes(pass_nonascii):
208+
global escapes, escape
209+
if pass_nonascii:
210+
# Allow non-ascii characters to pass through so that e.g. 'msgid
213211
# "Höhe"' would not result in 'msgid "H\366he"'. Otherwise we
214212
# escape any character outside the 32..126 range.
215213
mod = 128
214+
escape = escape_ascii
216215
else:
217216
mod = 256
218-
for i in range(256):
219-
if 32 <= (i % mod) <= 126:
220-
escapes.append(chr(i))
221-
else:
222-
escapes.append("\\%03o" % i)
223-
escapes[ord('\\')] = '\\\\'
224-
escapes[ord('\t')] = '\\t'
225-
escapes[ord('\r')] = '\\r'
226-
escapes[ord('\n')] = '\\n'
227-
escapes[ord('\"')] = '\\"'
217+
escape = escape_nonascii
218+
escapes = [r"\%03o" % i for i in range(mod)]
219+
for i in range(32, 127):
220+
escapes[i] = chr(i)
221+
escapes[ord('\\')] = r'\\'
222+
escapes[ord('\t')] = r'\t'
223+
escapes[ord('\r')] = r'\r'
224+
escapes[ord('\n')] = r'\n'
225+
escapes[ord('\"')] = r'\"'
226+
228227

228+
def escape_ascii(s, encoding):
229+
return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)
229230

230-
def escape(s):
231-
global escapes
232-
s = list(s)
233-
for i in range(len(s)):
234-
s[i] = escapes[ord(s[i])]
235-
return EMPTYSTRING.join(s)
231+
def escape_nonascii(s, encoding):
232+
return ''.join(escapes[b] for b in s.encode(encoding))
236233

237234

238235
def safe_eval(s):
239236
# unwrap quotes, safely
240237
return eval(s, {'__builtins__':{}}, {})
241238

242239

243-
def normalize(s):
240+
def normalize(s, encoding):
244241
# This converts the various Python string types into a format that is
245242
# appropriate for .po files, namely much closer to C style.
246243
lines = s.split('\n')
247244
if len(lines) == 1:
248-
s = '"' + escape(s) + '"'
245+
s = '"' + escape(s, encoding) + '"'
249246
else:
250247
if not lines[-1]:
251248
del lines[-1]
252249
lines[-1] = lines[-1] + '\n'
253250
for i in range(len(lines)):
254-
lines[i] = escape(lines[i])
251+
lines[i] = escape(lines[i], encoding)
255252
lineterm = '\\n"\n"'
256253
s = '""\n"' + lineterm.join(lines) + '"'
257254
return s
@@ -448,7 +445,10 @@ def write(self, fp):
448445
timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
449446
# The time stamp in the header doesn't have the same format as that
450447
# generated by xgettext...
451-
print(pot_header % {'time': timestamp, 'version': __version__}, file=fp)
448+
encoding = fp.encoding if fp.encoding else 'UTF-8'
449+
print(pot_header % {'time': timestamp, 'version': __version__,
450+
'charset': encoding,
451+
'encoding': '8bit'}, file=fp)
452452
# Sort the entries. First sort each particular entry's keys, then
453453
# sort all the entries by their first item.
454454
reverse = {}
@@ -492,7 +492,7 @@ def write(self, fp):
492492
print(locline, file=fp)
493493
if isdocstring:
494494
print('#, docstring', file=fp)
495-
print('msgid', normalize(k), file=fp)
495+
print('msgid', normalize(k, encoding), file=fp)
496496
print('msgstr ""\n', file=fp)
497497

498498

@@ -588,7 +588,7 @@ class Options:
588588
fp.close()
589589

590590
# calculate escapes
591-
make_escapes(options.escape)
591+
make_escapes(not options.escape)
592592

593593
# calculate all keywords
594594
options.keywords.extend(default_keywords)
@@ -621,17 +621,17 @@ class Options:
621621
if filename == '-':
622622
if options.verbose:
623623
print(_('Reading standard input'))
624-
fp = sys.stdin
624+
fp = sys.stdin.buffer
625625
closep = 0
626626
else:
627627
if options.verbose:
628628
print(_('Working on %s') % filename)
629-
fp = open(filename)
629+
fp = open(filename, 'rb')
630630
closep = 1
631631
try:
632632
eater.set_filename(filename)
633633
try:
634-
tokens = tokenize.generate_tokens(fp.readline)
634+
tokens = tokenize.tokenize(fp.readline)
635635
for _token in tokens:
636636
eater(*_token)
637637
except tokenize.TokenError as e:

0 commit comments

Comments
 (0)