Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 859cd47

Browse files
Issue #17156: pygettext.py now uses an encoding of source file and correctly
writes and escapes non-ascii characters.
2 parents 7451a72 + b6ed173 commit 859cd47

2 files changed

Lines changed: 36 additions & 33 deletions

File tree

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,9 @@ Core and Builtins
169169
Library
170170
-------
171171

172+
- Issue #17156: pygettext.py now uses an encoding of source file and correctly
173+
writes and escapes non-ascii characters.
174+
172175
- Issue #16564: Fixed regression relative to Python2 in the operation of
173176
email.encoders.encode_noop when used with binary data.
174177

Tools/i18n/pygettext.py

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@
188188
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
189189
"Language-Team: LANGUAGE <LL@li.org>\\n"
190190
"MIME-Version: 1.0\\n"
191-
"Content-Type: text/plain; charset=CHARSET\\n"
192-
"Content-Transfer-Encoding: ENCODING\\n"
191+
"Content-Type: text/plain; charset=%(charset)s\\n"
192+
"Content-Transfer-Encoding: %(encoding)s\\n"
193193
"Generated-By: pygettext.py %(version)s\\n"
194194
195195
''')
@@ -203,54 +203,51 @@ def usage(code, msg=''):
203203

204204

205205

206-
escapes = []
207-
208-
def make_escapes(pass_iso8859):
209-
global escapes
210-
if pass_iso8859:
211-
# Allow iso-8859 characters to pass through so that e.g. 'msgid
206+
def make_escapes(pass_nonascii):
207+
global escapes, escape
208+
if pass_nonascii:
209+
# Allow non-ascii characters to pass through so that e.g. 'msgid
212210
# "Höhe"' would not result in 'msgid "H\366he"'. Otherwise we
213211
# escape any character outside the 32..126 range.
214212
mod = 128
213+
escape = escape_ascii
215214
else:
216215
mod = 256
217-
for i in range(256):
218-
if 32 <= (i % mod) <= 126:
219-
escapes.append(chr(i))
220-
else:
221-
escapes.append("\\%03o" % i)
222-
escapes[ord('\\')] = '\\\\'
223-
escapes[ord('\t')] = '\\t'
224-
escapes[ord('\r')] = '\\r'
225-
escapes[ord('\n')] = '\\n'
226-
escapes[ord('\"')] = '\\"'
216+
escape = escape_nonascii
217+
escapes = [r"\%03o" % i for i in range(mod)]
218+
for i in range(32, 127):
219+
escapes[i] = chr(i)
220+
escapes[ord('\\')] = r'\\'
221+
escapes[ord('\t')] = r'\t'
222+
escapes[ord('\r')] = r'\r'
223+
escapes[ord('\n')] = r'\n'
224+
escapes[ord('\"')] = r'\"'
225+
227226

227+
def escape_ascii(s, encoding):
228+
return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)
228229

229-
def escape(s):
230-
global escapes
231-
s = list(s)
232-
for i in range(len(s)):
233-
s[i] = escapes[ord(s[i])]
234-
return EMPTYSTRING.join(s)
230+
def escape_nonascii(s, encoding):
231+
return ''.join(escapes[b] for b in s.encode(encoding))
235232

236233

237234
def safe_eval(s):
238235
# unwrap quotes, safely
239236
return eval(s, {'__builtins__':{}}, {})
240237

241238

242-
def normalize(s):
239+
def normalize(s, encoding):
243240
# This converts the various Python string types into a format that is
244241
# appropriate for .po files, namely much closer to C style.
245242
lines = s.split('\n')
246243
if len(lines) == 1:
247-
s = '"' + escape(s) + '"'
244+
s = '"' + escape(s, encoding) + '"'
248245
else:
249246
if not lines[-1]:
250247
del lines[-1]
251248
lines[-1] = lines[-1] + '\n'
252249
for i in range(len(lines)):
253-
lines[i] = escape(lines[i])
250+
lines[i] = escape(lines[i], encoding)
254251
lineterm = '\\n"\n"'
255252
s = '""\n"' + lineterm.join(lines) + '"'
256253
return s
@@ -447,7 +444,10 @@ def write(self, fp):
447444
timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
448445
# The time stamp in the header doesn't have the same format as that
449446
# generated by xgettext...
450-
print(pot_header % {'time': timestamp, 'version': __version__}, file=fp)
447+
encoding = fp.encoding if fp.encoding else 'UTF-8'
448+
print(pot_header % {'time': timestamp, 'version': __version__,
449+
'charset': encoding,
450+
'encoding': '8bit'}, file=fp)
451451
# Sort the entries. First sort each particular entry's keys, then
452452
# sort all the entries by their first item.
453453
reverse = {}
@@ -491,7 +491,7 @@ def write(self, fp):
491491
print(locline, file=fp)
492492
if isdocstring:
493493
print('#, docstring', file=fp)
494-
print('msgid', normalize(k), file=fp)
494+
print('msgid', normalize(k, encoding), file=fp)
495495
print('msgstr ""\n', file=fp)
496496

497497

@@ -587,7 +587,7 @@ class Options:
587587
fp.close()
588588

589589
# calculate escapes
590-
make_escapes(options.escape)
590+
make_escapes(not options.escape)
591591

592592
# calculate all keywords
593593
options.keywords.extend(default_keywords)
@@ -620,17 +620,17 @@ class Options:
620620
if filename == '-':
621621
if options.verbose:
622622
print(_('Reading standard input'))
623-
fp = sys.stdin
623+
fp = sys.stdin.buffer
624624
closep = 0
625625
else:
626626
if options.verbose:
627627
print(_('Working on %s') % filename)
628-
fp = open(filename)
628+
fp = open(filename, 'rb')
629629
closep = 1
630630
try:
631631
eater.set_filename(filename)
632632
try:
633-
tokens = tokenize.generate_tokens(fp.readline)
633+
tokens = tokenize.tokenize(fp.readline)
634634
for _token in tokens:
635635
eater(*_token)
636636
except tokenize.TokenError as e:

0 commit comments

Comments
 (0)