Issue #17156: pygettext.py now uses an encoding of source file and correctly

serhiy-storchaka · serhiy-storchaka · commit b6ed17344b45 · 2013-02-09T22:37:22.000+02:00
writes and escapes non-ascii characters.
diff --git a/Misc/NEWS b/Misc/NEWS
@@ -215,6 +215,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #17156: pygettext.py now uses the encoding of the source file and
+  correctly writes and escapes non-ASCII characters.
+
 - Issue #16564: Fixed regression relative to Python2 in the operation of
   email.encoders.encode_noop when used with binary data.
 
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
@@ -189,8 +189,8 @@
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
 "Language-Team: LANGUAGE <LL@li.org>\\n"
 "MIME-Version: 1.0\\n"
-"Content-Type: text/plain; charset=CHARSET\\n"
-"Content-Transfer-Encoding: ENCODING\\n"
+"Content-Type: text/plain; charset=%(charset)s\\n"
+"Content-Transfer-Encoding: %(encoding)s\\n"
 "Generated-By: pygettext.py %(version)s\\n"
 
 ''')
@@ -204,54 +204,51 @@ def usage(code, msg=''):
 
 
 
-escapes = []
-
-def make_escapes(pass_iso8859):
-    global escapes
-    if pass_iso8859:
-        # Allow iso-8859 characters to pass through so that e.g. 'msgid
+def make_escapes(pass_nonascii):
+    global escapes, escape
+    if pass_nonascii:
+        # Allow non-ascii characters to pass through so that e.g. 'msgid
         # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
         # escape any character outside the 32..126 range.
         mod = 128
+        escape = escape_ascii
     else:
         mod = 256
-    for i in range(256):
-        if 32 <= (i % mod) <= 126:
-            escapes.append(chr(i))
-        else:
-            escapes.append("\\%03o" % i)
-    escapes[ord('\\')] = '\\\\'
-    escapes[ord('\t')] = '\\t'
-    escapes[ord('\r')] = '\\r'
-    escapes[ord('\n')] = '\\n'
-    escapes[ord('\"')] = '\\"'
+        escape = escape_nonascii
+    escapes = [r"\%03o" % i for i in range(mod)]
+    for i in range(32, 127):
+        escapes[i] = chr(i)
+    escapes[ord('\\')] = r'\\'
+    escapes[ord('\t')] = r'\t'
+    escapes[ord('\r')] = r'\r'
+    escapes[ord('\n')] = r'\n'
+    escapes[ord('\"')] = r'\"'
+
 
+def escape_ascii(s, encoding):
+    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)
 
-def escape(s):
-    global escapes
-    s = list(s)
-    for i in range(len(s)):
-        s[i] = escapes[ord(s[i])]
-    return EMPTYSTRING.join(s)
+def escape_nonascii(s, encoding):
+    return ''.join(escapes[b] for b in s.encode(encoding))
 
 
 def safe_eval(s):
     # unwrap quotes, safely
     return eval(s, {'__builtins__':{}}, {})
 
 
-def normalize(s):
+def normalize(s, encoding):
     # This converts the various Python string types into a format that is
     # appropriate for .po files, namely much closer to C style.
     lines = s.split('\n')
     if len(lines) == 1:
-        s = '"' + escape(s) + '"'
+        s = '"' + escape(s, encoding) + '"'
     else:
         if not lines[-1]:
             del lines[-1]
             lines[-1] = lines[-1] + '\n'
         for i in range(len(lines)):
-            lines[i] = escape(lines[i])
+            lines[i] = escape(lines[i], encoding)
         lineterm = '\\n"\n"'
         s = '""\n"' + lineterm.join(lines) + '"'
     return s
@@ -448,7 +445,10 @@ def write(self, fp):
         timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
         # The time stamp in the header doesn't have the same format as that
         # generated by xgettext...
-        print(pot_header % {'time': timestamp, 'version': __version__}, file=fp)
+        encoding = fp.encoding if fp.encoding else 'UTF-8'
+        print(pot_header % {'time': timestamp, 'version': __version__,
+                            'charset': encoding,
+                            'encoding': '8bit'}, file=fp)
         # Sort the entries.  First sort each particular entry's keys, then
         # sort all the entries by their first item.
         reverse = {}
@@ -492,7 +492,7 @@ def write(self, fp):
                         print(locline, file=fp)
                 if isdocstring:
                     print('#, docstring', file=fp)
-                print('msgid', normalize(k), file=fp)
+                print('msgid', normalize(k, encoding), file=fp)
                 print('msgstr ""\n', file=fp)
 
 
@@ -588,7 +588,7 @@ class Options:
                 fp.close()
 
     # calculate escapes
-    make_escapes(options.escape)
+    make_escapes(not options.escape)
 
     # calculate all keywords
     options.keywords.extend(default_keywords)
@@ -621,17 +621,17 @@ class Options:
         if filename == '-':
             if options.verbose:
                 print(_('Reading standard input'))
-            fp = sys.stdin
+            fp = sys.stdin.buffer
             closep = 0
         else:
             if options.verbose:
                 print(_('Working on %s') % filename)
-            fp = open(filename)
+            fp = open(filename, 'rb')
             closep = 1
         try:
             eater.set_filename(filename)
             try:
-                tokens = tokenize.generate_tokens(fp.readline)
+                tokens = tokenize.tokenize(fp.readline)
                 for _token in tokens:
                     eater(*_token)
             except tokenize.TokenError as e: