Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 5dbf526

Browse files
committed
Several improvements, some of which were contributed by Bernhard
Herzog <[email protected]>. Specifically: the --verbose/-v flag was added; pot_header was added to make msgmerge and Emacs po-mode work better; normalize(), escape(), and safe_eval() improve the normalization of strings for more .po file compatibility (e.g. C style) and handle embedded newlines better. Also added an identity function called _() and use it in the file wherever messages are printed. This allows us to selftest pygettext.py with itself as input.
1 parent a507c32 commit 5dbf526

1 file changed

Lines changed: 86 additions & 44 deletions

File tree

Tools/i18n/pygettext.py

Lines changed: 86 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,19 @@
88
the programming language and can be used from within Python programs. Martin
99
von Loewis' work[1] helps considerably in this regard.
1010
11-
There's one hole though; xgettext is the program that scans source code
11+
There's one problem though; xgettext is the program that scans source code
1212
looking for message strings, but it groks only C (or C++). Python introduces
1313
a few wrinkles, such as dual quoting characters, triple quoted strings, and
1414
raw strings. xgettext understands none of this.
1515
1616
Enter pygettext, which uses Python's standard tokenize module to scan Python
1717
source code, generating .pot files identical to what GNU xgettext[2] generates
18-
for C and C++ code. From there, the standard GNU tools can be used.
18+
for C and C++ code. From there, the standard GNU tools can be used.
1919
2020
A word about marking Python strings as candidates for translation. GNU
2121
xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
2222
gettext_noop. But those can be a lot of text to include all over your code.
23-
C and C++ have a trick: they use the C preprocessor. Most internationalized C
23+
C and C++ have a trick: they use the C preprocessor. Most internationalized C
2424
source includes a #define for gettext() to _() so that what has to be written
2525
in the source is much less. Thus these are both translatable strings:
2626
@@ -34,7 +34,6 @@
3434
[1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
3535
[2] http://www.gnu.org/software/gettext/gettext.html
3636
37-
3837
NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
3938
wherever possible.
4039
@@ -74,6 +73,10 @@
7473
If style is omitted, Gnu is used. The style name is case
7574
insensitive. By default, locations are included.
7675
76+
-v
77+
--verbose
78+
Print the names of the files being processed.
79+
7780
--help
7881
-h
7982
print this help message and exit
@@ -87,49 +90,80 @@
8790
import getopt
8891
import tokenize
8992

90-
__version__ = '0.1'
93+
__version__ = '0.2'
9194

9295

96+
97+
# Identity stand-in for gettext's _().  Wrapping the program's own messages
# in it lets pygettext be run on its own source as a self-test.
def _(s):
    return s
99+
100+
101+
# The normal pot-file header.  msgmerge and Emacs' po-mode work better if
# it's there.  The %(time)s and %(version)s placeholders are filled in when
# the header is written out; the doubled backslashes keep each "\n" as a
# literal two-character escape inside the quoted .po strings.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
120+
93121

94122
def usage(code, msg=''):
    """Print the help text and an optional message, then exit.

    code is handed to sys.exit() as the exit status.  msg, when non-empty,
    is printed on its own line after the help text.
    """
    # The module docstring doubles as the help text; it is interpolated
    # against the module globals before printing.
    help_text = __doc__ % globals()
    print(help_text)
    if msg:
        print(msg)
    sys.exit(code)
99127

100-
101128

129+
# Lookup table indexed by character code (0-255) giving the text that
# represents that character inside a double-quoted .po file string.
# Printable ASCII maps to itself; every other code, including DEL (127),
# becomes a three-digit octal escape.
escapes = []
for i in range(256):
    if 32 <= i <= 126:
        escapes.append(chr(i))
    else:
        escapes.append("\\%03o" % i)

# Characters that have a dedicated C-style escape sequence.
escapes[ord('\\')] = '\\\\'
escapes[ord('\t')] = '\\t'
escapes[ord('\r')] = '\\r'
escapes[ord('\n')] = '\\n'
# A bare double quote would terminate the surrounding .po string early.
escapes[ord('"')] = '\\"'


def escape(s):
    """Return s with each character replaced by its .po-safe form.

    Every character is looked up in the module-level escapes table, so
    backslashes, double quotes, tabs, carriage returns and newlines come
    back as C-style escapes and other non-printable or 8-bit characters
    come back as octal escapes.  The table has 256 entries; a character
    whose code is 256 or higher raises IndexError.
    """
    return ''.join([escapes[ord(ch)] for ch in s])
146+
147+
148+
def safe_eval(s):
    """Evaluate the Python string-literal source text s and return its value.

    This strips the quoting (single, double, triple, raw) from a string
    token by letting the interpreter evaluate it.
    """
    # NOTE(review): emptying __builtins__ limits what the evaluated text can
    # reach, but it is not a real sandbox.  This is only appropriate while
    # the input is a string token from the scanned source -- confirm against
    # callers before reusing it on anything else.
    restricted_globals = {'__builtins__': {}}
    restricted_locals = {}
    return eval(s, restricted_globals, restricted_locals)
151+
152+
102153
def normalize(s):
    """Return s rendered as a quoted string suitable for a .po file.

    This converts a Python string value into a format much closer to C
    style.  A string without newlines becomes one double-quoted, escaped
    string.  A string with newlines becomes an empty "" line followed by
    one quoted line per source line, each carrying its own \\n escape.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        # No embedded newline: a single quoted string is enough.
        return '"' + escape(s) + '"'

    # A trailing newline leaves an empty final piece.  Drop it and put the
    # newline back on the new last piece so it is escaped with that line
    # instead of producing an empty quoted line.
    if not pieces[-1]:
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'

    escaped = [escape(piece) for piece in pieces]
    # Each joint closes the current quoted line with an escaped newline and
    # opens the next one.
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
133167

134168

135169

@@ -173,7 +207,7 @@ def __openseen(self, ttype, tstring, lineno):
173207
linenos.append(entry)
174208
self.__state = self.__waiting
175209
elif ttype == tokenize.STRING:
176-
self.__data.append(normalize(tstring))
210+
self.__data.append(safe_eval(tstring))
177211
# TBD: should we warn if we see anything else?
178212

179213
def set_filename(self, filename):
@@ -185,19 +219,21 @@ def write(self, fp):
185219
# common header
186220
try:
187221
sys.stdout = fp
188-
print '# POT file generated by pygettext.py', __version__
189-
print '#', timestamp
190-
print '#'
222+
# The time stamp in the header doesn't have the same format
223+
# as that generated by xgettext...
224+
print pot_header % {'time': timestamp, 'version':__version__}
191225
for k, v in self.__messages.items():
192226
for filename, lineno in v:
193227
# location comments are different b/w Solaris and GNU
228+
d = {'filename': filename,
229+
'lineno': lineno}
194230
if options.location == options.SOLARIS:
195-
print '# File: %s,' % filename, 'line: %d' % lineno
231+
print _('# File: %(filename)s, line: %(lineno)d') % d
196232
elif options.location == options.GNU:
197-
print '#: %s:%d' % (filename, lineno)
233+
print _('#: %(filename)s:%(lineno)d') % d
198234
# TBD: sorting, normalizing
199-
print 'msgid', k
200-
print 'msgstr '
235+
print 'msgid', normalize(k)
236+
print 'msgstr ""'
201237
print
202238
finally:
203239
sys.stdout = sys.__stdout__
@@ -208,9 +244,9 @@ def main():
208244
try:
209245
opts, args = getopt.getopt(
210246
sys.argv[1:],
211-
'k:d:n:h',
247+
'k:d:n:hv',
212248
['keyword', 'default-domain', 'help',
213-
'add-location=', 'no-location'])
249+
'add-location=', 'no-location', 'verbose'])
214250
except getopt.error, msg:
215251
usage(1, msg)
216252

@@ -223,6 +259,7 @@ class Options:
223259
keywords = []
224260
outfile = 'messages.pot'
225261
location = GNU
262+
verbose = 0
226263

227264
options = Options()
228265
locations = {'gnu' : options.GNU,
@@ -245,16 +282,21 @@ class Options:
245282
try:
246283
options.location = locations[string.lower(arg)]
247284
except KeyError:
248-
usage(1, 'Invalid value for --add-location: ' + arg)
285+
d = {'arg':arg}
286+
usage(1, _('Invalid value for --add-location: %(arg)s') % d)
249287
elif opt in ('--no-location',):
250288
options.location = 0
289+
elif opt in ('-v', '--verbose'):
290+
options.verbose = 1
251291

252292
# calculate all keywords
253293
options.keywords.extend(default_keywords)
254294

255295
# slurp through all the files
256296
eater = TokenEater(options)
257297
for filename in args:
298+
if options.verbose:
299+
print _('Working on %(filename)s') % {'filename':filename}
258300
fp = open(filename)
259301
eater.set_filename(filename)
260302
tokenize.tokenize(fp.readline, eater)

0 commit comments

Comments
 (0)