Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 44fa8f6

Browse files
committed
Make IDLE's file decode more robust.
1. coding_spec() only looks at the first two lines of bytes, to avoid a UnicodeDecodeError if the rest of the file is in another encoding, e.g. latin-1.
2. coding_spec() handles either \n or \r as the line separator.
3. Clarify that the locale encoding is what is being used by naming the variable locale_encoding. However, it is still called IOBinding.encoding in other parts of IDLE, and that usage needs to be checked to verify that it is still what is desired.
4. Return None from _decode() if decoding fails.
5. Name the variables representing bytes and those representing strings or chars appropriately.
1 parent 504d885 commit 44fa8f6

1 file changed

Lines changed: 76 additions & 45 deletions

File tree

Lib/idlelib/IOBinding.py

Lines changed: 76 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,15 @@
2222
pass
2323

2424
# Encoding for file names
25-
filesystemencoding = sys.getfilesystemencoding()
25+
filesystemencoding = sys.getfilesystemencoding() ### currently unused
2626

27-
encoding = "ascii"
27+
locale_encoding = 'ascii'
2828
if sys.platform == 'win32':
2929
# On Windows, we could use "mbcs". However, to give the user
3030
# a portable encoding name, we need to find the code page
3131
try:
32-
encoding = locale.getdefaultlocale()[1]
33-
codecs.lookup(encoding)
32+
locale_encoding = locale.getdefaultlocale()[1]
33+
codecs.lookup(locale_encoding)
3434
except LookupError:
3535
pass
3636
else:
@@ -39,25 +39,28 @@
3939
# loaded, it may not offer nl_langinfo, or CODESET, or the
4040
# resulting codeset may be unknown to Python. We ignore all
4141
# these problems, falling back to ASCII
42-
encoding = locale.nl_langinfo(locale.CODESET)
43-
if encoding is None or encoding is '':
42+
locale_encoding = locale.nl_langinfo(locale.CODESET)
43+
if locale_encoding is None or locale_encoding is '':
4444
# situation occurs on Mac OS X
45-
encoding = 'ascii'
46-
codecs.lookup(encoding)
45+
locale_encoding = 'ascii'
46+
codecs.lookup(locale_encoding)
4747
except (NameError, AttributeError, LookupError):
48-
# Try getdefaultlocale well: it parses environment variables,
48+
# Try getdefaultlocale: it parses environment variables,
4949
# which may give a clue. Unfortunately, getdefaultlocale has
5050
# bugs that can cause ValueError.
5151
try:
52-
encoding = locale.getdefaultlocale()[1]
53-
if encoding is None or encoding is '':
52+
locale_encoding = locale.getdefaultlocale()[1]
53+
if locale_encoding is None or locale_encoding is '':
5454
# situation occurs on Mac OS X
55-
encoding = 'ascii'
56-
codecs.lookup(encoding)
55+
locale_encoding = 'ascii'
56+
codecs.lookup(locale_encoding)
5757
except (ValueError, LookupError):
5858
pass
5959

60-
encoding = encoding.lower()
60+
locale_encoding = locale_encoding.lower()
61+
62+
encoding = locale_encoding ### KBK 07Sep07 This is used all over IDLE, check!
63+
### 'encoding' is used below in encode(), check!
6164

6265
coding_re = re.compile("coding[:=]\s*([-\w_.]+)")
6366

@@ -110,26 +113,36 @@ def do_edit(self):
110113
def coding_spec(data):
111114
"""Return the encoding declaration according to PEP 263.
112115
113-
Raise LookupError if the encoding is declared but unknown.
116+
When checking encoded data, only the first two lines should be passed
117+
in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
118+
The first two lines would contain the encoding specification.
119+
120+
Raise a LookupError if the encoding is declared but unknown.
114121
"""
115122
if isinstance(data, bytes):
116-
str = data.decode('utf-8')
123+
try:
124+
lines = data.decode('utf-8')
125+
except UnicodeDecodeError:
126+
return None
117127
else:
118-
str = data
119-
# Only consider the first two lines
120-
str = str.split("\n")[:2]
121-
str = "\n".join(str)
128+
lines = data
129+
# consider only the first two lines
130+
if '\n' in lines:
131+
lst = lines.split('\n')[:2]
132+
elif '\r' in lines:
133+
lst = lines.split('\r')[:2]
134+
else:
135+
lst = list(lines)
136+
str = '\n'.join(lst)
122137
match = coding_re.search(str)
123138
if not match:
124139
return None
125140
name = match.group(1)
126-
# Check whether the encoding is known
127-
import codecs
128141
try:
129142
codecs.lookup(name)
130143
except LookupError:
131144
# The standard encoding error does not indicate the encoding
132-
raise LookupError("Unknown encoding "+name)
145+
raise LookupError("Unknown encoding: "+name)
133146
return name
134147

135148

@@ -236,12 +249,19 @@ def loadfile(self, filename):
236249
# open the file in binary mode so that we can handle
237250
# end-of-line convention ourselves.
238251
f = open(filename,'rb')
252+
two_lines = f.readline() + f.readline()
253+
f.seek(0)
239254
bytes = f.read()
240255
f.close()
241256
except IOError as msg:
242257
tkMessageBox.showerror("I/O Error", str(msg), master=self.text)
243258
return False
244-
chars = self.decode(bytes)
259+
chars = self._decode(two_lines, bytes)
260+
if chars is None:
261+
tkMessageBox.showerror("Decoding Error",
262+
"File %s\nFailed to Decode" % filename,
263+
parent=self.text)
264+
return False
245265
# We now convert all end-of-lines to '\n's
246266
firsteol = self.eol_re.search(chars)
247267
if firsteol:
@@ -257,50 +277,61 @@ def loadfile(self, filename):
257277
self.updaterecentfileslist(filename)
258278
return True
259279

260-
def decode(self, chars):
261-
"""Create a Unicode string
262-
263-
If that fails, let Tcl try its best
264-
"""
280+
def _decode(self, two_lines, bytes):
281+
"Create a Unicode string."
282+
chars = None
265283
# Check presence of a UTF-8 signature first
266-
if chars.startswith(BOM_UTF8):
284+
if bytes.startswith(BOM_UTF8):
267285
try:
268-
chars = chars[3:].decode("utf-8")
269-
except UnicodeError:
286+
chars = bytes[3:].decode("utf-8")
287+
except UnicodeDecodeError:
270288
# has UTF-8 signature, but fails to decode...
271-
return chars
289+
return None
272290
else:
273291
# Indicates that this file originally had a BOM
274292
self.fileencoding = 'BOM'
275293
return chars
276294
# Next look for coding specification
277295
try:
278-
enc = coding_spec(chars)
296+
enc = coding_spec(two_lines)
279297
except LookupError as name:
280298
tkMessageBox.showerror(
281299
title="Error loading the file",
282300
message="The encoding '%s' is not known to this Python "\
283301
"installation. The file may not display correctly" % name,
284302
master = self.text)
285303
enc = None
304+
except UnicodeDecodeError:
305+
return None
286306
if enc:
287307
try:
288-
return str(chars, enc)
289-
except UnicodeError:
308+
chars = str(bytes, enc)
309+
self.fileencoding = enc
310+
return chars
311+
except UnicodeDecodeError:
290312
pass
291-
# If it is ASCII, we need not to record anything
313+
# Try ascii:
292314
try:
293-
return str(chars, 'ascii')
294-
except UnicodeError:
315+
chars = str(bytes, 'ascii')
316+
self.fileencoding = None
317+
return chars
318+
except UnicodeDecodeError:
319+
pass
320+
# Try utf-8:
321+
try:
322+
chars = str(bytes, 'utf-8')
323+
self.fileencoding = 'utf-8'
324+
return chars
325+
except UnicodeDecodeError:
295326
pass
296327
# Finally, try the locale's encoding. This is deprecated;
297328
# the user should declare a non-ASCII encoding
298329
try:
299-
chars = str(chars, encoding)
300-
self.fileencoding = encoding
301-
except UnicodeError:
330+
chars = str(bytes, locale_encoding)
331+
self.fileencoding = locale_encoding
332+
except UnicodeDecodeError:
302333
pass
303-
return chars
334+
return chars # None on failure
304335

305336
def maybesave(self):
306337
if self.get_saved():
@@ -383,8 +414,9 @@ def encode(self, chars):
383414
return chars.encode('ascii')
384415
except UnicodeError:
385416
pass
386-
# If there is an encoding declared, try this first.
417+
# Check if there is an encoding declared
387418
try:
419+
# a string, let coding_spec slice it to the first two lines
388420
enc = coding_spec(chars)
389421
failed = None
390422
except LookupError as msg:
@@ -509,7 +541,6 @@ def askopenfile(self):
509541
self.opendialog = tkFileDialog.Open(master=self.text,
510542
filetypes=self.filetypes)
511543
filename = self.opendialog.show(initialdir=dir, initialfile=base)
512-
assert isinstance(filename, str)
513544
return filename
514545

515546
def defaultfilename(self, mode="open"):

0 commit comments

Comments
 (0)