Thanks to visit codestin.com
Credit goes to github.com

Skip to content

[3.8] bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215) #21259

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 41 additions & 152 deletions Lib/idlelib/iomenu.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import codecs
from codecs import BOM_UTF8
import os
import re
import shlex
import sys
import tempfile
import tokenize

import tkinter.filedialog as tkFileDialog
import tkinter.messagebox as tkMessageBox
Expand All @@ -20,49 +18,6 @@
errors = 'surrogateescape'


coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def coding_spec(data):
"""Return the encoding declaration according to PEP 263.

When checking encoded data, only the first two lines should be passed
in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
The first two lines would contain the encoding specification.

Raise a LookupError if the encoding is declared but unknown.
"""
if isinstance(data, bytes):
# This encoding might be wrong. However, the coding
# spec must be ASCII-only, so any non-ASCII characters
# around here will be ignored. Decoding to Latin-1 should
# never fail (except for memory outage)
lines = data.decode('iso-8859-1')
else:
lines = data
# consider only the first two lines
if '\n' in lines:
lst = lines.split('\n', 2)[:2]
elif '\r' in lines:
lst = lines.split('\r', 2)[:2]
else:
lst = [lines]
for line in lst:
match = coding_re.match(line)
if match is not None:
break
if not blank_re.match(line):
return None
else:
return None
name = match.group(1)
try:
codecs.lookup(name)
except LookupError:
# The standard encoding error does not indicate the encoding
raise LookupError("Unknown encoding: "+name)
return name


class IOBinding:
# One instance per editor Window so methods know which to save, close.
Expand All @@ -78,7 +33,7 @@ def __init__(self, editwin):
self.save_as)
self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
self.save_a_copy)
self.fileencoding = None
self.fileencoding = 'utf-8'
self.__id_print = self.text.bind("<<print-window>>", self.print_window)

def close(self):
Expand Down Expand Up @@ -165,34 +120,44 @@ def open(self, event=None, editFile=None):
self.text.focus_set()
return "break"

eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
eol_re = re.compile(eol)
eol_convention = os.linesep # default

def loadfile(self, filename):
try:
# open the file in binary mode so that we can handle
# end-of-line convention ourselves.
with open(filename, 'rb') as f:
two_lines = f.readline() + f.readline()
f.seek(0)
bytes = f.read()
except OSError as msg:
tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
try:
with tokenize.open(filename) as f:
chars = f.read()
fileencoding = f.encoding
eol_convention = f.newlines
converted = False
except (UnicodeDecodeError, SyntaxError):
# Wait for the editor window to appear
self.editwin.text.update()
enc = askstring(
"Specify file encoding",
"The file's encoding is invalid for Python 3.x.\n"
"IDLE will convert it to UTF-8.\n"
"What is the current encoding of the file?",
initialvalue='utf-8',
parent=self.editwin.text)
with open(filename, encoding=enc) as f:
chars = f.read()
fileencoding = f.encoding
eol_convention = f.newlines
converted = True
except OSError as err:
tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
return False
chars, converted = self._decode(two_lines, bytes)
if chars is None:
except UnicodeDecodeError:
tkMessageBox.showerror("Decoding Error",
"File %s\nFailed to Decode" % filename,
parent=self.text)
return False
# We now convert all end-of-lines to '\n's
firsteol = self.eol_re.search(chars)
if firsteol:
self.eol_convention = firsteol.group(0)
chars = self.eol_re.sub(r"\n", chars)

self.text.delete("1.0", "end")
self.set_filename(None)
self.fileencoding = fileencoding
self.eol_convention = eol_convention
self.text.insert("1.0", chars)
self.reset_undo()
self.set_filename(filename)
Expand All @@ -205,74 +170,6 @@ def loadfile(self, filename):
self.updaterecentfileslist(filename)
return True

def _decode(self, two_lines, bytes):
"Create a Unicode string."
chars = None
# Check presence of a UTF-8 signature first
if bytes.startswith(BOM_UTF8):
try:
chars = bytes[3:].decode("utf-8")
except UnicodeDecodeError:
# has UTF-8 signature, but fails to decode...
return None, False
else:
# Indicates that this file originally had a BOM
self.fileencoding = 'BOM'
return chars, False
# Next look for coding specification
try:
enc = coding_spec(two_lines)
except LookupError as name:
tkMessageBox.showerror(
title="Error loading the file",
message="The encoding '%s' is not known to this Python "\
"installation. The file may not display correctly" % name,
parent = self.text)
enc = None
except UnicodeDecodeError:
return None, False
if enc:
try:
chars = str(bytes, enc)
self.fileencoding = enc
return chars, False
except UnicodeDecodeError:
pass
# Try ascii:
try:
chars = str(bytes, 'ascii')
self.fileencoding = None
return chars, False
except UnicodeDecodeError:
pass
# Try utf-8:
try:
chars = str(bytes, 'utf-8')
self.fileencoding = 'utf-8'
return chars, False
except UnicodeDecodeError:
pass
# Finally, try the locale's encoding. This is deprecated;
# the user should declare a non-ASCII encoding
try:
# Wait for the editor window to appear
self.editwin.text.update()
enc = askstring(
"Specify file encoding",
"The file's encoding is invalid for Python 3.x.\n"
"IDLE will convert it to UTF-8.\n"
"What is the current encoding of the file?",
initialvalue = encoding,
parent = self.editwin.text)

if enc:
chars = str(bytes, enc)
self.fileencoding = None
return chars, True
except (UnicodeDecodeError, LookupError):
pass
return None, False # None on failure

def maybesave(self):
if self.get_saved():
return "yes"
Expand Down Expand Up @@ -360,38 +257,30 @@ def encode(self, chars):
# text to us. Don't try to guess further.
return chars
# Preserve a BOM that might have been present on opening
if self.fileencoding == 'BOM':
return BOM_UTF8 + chars.encode("utf-8")
if self.fileencoding == 'utf-8-sig':
return chars.encode('utf-8-sig')
# See whether there is anything non-ASCII in it.
# If not, no need to figure out the encoding.
try:
return chars.encode('ascii')
except UnicodeError:
except UnicodeEncodeError:
pass
# Check if there is an encoding declared
try:
# a string, let coding_spec slice it to the first two lines
enc = coding_spec(chars)
failed = None
except LookupError as msg:
failed = msg
enc = None
else:
if not enc:
# PEP 3120: default source encoding is UTF-8
enc = 'utf-8'
if enc:
try:
return chars.encode(enc)
except UnicodeError:
failed = "Invalid encoding '%s'" % enc
encoded = chars.encode('ascii', 'replace')
enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
return chars.encode(enc)
except SyntaxError as err:
failed = str(err)
except UnicodeEncodeError:
failed = "Invalid encoding '%s'" % enc
tkMessageBox.showerror(
"I/O Error",
"%s.\nSaving as UTF-8" % failed,
parent = self.text)
parent=self.text)
# Fallback: save as UTF-8, with BOM - ignoring the incorrect
# declared encoding
return BOM_UTF8 + chars.encode("utf-8")
return chars.encode('utf-8-sig')

def print_window(self, event):
confirm = tkMessageBox.askokcancel(
Expand Down