import codecs
from codecs import BOM_UTF8
import io
import os
import re
import shlex
import sys
import tempfile
import tokenize

import tkinter.filedialog as tkFileDialog
import tkinter.messagebox as tkMessageBox
2018 errors = 'surrogateescape'
2119
2220
# PEP 263 declaration: a comment containing "coding:" or "coding=" followed
# by the encoding name, which is captured in group 1.
coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A line whose first non-whitespace character is '#', '\r' or '\n', or which
# is empty; only such a line may precede the declaration.
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)


def coding_spec(data):
    """Return the encoding declaration according to PEP 263.

    *data* may be ``str`` or ``bytes``.  Only the first two lines are
    examined.  When checking encoded data, only the first two lines should
    be passed in to avoid a UnicodeDecodeError if the rest of the data is
    not unicode.

    Return the declared encoding name, or None when no declaration is found
    or when a non-blank, non-comment line precedes it.

    Raise a LookupError if the encoding is declared but unknown.
    """
    if isinstance(data, bytes):
        # This encoding might be wrong.  However, the coding spec must be
        # ASCII-only, so any non-ASCII characters around here will be
        # ignored.  Decoding to Latin-1 should never fail.
        lines = data.decode('iso-8859-1')
    else:
        lines = data
    # Consider only the first two lines.  The delimiter must be the bare
    # line terminator: with any extra character in it the text is never
    # split and a declaration on the second line goes undetected.
    if '\n' in lines:
        lst = lines.split('\n', 2)[:2]
    elif '\r' in lines:
        lst = lines.split('\r', 2)[:2]
    else:
        lst = [lines]
    for line in lst:
        match = coding_re.match(line)
        if match is not None:
            break
        if not blank_re.match(line):
            # Real code before any declaration: there is none.
            return None
    else:
        # Both lines were blank or plain comments.
        return None
    name = match.group(1)
    try:
        codecs.lookup(name)
    except LookupError:
        # The standard encoding error does not indicate the encoding
        raise LookupError("Unknown encoding: " + name)
    return name
65-
6621
6722class IOBinding :
6823# One instance per editor Window so methods know which to save, close.
@@ -78,7 +33,7 @@ def __init__(self, editwin):
7833 self .save_as )
7934 self .__id_savecopy = self .text .bind ("<<save-copy-of-window-as-file>>" ,
8035 self .save_a_copy )
81- self .fileencoding = None
36+ self .fileencoding = 'utf-8'
8237 self .__id_print = self .text .bind ("<<print-window>>" , self .print_window )
8338
8439 def close (self ):
@@ -165,34 +120,44 @@ def open(self, event=None, editFile=None):
165120 self .text .focus_set ()
166121 return "break"
167122
# Pattern matching one line terminator; "\r\n" is listed first so that a
# Windows line ending is consumed as a single terminator.
eol = r"(\r\n)|\n|\r"  # \r\n (Windows), \n (UNIX), or \r (Mac)
eol_re = re.compile(eol)
# Line terminator assumed until a loaded file shows another one.
eol_convention = os.linesep  # default
171124
172125 def loadfile (self , filename ):
173126 try :
174- # open the file in binary mode so that we can handle
175- # end-of-line convention ourselves.
176- with open (filename , 'rb' ) as f :
177- two_lines = f .readline () + f .readline ()
178- f .seek (0 )
179- bytes = f .read ()
180- except OSError as msg :
181- tkMessageBox .showerror ("I/O Error" , str (msg ), parent = self .text )
127+ try :
128+ with tokenize .open (filename ) as f :
129+ chars = f .read ()
130+ fileencoding = f .encoding
131+ eol_convention = f .newlines
132+ converted = False
133+ except (UnicodeDecodeError , SyntaxError ):
134+ # Wait for the editor window to appear
135+ self .editwin .text .update ()
136+ enc = askstring (
137+ "Specify file encoding" ,
138+ "The file's encoding is invalid for Python 3.x.\n "
139+ "IDLE will convert it to UTF-8.\n "
140+ "What is the current encoding of the file?" ,
141+ initialvalue = 'utf-8' ,
142+ parent = self .editwin .text )
143+ with open (filename , encoding = enc ) as f :
144+ chars = f .read ()
145+ fileencoding = f .encoding
146+ eol_convention = f .newlines
147+ converted = True
148+ except OSError as err :
149+ tkMessageBox .showerror ("I/O Error" , str (err ), parent = self .text )
182150 return False
183- chars , converted = self ._decode (two_lines , bytes )
184- if chars is None :
151+ except UnicodeDecodeError :
185152 tkMessageBox .showerror ("Decoding Error" ,
186153 "File %s\n Failed to Decode" % filename ,
187154 parent = self .text )
188155 return False
189- # We now convert all end-of-lines to '\n's
190- firsteol = self .eol_re .search (chars )
191- if firsteol :
192- self .eol_convention = firsteol .group (0 )
193- chars = self .eol_re .sub (r"\n" , chars )
156+
194157 self .text .delete ("1.0" , "end" )
195158 self .set_filename (None )
159+ self .fileencoding = fileencoding
160+ self .eol_convention = eol_convention
196161 self .text .insert ("1.0" , chars )
197162 self .reset_undo ()
198163 self .set_filename (filename )
@@ -205,74 +170,6 @@ def loadfile(self, filename):
205170 self .updaterecentfileslist (filename )
206171 return True
207172
def _decode(self, two_lines, bytes):
    """Create a Unicode string from the raw contents of a file.

    two_lines -- the first two lines of the file, as bytes; they are
                 searched for a coding declaration.
    bytes     -- the complete contents of the file, as bytes.

    Return a pair ``(chars, converted)``.  ``chars`` is the decoded text,
    or None when every attempt failed.  ``converted`` is True only when
    the text was decoded with an encoding typed in by the user.

    On success ``self.fileencoding`` is set to 'BOM' (UTF-8 signature
    present), to the declared encoding, to 'utf-8', or to None (plain
    ASCII, or an encoding supplied by the user).
    """
    # Check presence of a UTF-8 signature first
    if bytes.startswith(BOM_UTF8):
        try:
            chars = bytes[len(BOM_UTF8):].decode("utf-8")
        except UnicodeDecodeError:
            # has UTF-8 signature, but fails to decode...
            return None, False
        # Indicates that this file originally had a BOM
        self.fileencoding = 'BOM'
        return chars, False

    # Next look for coding specification
    try:
        enc = coding_spec(two_lines)
    except LookupError as err:
        tkMessageBox.showerror(
            title="Error loading the file",
            message="The encoding '%s' is not known to this Python "
                    "installation. The file may not display correctly" % err,
            parent=self.text)
        enc = None
    except UnicodeDecodeError:
        return None, False
    if enc:
        try:
            chars = str(bytes, enc)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the declared codec exists but is not a text
            # encoding (e.g. "hex"); fall back to guessing instead of
            # letting the exception escape.
            pass
        else:
            self.fileencoding = enc
            return chars, False

    # Try ascii, then utf-8.  Plain ASCII is recorded as "no encoding".
    for codec, recorded in (('ascii', None), ('utf-8', 'utf-8')):
        try:
            chars = str(bytes, codec)
        except UnicodeDecodeError:
            continue
        self.fileencoding = recorded
        return chars, False

    # Finally, ask the user.  This is deprecated;
    # the user should declare a non-ASCII encoding
    try:
        # Wait for the editor window to appear
        self.editwin.text.update()
        enc = askstring(
            "Specify file encoding",
            "The file's encoding is invalid for Python 3.x.\n"
            "IDLE will convert it to UTF-8.\n"
            "What is the current encoding of the file?",
            initialvalue=encoding,
            parent=self.editwin.text)

        if enc:
            chars = str(bytes, enc)
            self.fileencoding = None
            return chars, True
    except (UnicodeDecodeError, LookupError):
        pass
    return None, False  # None on failure
275-
276173 def maybesave (self ):
277174 if self .get_saved ():
278175 return "yes"
@@ -360,38 +257,30 @@ def encode(self, chars):
360257 # text to us. Don't try to guess further.
361258 return chars
362259 # Preserve a BOM that might have been present on opening
363- if self .fileencoding == 'BOM ' :
364- return BOM_UTF8 + chars .encode (" utf-8" )
260+ if self .fileencoding == 'utf-8-sig ' :
261+ return chars .encode (' utf-8-sig' )
365262 # See whether there is anything non-ASCII in it.
366263 # If not, no need to figure out the encoding.
367264 try :
368265 return chars .encode ('ascii' )
369- except UnicodeError :
266+ except UnicodeEncodeError :
370267 pass
371268 # Check if there is an encoding declared
372269 try :
373- # a string, let coding_spec slice it to the first two lines
374- enc = coding_spec (chars )
375- failed = None
376- except LookupError as msg :
377- failed = msg
378- enc = None
379- else :
380- if not enc :
381- # PEP 3120: default source encoding is UTF-8
382- enc = 'utf-8'
383- if enc :
384- try :
385- return chars .encode (enc )
386- except UnicodeError :
387- failed = "Invalid encoding '%s'" % enc
270+ encoded = chars .encode ('ascii' , 'replace' )
271+ enc , _ = tokenize .detect_encoding (io .BytesIO (encoded ).readline )
272+ return chars .encode (enc )
273+ except SyntaxError as err :
274+ failed = str (err )
275+ except UnicodeEncodeError :
276+ failed = "Invalid encoding '%s'" % enc
388277 tkMessageBox .showerror (
389278 "I/O Error" ,
390279 "%s.\n Saving as UTF-8" % failed ,
391- parent = self .text )
280+ parent = self .text )
392281 # Fallback: save as UTF-8, with BOM - ignoring the incorrect
393282 # declared encoding
394- return BOM_UTF8 + chars .encode (" utf-8" )
283+ return chars .encode (' utf-8-sig' )
395284
396285 def print_window (self , event ):
397286 confirm = tkMessageBox .askokcancel (
0 commit comments