Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 768c16c

Browse files
Issue #18960: Fix bugs with Python source code encoding in the second line.
* The first line of a Python script could be executed twice when the source encoding (not equal to 'utf-8') was specified on the second line. * Now the source encoding declaration on the second line isn't effective if the first line contains anything except a comment. * As a consequence, 'python -x' works now again with files with the source encoding declarations specified on the second line, and can be used again to make Python batch files on Windows. * The tokenize module now ignores the source encoding declaration on the second line if the first line contains anything except a comment. * IDLE now ignores the source encoding declaration on the second line if the first line contains anything except a comment. * 2to3 and the findnocoding.py script now ignore the source encoding declaration on the second line if the first line contains anything except a comment.
1 parent 21e7d4c commit 768c16c

7 files changed

Lines changed: 87 additions & 5 deletions

File tree

Lib/idlelib/IOBinding.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
### 'encoding' is used below in encode(), check!
6565

6666
coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
67+
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
6768

6869
def coding_spec(data):
6970
"""Return the encoding declaration according to PEP 263.
@@ -93,6 +94,8 @@ def coding_spec(data):
9394
match = coding_re.match(line)
9495
if match is not None:
9596
break
97+
if not blank_re.match(line):
98+
return None
9699
else:
97100
return None
98101
name = match.group(1)

Lib/lib2to3/pgen2/tokenize.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ def compat(self, token, iterable):
237237
toks_append(tokval)
238238

239239
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
240+
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
240241

241242
def _get_normal_name(orig_enc):
242243
"""Imitates get_normal_name in tokenizer.c."""
@@ -309,6 +310,8 @@ def find_cookie(line):
309310
encoding = find_cookie(first)
310311
if encoding:
311312
return encoding, [first]
313+
if not blank_re.match(first):
314+
return default, [first]
312315

313316
second = read_or_stop()
314317
if not second:

Lib/test/test_tokenize.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,39 @@ def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
885885
readline = self.get_readline(lines)
886886
self.assertRaises(SyntaxError, detect_encoding, readline)
887887

888+
def test_cookie_second_line_noncommented_first_line(self):
889+
lines = (
890+
b"print('\xc2\xa3')\n",
891+
b'# vim: set fileencoding=iso8859-15 :\n',
892+
b"print('\xe2\x82\xac')\n"
893+
)
894+
encoding, consumed_lines = detect_encoding(self.get_readline(lines))
895+
self.assertEqual(encoding, 'utf-8')
896+
expected = [b"print('\xc2\xa3')\n"]
897+
self.assertEqual(consumed_lines, expected)
898+
899+
def test_cookie_second_line_commented_first_line(self):
900+
lines = (
901+
b"#print('\xc2\xa3')\n",
902+
b'# vim: set fileencoding=iso8859-15 :\n',
903+
b"print('\xe2\x82\xac')\n"
904+
)
905+
encoding, consumed_lines = detect_encoding(self.get_readline(lines))
906+
self.assertEqual(encoding, 'iso8859-15')
907+
expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
908+
self.assertEqual(consumed_lines, expected)
909+
910+
def test_cookie_second_line_empty_first_line(self):
911+
lines = (
912+
b'\n',
913+
b'# vim: set fileencoding=iso8859-15 :\n',
914+
b"print('\xe2\x82\xac')\n"
915+
)
916+
encoding, consumed_lines = detect_encoding(self.get_readline(lines))
917+
self.assertEqual(encoding, 'iso8859-15')
918+
expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
919+
self.assertEqual(consumed_lines, expected)
920+
888921
def test_latin1_normalization(self):
889922
# See get_normal_name() in tokenizer.c.
890923
encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",

Lib/tokenize.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import collections
3333
from io import TextIOWrapper
3434
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
35+
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
3536

3637
import token
3738
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
@@ -409,6 +410,8 @@ def find_cookie(line):
409410
encoding = find_cookie(first)
410411
if encoding:
411412
return encoding, [first]
413+
if not blank_re.match(first):
414+
return default, [first]
412415

413416
second = read_or_stop()
414417
if not second:

Misc/NEWS

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@ What's New in Python 3.3.4 release candidate 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #18960: The first line of Python script could be executed twice when
14+
the source encoding was specified on the second line. Now the source encoding
15+
declaration on the second line isn't effective if the first line contains
16+
anything except a comment. 'python -x' works now again with files with the
17+
source encoding declarations, and can be used to make Python batch files
18+
on Windows.
19+
1320
- Issue #19081: When a zipimport .zip file in sys.path being imported from
1421
is modified during the lifetime of the Python process after zipimport has
1522
already cached the zip's table of contents we detect this and recover
@@ -36,6 +43,9 @@ Core and Builtins
3643
Library
3744
-------
3845

46+
- Issue #18960: The tokenize module now ignores the source encoding declaration
47+
on the second line if the first line contains anything except a comment.
48+
3949
- Issue #20078: Reading malformed zipfiles no longer hangs with 100% CPU
4050
consumption.
4151

@@ -204,6 +214,9 @@ Library
204214
IDLE
205215
----
206216

217+
- Issue #18960: IDLE now ignores the source encoding declaration on the second
218+
line if the first line contains anything except a comment.
219+
207220
- Issue #20058: sys.stdin.readline() in IDLE now always returns only one line.
208221

209222
- Issue #19481: print() of string subclass instance in IDLE no longer hangs.
@@ -281,6 +294,13 @@ Build
281294
- Add workaround for VS 2010 nmake clean issue. VS 2010 doesn't set up PATH
282295
for nmake.exe correctly.
283296

297+
Tools/Demos
298+
-----------
299+
300+
- Issue #18960: 2to3 and the findnocoding.py script now ignore the source
301+
encoding declaration on the second line if the first line contains anything
302+
except a comment.
303+
284304

285305
What's New in Python 3.3.3?
286306
===========================

Parser/tokenizer.c

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -283,13 +283,27 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
283283
char *cs;
284284
int r = 1;
285285

286-
if (tok->cont_line)
286+
if (tok->cont_line) {
287287
/* It's a continuation line, so it can't be a coding spec. */
288+
tok->read_coding_spec = 1;
288289
return 1;
290+
}
289291
if (!get_coding_spec(line, &cs, size, tok))
290292
return 0;
291-
if (!cs)
293+
if (!cs) {
294+
Py_ssize_t i;
295+
for (i = 0; i < size; i++) {
296+
if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
297+
break;
298+
if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
299+
/* Stop checking coding spec after a line containing
300+
* anything except a comment. */
301+
tok->read_coding_spec = 1;
302+
break;
303+
}
304+
}
292305
return 1;
306+
}
293307
tok->read_coding_spec = 1;
294308
if (tok->encoding == NULL) {
295309
assert(tok->decoding_state == STATE_RAW);
@@ -476,13 +490,17 @@ fp_setreadl(struct tok_state *tok, const char* enc)
476490
_Py_IDENTIFIER(open);
477491
_Py_IDENTIFIER(readline);
478492
int fd;
493+
long pos;
479494

480495
io = PyImport_ImportModuleNoBlock("io");
481496
if (io == NULL)
482497
goto cleanup;
483498

484499
fd = fileno(tok->fp);
485-
if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
500+
/* Due to buffering the file offset for fd can be different from the file
501+
* position of tok->fp. */
502+
pos = ftell(tok->fp);
503+
if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) {
486504
PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
487505
goto cleanup;
488506
}
@@ -751,7 +769,7 @@ decode_str(const char *input, int single, struct tok_state *tok)
751769
if (newl[0]) {
752770
if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
753771
return error_ret(tok);
754-
if (tok->enc == NULL && newl[1]) {
772+
if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
755773
if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
756774
tok, buf_setreadl))
757775
return error_ret(tok);

Tools/scripts/findnocoding.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def walk_python_files(self, paths, *args, **kwargs):
3333

3434

3535
decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
36+
blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)')
3637

3738
def get_declaration(line):
3839
match = decl_re.match(line)
@@ -58,7 +59,8 @@ def needs_declaration(fullpath):
5859
line1 = infile.readline()
5960
line2 = infile.readline()
6061

61-
if get_declaration(line1) or get_declaration(line2):
62+
if (get_declaration(line1) or
63+
blank_re.match(line1) and get_declaration(line2)):
6264
# the file does have an encoding declaration, so trust it
6365
return False
6466

0 commit comments

Comments
 (0)