Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 447d33e

Browse files
committed
Implement PEP 3120.
1 parent 5de17db commit 447d33e

5 files changed

Lines changed: 69 additions & 6 deletions

File tree

Lib/test/badsyntax_pep3120.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
print("b�se")

Lib/test/test_pep3120.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# This file is marked as binary in the CVS, to prevent MacCVS from recoding it.
2+
3+
import unittest
4+
from test import test_support
5+
6+
class PEP3120Test(unittest.TestCase):
7+
8+
def test_pep3120(self):
9+
self.assertEqual(
10+
"Питон".encode("utf-8"),
11+
b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd'
12+
)
13+
self.assertEqual(
14+
"\П".encode("utf-8"),
15+
b'\\\xd0\x9f'
16+
)
17+
18+
def test_badsyntax(self):
19+
try:
20+
import test.badsyntax_pep3120
21+
except SyntaxError as msg:
22+
self.assert_(str(msg).find("Non-UTF-8 code starting with") >= 0)
23+
else:
24+
self.fail("expected exception didn't occur")
25+
26+
def test_main():
    """Run the PEP3120Test case through test_support.run_unittest."""
    test_support.run_unittest(PEP3120Test)


if __name__ == "__main__":
    test_main()

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ TO DO
2626
Core and Builtins
2727
-----------------
2828

29+
- PEP 3120: Change default source encoding to UTF-8.
30+
2931
- PEP 3123: Use proper C inheritance for PyObject.
3032

3133
- Removed the __oct__ and __hex__ special methods and added a bin()

Parser/tokenizer.c

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,34 @@ static void fp_ungetc(int c, struct tok_state *tok) {
444444
ungetc(c, tok->fp);
445445
}
446446

447+
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence.  s must point into a NUL-terminated buffer.

   Return the number of bytes forming the sequence (1 to 4) if yes,
   0 if not.  Lead bytes 0xC0/0xC1 (which could only encode overlong
   forms) and 0xF5 and above (beyond U+10FFFF) are rejected, as
   required by RFC 3629.  Range restrictions on the second byte
   (overlong 3-/4-byte forms, surrogates) are not checked here. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC2)
        /* 0x80..0xBF: stray following byte;
           0xC0..0xC1: can only start an overlong encoding */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF5)
        expected = 3;
    else
        return 0;
    /* Scan forward: if the NUL terminator cuts the sequence short it
       is seen (and rejected) before any byte beyond it is read. */
    for (i = 1; i <= expected; i++)
        if ((s[i] & 0xC0) != 0x80)
            return 0;
    return expected + 1;
}
474+
447475
/* Read a line of input from TOK. Determine encoding
448476
if necessary. */
449477

@@ -478,12 +506,13 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
478506
}
479507
}
480508
#ifndef PGEN
481-
/* The default encoding is ASCII, so make sure we don't have any
482-
non-ASCII bytes in it. */
509+
/* The default encoding is UTF-8, so make sure we don't have any
510+
non-UTF-8 sequences in it. */
483511
if (line && !tok->encoding) {
484512
unsigned char *c;
485-
for (c = (unsigned char *)line; *c; c++)
486-
if (*c > 127) {
513+
int length;
514+
for (c = (unsigned char *)line; *c; c += length)
515+
if (!(length = valid_utf8(c))) {
487516
badchar = *c;
488517
break;
489518
}
@@ -493,7 +522,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
493522
/* Need to add 1 to the line number, since this line
494523
has not been counted, yet. */
495524
sprintf(buf,
496-
"Non-ASCII character '\\x%.2x' "
525+
"Non-UTF-8 code starting with '\\x%.2x' "
497526
"in file %.200s on line %i, "
498527
"but no encoding declared; "
499528
"see http://www.python.org/peps/pep-0263.html for details",

Python/ast.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,8 @@ PyAST_FromNode(const node *n, PyCompilerFlags *flags, const char *filename,
203203
c.c_encoding = STR(n);
204204
n = CHILD(n, 0);
205205
} else {
206-
c.c_encoding = NULL;
206+
/* PEP 3120 */
207+
c.c_encoding = "utf-8";
207208
}
208209
c.c_arena = arena;
209210

0 commit comments

Comments
 (0)