@@ -444,6 +444,34 @@ static void fp_ungetc(int c, struct tok_state *tok) {
444444 ungetc (c , tok -> fp );
445445}
446446
/* Check whether the bytes at s start a well-formed UTF-8 sequence.

   s must point into a NUL-terminated buffer.  Returns the number of
   bytes (1 to 4) forming the sequence if it is well-formed, 0 if not.

   Besides the basic lead/continuation byte structure, the encodings
   that RFC 3629 forbids are rejected:
     - overlong forms (lead bytes 0xC0/0xC1, E0 80..9F, F0 80..8F),
     - UTF-16 surrogates U+D800..U+DFFF (ED A0..BF),
     - code points above U+10FFFF (F4 90.. and lead bytes >= 0xF5). */
static int valid_utf8(const unsigned char *s)
{
    int expected;   /* number of continuation bytes after the lead byte */
    int i;

    if (*s < 0x80) {
        /* single-byte (ASCII) code */
        return 1;
    }
    if (*s < 0xC2) {
        /* 0x80..0xBF: stray continuation byte;
           0xC0..0xC1: overlong encoding of an ASCII code point */
        return 0;
    }

    /* From here on *s is non-NUL, so s[1] is always readable: at worst
       it is the terminating NUL. */
    if (*s < 0xE0) {
        expected = 1;
    }
    else if (*s < 0xF0) {
        if (*s == 0xE0 && s[1] < 0xA0) {
            /* overlong encoding of a code point below U+0800 */
            return 0;
        }
        if (*s == 0xED && s[1] >= 0xA0) {
            /* UTF-16 surrogate range U+D800..U+DFFF */
            return 0;
        }
        expected = 2;
    }
    else if (*s < 0xF5) {
        if (*s == 0xF0 && s[1] < 0x90) {
            /* overlong encoding of a code point below U+10000 */
            return 0;
        }
        if (*s == 0xF4 && s[1] >= 0x90) {
            /* beyond U+10FFFF */
            return 0;
        }
        expected = 3;
    }
    else {
        /* 0xF5..0xFF can never start a UTF-8 sequence */
        return 0;
    }

    /* Walk the continuation bytes front to back.  s[i] is only read
       after s[i-1] was found to be non-NUL, so a sequence truncated by
       the string terminator never causes a read past the terminator. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}

474+
447475/* Read a line of input from TOK. Determine encoding
448476 if necessary. */
449477
@@ -478,12 +506,13 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
478506 }
479507 }
480508#ifndef PGEN
481- /* The default encoding is ASCII , so make sure we don't have any
482- non-ASCII bytes in it. */
509+ /* The default encoding is UTF-8 , so make sure we don't have any
510+ non-UTF-8 sequences in it. */
483511 if (line && !tok -> encoding ) {
484512 unsigned char * c ;
485- for (c = (unsigned char * )line ; * c ; c ++ )
486- if (* c > 127 ) {
513+ int length ;
514+ for (c = (unsigned char * )line ; * c ; c += length )
515+ if (!(length = valid_utf8 (c ))) {
487516 badchar = * c ;
488517 break ;
489518 }
@@ -493,7 +522,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
493522 /* Need to add 1 to the line number, since this line
494523 has not been counted, yet. */
495524 sprintf (buf ,
496- "Non-ASCII character '\\x%.2x' "
525+ "Non-UTF-8 code starting with '\\x%.2x' "
497526 "in file %.200s on line %i, "
498527 "but no encoding declared; "
499528 "see http://www.python.org/peps/pep-0263.html for details" ,
0 commit comments