Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit f741a9d

Browse files
committed
Add comments about handled code ranges in each branch
1 parent 18927b1 commit f741a9d

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed

Parser/tokenizer.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,19 +501,35 @@ valid_utf8(const unsigned char* s)
501501
/* single-byte code */
502502
return 1;
503503
} else if (*s < 0xE0) {
504+
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
504505
if (*s < 0xC2) {
506+
/* invalid sequence
507+
\x80-\xBF -- continuation byte
508+
\xC0-\xC1 -- fake 0000-007F */
505509
return 0;
506510
}
507511
expected = 1;
508512
} else if (*s < 0xF0) {
513+
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
509514
if (*s == 0xE0 && *(s + 1) < 0xA0) {
515+
/* invalid sequence
516+
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-07FF */
510517
return 0;
511518
} else if (*s == 0xED && *(s + 1) >= 0xA0) {
519+
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
520+
will result in surrogates in range D800-DFFF. Surrogates are
521+
not valid UTF-8 so they are rejected.
522+
See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
523+
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
512524
return 0;
513525
}
514526
expected = 2;
515527
} else if (*s < 0xF5) {
528+
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
516529
if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
530+
/* invalid sequence -- one of:
531+
\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
532+
\xF4\x90\x80\x80- -- 110000- overflow */
517533
return 0;
518534
}
519535
expected = 3;

0 commit comments

Comments
 (0)