Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit e12896e

Browse files
committed
New surrogate support in the UTF-8 codec. By Bill Tutt.
1 parent d6d06ad commit e12896e

1 file changed

Lines changed: 80 additions & 29 deletions

File tree

Objects/unicodeobject.c

Lines changed: 80 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -657,10 +657,10 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
657657
e = s + size;
658658

659659
while (s < e) {
660-
register Py_UNICODE ch = (unsigned char)*s;
660+
Py_UCS4 ch = (unsigned char)*s;
661661

662662
if (ch < 0x80) {
663-
*p++ = ch;
663+
*p++ = (Py_UNICODE)ch;
664664
s++;
665665
continue;
666666
}
@@ -687,7 +687,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
687687
if (ch < 0x80)
688688
UTF8_ERROR("illegal encoding");
689689
else
690-
*p++ = ch;
690+
*p++ = (Py_UNICODE)ch;
691691
break;
692692

693693
case 3:
@@ -698,7 +698,30 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
698698
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
699699
UTF8_ERROR("illegal encoding");
700700
else
701-
*p++ = ch;
701+
*p++ = (Py_UNICODE)ch;
702+
break;
703+
704+
case 4:
705+
if ((s[1] & 0xc0) != 0x80 ||
706+
(s[2] & 0xc0) != 0x80 ||
707+
(s[3] & 0xc0) != 0x80)
708+
UTF8_ERROR("invalid data");
709+
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
710+
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
711+
/* validate and convert to UTF-16 */
712+
if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
713+
(ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
714+
UTF8_ERROR("illegal encoding");
715+
/* compute and append the two surrogates: */
716+
717+
/* translate from 10000..10FFFF to 0..FFFFF */
718+
ch -= 0x10000;
719+
720+
/* high surrogate = top 10 bits added to D800 */
721+
*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
722+
723+
/* low surrogate = bottom 10 bits added to DC00 */
724+
*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
702725
break;
703726

704727
default:
@@ -758,32 +781,60 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
758781
PyObject *v;
759782
char *p;
760783
char *q;
784+
Py_UCS4 ch2;
785+
unsigned int cbAllocated = 3 * size;
786+
unsigned int cbWritten = 0;
787+
int i = 0;
761788

762-
v = PyString_FromStringAndSize(NULL, 3 * size);
789+
v = PyString_FromStringAndSize(NULL, cbAllocated);
763790
if (v == NULL)
764791
return NULL;
765792
if (size == 0)
766793
goto done;
767794

768795
p = q = PyString_AS_STRING(v);
769-
while (size-- > 0) {
770-
Py_UNICODE ch = *s++;
771-
if (ch < 0x80)
796+
while (i < size) {
797+
Py_UCS4 ch = s[i++];
798+
if (ch < 0x80) {
772799
*p++ = (char) ch;
800+
cbWritten++;
801+
}
773802
else if (ch < 0x0800) {
774803
*p++ = 0xc0 | (ch >> 6);
775804
*p++ = 0x80 | (ch & 0x3f);
776-
} else if (0xD800 <= ch && ch <= 0xDFFF) {
777-
/* These byte ranges are reserved for UTF-16 surrogate
778-
bytes which the Python implementation currently does
779-
not support. */
780-
if (utf8_encoding_error(&s, &p, errors,
781-
"unsupported code range"))
805+
cbWritten += 2;
806+
}
807+
else {
808+
/* Check for high surrogate */
809+
if (0xD800 <= ch && ch <= 0xDBFF) {
810+
if (i != size) {
811+
ch2 = s[i];
812+
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
813+
814+
if (cbWritten >= (cbAllocated - 4)) {
815+
/* Provide enough room for some more
816+
surrogates */
817+
cbAllocated += 4*10;
818+
if (_PyString_Resize(&v, cbAllocated))
782819
goto onError;
783-
} else {
784-
*p++ = 0xe0 | (ch >> 12);
785-
*p++ = 0x80 | ((ch >> 6) & 0x3f);
786-
*p++ = 0x80 | (ch & 0x3f);
820+
}
821+
822+
/* combine the two values */
823+
ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
824+
825+
*p++ = (char)((ch >> 18) | 0xf0);
826+
*p++ = (char)(0x80 | (ch >> 12) & 0x3f);
827+
i++;
828+
cbWritten += 4;
829+
}
830+
}
831+
}
832+
else {
833+
*p++ = (char)(0xe0 | (ch >> 12));
834+
cbWritten += 3;
835+
}
836+
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
837+
*p++ = (char)(0x80 | (ch & 0x3f));
787838
}
788839
}
789840
*p = '\0';
@@ -1217,7 +1268,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12171268
{
12181269
const char *start = s + 1;
12191270
const char *endBrace = start;
1220-
unsigned int uiValue;
1271+
Py_UCS4 value;
12211272
unsigned long j;
12221273

12231274
/* look for either the closing brace, or we
@@ -1248,25 +1299,25 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12481299
}
12491300
goto ucnFallthrough;
12501301
}
1251-
uiValue = ((_Py_UnicodeCharacterName *)
1252-
(pucnHash->getValue(j)))->uiValue;
1253-
if (uiValue < 1<<16)
1302+
value = ((_Py_UnicodeCharacterName *)
1303+
(pucnHash->getValue(j)))->value;
1304+
if (value < 1<<16)
12541305
{
12551306
/* In UCS-2 range, easy solution.. */
1256-
*p++ = uiValue;
1307+
*p++ = value;
12571308
}
12581309
else
12591310
{
12601311
/* Oops, it's in UCS-4 space, */
12611312
/* compute and append the two surrogates: */
12621313
/* translate from 10000..10FFFF to 0..FFFFF */
1263-
uiValue -= 0x10000;
1314+
value -= 0x10000;
12641315

12651316
/* high surrogate = top 10 bits added to D800 */
1266-
*p++ = 0xD800 + (uiValue >> 10);
1317+
*p++ = 0xD800 + (value >> 10);
12671318

12681319
/* low surrogate = bottom 10 bits added to DC00 */
1269-
*p++ = 0xDC00 + (uiValue & ~0xFC00);
1320+
*p++ = 0xDC00 + (value & ~0xFC00);
12701321
}
12711322
s = endBrace + 1;
12721323
}
@@ -3091,12 +3142,12 @@ unicode_center(PyUnicodeObject *self, PyObject *args)
30913142
/* gleaned from: */
30923143
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
30933144

3094-
static unsigned long utf16Fixup[32] =
3145+
static short utf16Fixup[32] =
30953146
{
30963147
0, 0, 0, 0, 0, 0, 0, 0,
30973148
0, 0, 0, 0, 0, 0, 0, 0,
30983149
0, 0, 0, 0, 0, 0, 0, 0,
3099-
0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800
3150+
0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
31003151
};
31013152

31023153
static int
@@ -3111,7 +3162,7 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
31113162
len2 = str2->length;
31123163

31133164
while (len1 > 0 && len2 > 0) {
3114-
unsigned long c1, c2;
3165+
Py_UNICODE c1, c2;
31153166
long diff;
31163167

31173168
c1 = *s1++;

0 commit comments

Comments
 (0)