@@ -657,10 +657,10 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
657657 e = s + size ;
658658
659659 while (s < e ) {
660- register Py_UNICODE ch = (unsigned char )* s ;
660+ Py_UCS4 ch = (unsigned char )* s ;
661661
662662 if (ch < 0x80 ) {
663- * p ++ = ch ;
663+ * p ++ = ( Py_UNICODE ) ch ;
664664 s ++ ;
665665 continue ;
666666 }
@@ -687,7 +687,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
687687 if (ch < 0x80 )
688688 UTF8_ERROR ("illegal encoding" );
689689 else
690- * p ++ = ch ;
690+ * p ++ = ( Py_UNICODE ) ch ;
691691 break ;
692692
693693 case 3 :
@@ -698,7 +698,30 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
698698 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000 ))
699699 UTF8_ERROR ("illegal encoding" );
700700 else
701- * p ++ = ch ;
701+ * p ++ = (Py_UNICODE )ch ;
702+ break ;
703+
704+ case 4 :
705+ if ((s [1 ] & 0xc0 ) != 0x80 ||
706+ (s [2 ] & 0xc0 ) != 0x80 ||
707+ (s [3 ] & 0xc0 ) != 0x80 )
708+ UTF8_ERROR ("invalid data" );
709+ ch = ((s [0 ] & 0x7 ) << 18 ) + ((s [1 ] & 0x3f ) << 12 ) +
710+ ((s [2 ] & 0x3f ) << 6 ) + (s [3 ] & 0x3f );
711+ /* validate and convert to UTF-16 */
712+ if ((ch < 0x10000 ) || /* minimum value allowed for 4 byte encoding */
713+ (ch > 0x10ffff )) /* maximum value allowed for UTF-16 */
714+ UTF8_ERROR ("illegal encoding" );
715+ /* compute and append the two surrogates: */
716+
717+ /* translate from 10000..10FFFF to 0..FFFFF */
718+ ch -= 0x10000 ;
719+
720+ /* high surrogate = top 10 bits added to D800 */
721+ * p ++ = (Py_UNICODE )(0xD800 + (ch >> 10 ));
722+
723+ /* low surrogate = bottom 10 bits added to DC00 */
724+ * p ++ = (Py_UNICODE )(0xDC00 + (ch & ~0xFC00 ));
702725 break ;
703726
704727 default :
@@ -758,32 +781,60 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
758781 PyObject * v ;
759782 char * p ;
760783 char * q ;
784+ Py_UCS4 ch2 ;
785+ unsigned int cbAllocated = 3 * size ;
786+ unsigned int cbWritten = 0 ;
787+ int i = 0 ;
761788
762- v = PyString_FromStringAndSize (NULL , 3 * size );
789+ v = PyString_FromStringAndSize (NULL , cbAllocated );
763790 if (v == NULL )
764791 return NULL ;
765792 if (size == 0 )
766793 goto done ;
767794
768795 p = q = PyString_AS_STRING (v );
769- while (size -- > 0 ) {
770- Py_UNICODE ch = * s ++ ;
771- if (ch < 0x80 )
796+ while (i < size ) {
797+ Py_UCS4 ch = s [ i ++ ] ;
798+ if (ch < 0x80 ) {
772799 * p ++ = (char ) ch ;
800+ cbWritten ++ ;
801+ }
773802 else if (ch < 0x0800 ) {
774803 * p ++ = 0xc0 | (ch >> 6 );
775804 * p ++ = 0x80 | (ch & 0x3f );
776- } else if (0xD800 <= ch && ch <= 0xDFFF ) {
777- /* These byte ranges are reserved for UTF-16 surrogate
778- bytes which the Python implementation currently does
779- not support. */
780- if (utf8_encoding_error (& s , & p , errors ,
781- "unsupported code range" ))
805+ cbWritten += 2 ;
806+ }
807+ else {
808+ /* Check for high surrogate */
809+ if (0xD800 <= ch && ch <= 0xDBFF ) {
810+ if (i != size ) {
811+ ch2 = s [i ];
812+ if (0xDC00 <= ch2 && ch2 <= 0xDFFF ) {
813+
814+ if (cbWritten >= (cbAllocated - 4 )) {
815+ /* Provide enough room for some more
816+ surrogates */
817+ cbAllocated += 4 * 10 ;
818+ if (_PyString_Resize (& v , cbAllocated ))
782819 goto onError ;
783- } else {
784- * p ++ = 0xe0 | (ch >> 12 );
785- * p ++ = 0x80 | ((ch >> 6 ) & 0x3f );
786- * p ++ = 0x80 | (ch & 0x3f );
820+ }
821+
822+ /* combine the two values */
823+ ch = ((ch - 0xD800 )<<10 | (ch2 - 0xDC00 ))+ 0x10000 ;
824+
825+ * p ++ = (char )((ch >> 18 ) | 0xf0 );
826+ * p ++ = (char )(0x80 | (ch >> 12 ) & 0x3f );
827+ i ++ ;
828+ cbWritten += 4 ;
829+ }
830+ }
831+ }
832+ else {
833+ * p ++ = (char )(0xe0 | (ch >> 12 ));
834+ cbWritten += 3 ;
835+ }
836+ * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
837+ * p ++ = (char )(0x80 | (ch & 0x3f ));
787838 }
788839 }
789840 * p = '\0' ;
@@ -1217,7 +1268,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12171268 {
12181269 const char * start = s + 1 ;
12191270 const char * endBrace = start ;
1220- unsigned int uiValue ;
1271+ Py_UCS4 value ;
12211272 unsigned long j ;
12221273
12231274 /* look for either the closing brace, or we
@@ -1248,25 +1299,25 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12481299 }
12491300 goto ucnFallthrough ;
12501301 }
1251- uiValue = ((_Py_UnicodeCharacterName * )
1252- (pucnHash -> getValue (j )))-> uiValue ;
1253- if (uiValue < 1 <<16 )
1302+ value = ((_Py_UnicodeCharacterName * )
1303+ (pucnHash -> getValue (j )))-> value ;
1304+ if (value < 1 <<16 )
12541305 {
12551306 /* In UCS-2 range, easy solution.. */
1256- * p ++ = uiValue ;
1307+ * p ++ = value ;
12571308 }
12581309 else
12591310 {
12601311 /* Oops, its in UCS-4 space, */
12611312 /* compute and append the two surrogates: */
12621313 /* translate from 10000..10FFFF to 0..FFFFF */
1263- uiValue -= 0x10000 ;
1314+ value -= 0x10000 ;
12641315
12651316 /* high surrogate = top 10 bits added to D800 */
1266- * p ++ = 0xD800 + (uiValue >> 10 );
1317+ * p ++ = 0xD800 + (value >> 10 );
12671318
12681319 /* low surrogate = bottom 10 bits added to DC00 */
1269- * p ++ = 0xDC00 + (uiValue & ~0xFC00 );
1320+ * p ++ = 0xDC00 + (value & ~0xFC00 );
12701321 }
12711322 s = endBrace + 1 ;
12721323 }
@@ -3091,12 +3142,12 @@ unicode_center(PyUnicodeObject *self, PyObject *args)
30913142/* gleaned from: */
30923143/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
30933144
3094- static unsigned long utf16Fixup [32 ] =
3145+ static short utf16Fixup [32 ] =
30953146{
30963147 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
30973148 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
30983149 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
3099- 0 , 0 , 0 , 0x2000 , 0xf800 , 0xf800 , 0xf800 , 0xf800
3150+ 0 , 0 , 0 , 0x2000 , -0x800 , -0x800 , -0x800 , -0x800
31003151};
31013152
31023153static int
@@ -3111,7 +3162,7 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
31113162 len2 = str2 -> length ;
31123163
31133164 while (len1 > 0 && len2 > 0 ) {
3114- unsigned long c1 , c2 ;
3165+ Py_UNICODE c1 , c2 ;
31153166 long diff ;
31163167
31173168 c1 = * s1 ++ ;
0 commit comments