@@ -771,13 +771,17 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
771771 ch = ((s [0 ] & 0x7 ) << 18 ) + ((s [1 ] & 0x3f ) << 12 ) +
772772 ((s [2 ] & 0x3f ) << 6 ) + (s [3 ] & 0x3f );
773773 /* validate and convert to UTF-16 */
774- if ((ch < 0x10000 ) || /* minimum value allowed for 4
774+ if ((ch < 0x10000 ) /* minimum value allowed for 4
775775 byte encoding */
776- (ch > 0x10ffff )) { /* maximum value allowed for
776+ || (ch > 0x10ffff )) /* maximum value allowed for
777777 UTF-16 */
778+ {
778779 errmsg = "illegal encoding" ;
779780 goto utf8Error ;
780781 }
782+ #if Py_UNICODE_SIZE == 4
783+ * p ++ = (Py_UNICODE )ch ;
784+ #else
781785 /* compute and append the two surrogates: */
782786
783787 /* translate from 10000..10FFFF to 0..FFFF */
@@ -788,6 +792,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
788792
789793 /* low surrogate = bottom 10 bits added to DC00 */
790794 * p ++ = (Py_UNICODE )(0xDC00 + (ch & 0x03FF ));
795+ #endif
791796 break ;
792797
793798 default :
@@ -878,7 +883,13 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
878883 * p ++ = 0x80 | (ch & 0x3f );
879884 cbWritten += 2 ;
880885 }
881- else {
886+ else if (ch < 0x10000 ) {
887+ #if Py_UNICODE_SIZE == 4
888+ * p ++ = 0xe0 | (ch >>12 );
889+ * p ++ = 0x80 | ((ch >>6 ) & 0x3f );
890+ * p ++ = 0x80 | (ch & 0x3f );
891+ cbWritten += 3 ;
892+ #else
882893 /* Check for high surrogate */
883894 if (0xD800 <= ch && ch <= 0xDBFF ) {
884895 if (i != size ) {
@@ -909,7 +920,14 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
909920 }
910921 * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
911922 * p ++ = (char )(0x80 | (ch & 0x3f ));
912- }
923+ #endif
924+ } else {
925+ * p ++ = 0xf0 | (ch >>18 );
926+ * p ++ = 0x80 | ((ch >>12 ) & 0x3f );
927+ * p ++ = 0x80 | ((ch >>6 ) & 0x3f );
928+ * p ++ = 0x80 | (ch & 0x3f );
929+ cbWritten += 4 ;
930+ }
913931 }
914932 * p = '\0' ;
915933 if (_PyString_Resize (& v , p - q ))
@@ -935,7 +953,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
935953/* --- UTF-16 Codec ------------------------------------------------------- */
936954
937955static
938- int utf16_decoding_error (const Py_UNICODE * * source ,
956+ int utf16_decoding_error (const Py_UCS2 * * source ,
939957 Py_UNICODE * * dest ,
940958 const char * errors ,
941959 const char * details )
@@ -973,12 +991,12 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
973991{
974992 PyUnicodeObject * unicode ;
975993 Py_UNICODE * p ;
976- const Py_UNICODE * q , * e ;
994+ const Py_UCS2 * q , * e ;
977995 int bo = 0 ;
978996 const char * errmsg = "" ;
979997
980998 /* size should be an even number */
981- if (size % sizeof (Py_UNICODE ) != 0 ) {
999+ if (size % sizeof (Py_UCS2 ) != 0 ) {
9821000 if (utf16_decoding_error (NULL , NULL , errors , "truncated data" ))
9831001 return NULL ;
9841002 /* The remaining input chars are ignored if we fall through
@@ -995,8 +1013,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
9951013
9961014 /* Unpack UTF-16 encoded data */
9971015 p = unicode -> str ;
998- q = (Py_UNICODE * )s ;
999- e = q + (size / sizeof (Py_UNICODE ));
1016+ q = (Py_UCS2 * )s ;
1017+ e = q + (size / sizeof (Py_UCS2 ));
10001018
10011019 if (byteorder )
10021020 bo = * byteorder ;
@@ -1026,7 +1044,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
10261044 }
10271045
10281046 while (q < e ) {
1029- register Py_UNICODE ch = * q ++ ;
1047+ register Py_UCS2 ch = * q ++ ;
10301048
10311049 /* Swap input bytes if needed. (This assumes
10321050 sizeof(Py_UCS2) == 2 !) */
@@ -1048,17 +1066,33 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
10481066 goto utf16Error ;
10491067 }
10501068 if (0xDC00 <= * q && * q <= 0xDFFF ) {
1051- q ++ ;
1052- if (0xD800 <= * q && * q <= 0xDBFF ) {
1069+ Py_UCS2 ch2 = * q ++ ;
1070+ #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1071+ if (bo == 1 )
1072+ ch = (ch >> 8 ) | (ch << 8 );
1073+ #else
1074+ if (bo == -1 )
1075+ ch = (ch >> 8 ) | (ch << 8 );
1076+ #endif
1077+ if (0xD800 <= ch && ch <= 0xDBFF ) {
1078+ #if Py_UNICODE_SIZE == 2
10531079 /* This is valid data (a UTF-16 surrogate pair), but
10541080 we are not able to store this information since our
10551081 Py_UNICODE type only has 16 bits... this might
10561082 change someday, even though it's unlikely. */
10571083 errmsg = "code pairs are not supported" ;
10581084 goto utf16Error ;
1059- }
1060- else
1085+ #else
1086+ * p ++ = ((( ch & 0x3FF )<< 10 ) | ( ch2 & 0x3FF )) + 0x10000 ;
10611087 continue ;
1088+ #endif
1089+
1090+ }
1091+ else {
1092+ errmsg = "illegal UTF-16 surrogate" ;
1093+ goto utf16Error ;
1094+ }
1095+
10621096 }
10631097 errmsg = "illegal encoding" ;
10641098 /* Fall through to report the error */
@@ -1090,17 +1124,20 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
10901124 int byteorder )
10911125{
10921126 PyObject * v ;
1093- Py_UNICODE * p ;
1127+ Py_UCS2 * p ;
10941128 char * q ;
1129+ int i , pairs , doswap = 1 ;
10951130
1096- /* We don't create UTF-16 pairs... */
1131+ for (i = pairs = 0 ; i < size ; i ++ )
1132+ if (s [i ] >= 0x10000 )
1133+ pairs ++ ;
10971134 v = PyString_FromStringAndSize (NULL ,
1098- sizeof (Py_UNICODE ) * (size + (byteorder == 0 )));
1135+ sizeof (Py_UCS2 ) * (size + pairs + (byteorder == 0 )));
10991136 if (v == NULL )
11001137 return NULL ;
11011138
11021139 q = PyString_AS_STRING (v );
1103- p = (Py_UNICODE * )q ;
1140+ p = (Py_UCS2 * )q ;
11041141 if (byteorder == 0 )
11051142 * p ++ = 0xFEFF ;
11061143 if (size == 0 )
@@ -1112,12 +1149,24 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
11121149 byteorder == 1
11131150#endif
11141151 )
1115- Py_UNICODE_COPY (p , s , size );
1116- else
1117- while (size -- > 0 ) {
1118- Py_UNICODE ch = * s ++ ;
1152+ doswap = 0 ;
1153+ while (size -- > 0 ) {
1154+ Py_UNICODE ch = * s ++ ;
1155+ Py_UNICODE ch2 = 0 ;
1156+ if (ch >= 0x10000 ) {
1157+ ch2 = 0xDC00 |((ch - 0x10000 ) & 0x3FF );
1158+ ch = 0xD800 |((ch - 0x10000 )>>10 );
1159+ }
1160+ if (doswap ){
11191161 * p ++ = (ch >> 8 ) | (ch << 8 );
1162+ if (ch2 )
1163+ * p ++ = (ch2 >> 8 ) | (ch2 << 8 );
1164+ }else {
1165+ * p ++ = ch ;
1166+ if (ch2 )
1167+ * p ++ = ch2 ;
11201168 }
1169+ }
11211170 return v ;
11221171}
11231172
@@ -1271,10 +1320,14 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12711320 /* UCS-2 character */
12721321 * p ++ = (Py_UNICODE ) chr ;
12731322 else if (chr <= 0x10ffff ) {
1274- /* UCS-4 character. store as two surrogate characters */
1323+ /* UCS-4 character. Either store directly, or as surrogate pair. */
1324+ #if Py_UNICODE_SIZE == 4
1325+ * p ++ = chr ;
1326+ #else
12751327 chr -= 0x10000L ;
12761328 * p ++ = 0xD800 + (Py_UNICODE ) (chr >> 10 );
12771329 * p ++ = 0xDC00 + (Py_UNICODE ) (chr & 0x03FF );
1330+ #endif
12781331 } else {
12791332 if (unicodeescape_decoding_error (
12801333 & s , & x , errors ,
@@ -1383,6 +1436,19 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
13831436 * p ++ = '\\' ;
13841437 * p ++ = (char ) ch ;
13851438 }
1439+ /* Map 21-bit characters to '\U00xxxxxx' */
1440+ else if (ch >= 0x10000 ) {
1441+ * p ++ = '\\' ;
1442+ * p ++ = 'U' ;
1443+ * p ++ = hexdigit [(ch >> 28 ) & 0xf ];
1444+ * p ++ = hexdigit [(ch >> 24 ) & 0xf ];
1445+ * p ++ = hexdigit [(ch >> 20 ) & 0xf ];
1446+ * p ++ = hexdigit [(ch >> 16 ) & 0xf ];
1447+ * p ++ = hexdigit [(ch >> 12 ) & 0xf ];
1448+ * p ++ = hexdigit [(ch >> 8 ) & 0xf ];
1449+ * p ++ = hexdigit [(ch >> 4 ) & 0xf ];
1450+ * p ++ = hexdigit [ch & 15 ];
1451+ }
13861452 /* Map 16-bit characters to '\uxxxx' */
13871453 else if (ch >= 256 ) {
13881454 * p ++ = '\\' ;
@@ -5281,13 +5347,6 @@ void _PyUnicode_Init(void)
52815347{
52825348 int i ;
52835349
5284- /* Doublecheck the configuration... */
5285- #ifndef USE_UCS4_STORAGE
5286- if (sizeof (Py_UNICODE ) != 2 )
5287- Py_FatalError ("Unicode configuration error: "
5288- "sizeof(Py_UNICODE) != 2 bytes" );
5289- #endif
5290-
52915350 /* Init the implementation */
52925351 unicode_freelist = NULL ;
52935352 unicode_freelist_size = 0 ;
0 commit comments