@@ -944,8 +944,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
944944/* --- UTF-16 Codec ------------------------------------------------------- */
945945
/* utf16_decoding_error: error helper used by the UTF-16 decoder.  This
   change drops the `source` parameter, leaving (dest, errors, details).
   The body is elided at the hunk boundary below, so its behavior is not
   described here; the visible callers treat a nonzero return value as
   "stop decoding and fail". */
946946static
947- int utf16_decoding_error (const Py_UCS2 * * source ,
948- Py_UNICODE * * dest ,
947+ int utf16_decoding_error (Py_UNICODE * * dest ,
949948 const char * errors ,
950949 const char * details )
951950{
@@ -975,23 +974,29 @@ int utf16_decoding_error(const Py_UCS2 **source,
975974 }
976975}
977976
/* PyUnicode_DecodeUTF16: decode `size` bytes of UTF-16 data starting at `s`
   into the buffer of a unicode object (written through unicode->str).

   s         - raw input bytes
   size      - number of input bytes; an odd count is reported as
               "truncated data"
   errors    - error-handling mode, forwarded to utf16_decoding_error()
   byteorder - may be NULL; when non-NULL its value seeds `bo`
               (0 = native order with BOM detection, -1 = little-endian,
               1 = big-endian)

   Returns NULL on the visible failure paths.  Several stretches of this
   function are elided at the "@@" hunk boundaries, so the notes below
   describe only what the visible lines show.

   The rewritten ("+") version stops casting the input to Py_UCS2 units.
   It walks the input as unsigned bytes and assembles each 16-bit unit as
   (q[ihi] << 8) | q[ilo], so byte order is handled by choosing the two
   offsets once instead of swapping every unit. */
978- PyObject * PyUnicode_DecodeUTF16 (const char * s ,
979- int size ,
980- const char * errors ,
981- int * byteorder )
977+ PyObject *
978+ PyUnicode_DecodeUTF16 (const char * s ,
979+ int size ,
980+ const char * errors ,
981+ int * byteorder )
982982{
983983 PyUnicodeObject * unicode ;
984984 Py_UNICODE * p ;
985- const Py_UCS2 * q , * e ;
986- int bo = 0 ;
985+ const unsigned char * q , * e ;
986+ int bo = 0 ; /* assume native ordering by default */
987987 const char * errmsg = "" ;
988+ /* Offsets from q for retrieving byte pairs in the right order. */
989+ #ifdef BYTEORDER_IS_LITTLE_ENDIAN
990+ int ihi = 1 , ilo = 0 ;
991+ #else
992+ int ihi = 0 , ilo = 1 ;
993+ #endif
988994
989995 /* size should be an even number */
990- if (size % sizeof (Py_UCS2 ) != 0 ) {
991- if (utf16_decoding_error (NULL , NULL , errors , "truncated data" ))
992- return NULL ;
993- /* The remaining input chars are ignored if we fall through
994- here... */
/* New behavior: if the error helper returns 0 for the odd-length input,
   the trailing byte is dropped explicitly by decrementing size.
   NOTE(review): dest is passed as NULL here and the helper's body is not
   visible in this diff; confirm it tolerates a NULL dest in every
   error mode. */
996+ if (size & 1 ) {
997+ if (utf16_decoding_error (NULL , errors , "truncated data" ))
998+ return NULL ;
999+ -- size ; /* else ignore the oddball byte */
9951000 }
9961001
9971002 /* Note: size will always be longer than the resulting Unicode
@@ -1004,48 +1009,54 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
10041009
10051010 /* Unpack UTF-16 encoded data */
10061011 p = unicode -> str ;
1007- q = (Py_UCS2 * )s ;
1008- e = q + ( size / sizeof ( Py_UCS2 )) ;
1012+ q = (unsigned char * )s ;
1013+ e = q + size ;
10091014
10101015 if (byteorder )
1011- bo = * byteorder ;
1016+ bo = * byteorder ;
10121017
10131018 /* Check for BOM marks (U+FEFF) in the input and adjust current
10141019 byte order setting accordingly. In native mode, the leading BOM
10151020 mark is skipped, in all other modes, it is copied to the output
10161021 stream as-is (giving a ZWNBSP character). */
10171022 if (bo == 0 ) {
/* bom is assembled with the host's native offsets, so 0xFEFF means the
   data is in host byte order and 0xFFFE means it is byte-swapped; the
   two preprocessor branches map that onto -1 (LE) or 1 (BE).
   NOTE(review): q[0] and q[1] are read unconditionally whenever bo == 0,
   which is only safe if size >= 2 at this point.  The lines elided at the
   preceding hunk boundary are not visible; confirm that empty input
   returns before reaching this read. */
1023+ const Py_UNICODE bom = (q [ihi ] << 8 ) | q [ilo ];
10181024#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1019- if (* q == 0xFEFF ) {
1020- q ++ ;
1025+ if (bom == 0xFEFF ) {
1026+ q += 2 ;
10211027 bo = -1 ;
1022- } else if (* q == 0xFFFE ) {
1023- q ++ ;
1028+ }
1029+ else if (bom == 0xFFFE ) {
1030+ q += 2 ;
10241031 bo = 1 ;
10251032 }
10261033#else
1027- if (* q == 0xFEFF ) {
1028- q ++ ;
1034+ if (bom == 0xFEFF ) {
1035+ q += 2 ;
10291036 bo = 1 ;
1030- } else if (* q == 0xFFFE ) {
1031- q ++ ;
1037+ }
1038+ else if (bom == 0xFFFE ) {
1039+ q += 2 ;
10321040 bo = -1 ;
10331041 }
10341042#endif
10351043 }
1036-
1044+
/* Pin the byte offsets to the explicit order chosen by the caller or by
   BOM detection.  When bo is still 0 the native default offsets set at
   the top of the function remain in effect. */
1045+ if (bo == -1 ) {
1046+ /* force LE */
1047+ ihi = 1 ;
1048+ ilo = 0 ;
1049+ }
1050+ else if (bo == 1 ) {
1051+ /* force BE */
1052+ ihi = 0 ;
1053+ ilo = 1 ;
1054+ }
1055+
10371056 while (q < e ) {
1038- register Py_UCS2 ch = * q ++ ;
1057+ Py_UNICODE ch = (q [ihi ] << 8 ) | q [ilo ];
1058+ q += 2 ;
10391059
1040- /* Swap input bytes if needed. (This assumes
1041- sizeof(Py_UNICODE) == 2 !) */
1042- #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1043- if (bo == 1 )
1044- ch = (ch >> 8 ) | (ch << 8 );
1045- #else
1046- if (bo == -1 )
1047- ch = (ch >> 8 ) | (ch << 8 );
1048- #endif
10491060 if (ch < 0xD800 || ch > 0xDFFF ) {
10501061 * p ++ = ch ;
10511062 continue ;
@@ -1057,14 +1068,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
10571068 goto utf16Error ;
10581069 }
10591070 if (0xD800 <= ch && ch <= 0xDBFF ) {
1060- Py_UCS2 ch2 = * q ++ ;
1061- #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1062- if (bo == 1 )
1063- ch2 = (ch2 >> 8 ) | (ch2 << 8 );
1064- #else
1065- if (bo == -1 )
1066- ch2 = (ch2 >> 8 ) | (ch2 << 8 );
1067- #endif
/* Second unit of a surrogate pair, read with the same offsets as ch.
   NOTE(review): presumably guarded by an end-of-input check in the
   partly elided lines above (only `goto utf16Error` is visible);
   confirm q < e holds here. */
1071+ Py_UNICODE ch2 = (q [ihi ] << 8 ) | q [ilo ];
1072+ q += 2 ;
10681073 if (0xDC00 <= ch2 && ch2 <= 0xDFFF ) {
10691074#ifndef Py_UNICODE_WIDE
10701075 * p ++ = ch ;
@@ -1084,7 +1089,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
10841089 /* Fall through to report the error */
10851090
10861091 utf16Error :
/* The helper no longer receives the input cursor; only the output
   cursor p is handed over.  A nonzero return aborts via onError. */
1087- if (utf16_decoding_error (& q , & p , errors , errmsg ))
1092+ if (utf16_decoding_error (& p , errors , errmsg ))
10881093 goto onError ;
10891094 }
10901095
@@ -1102,58 +1107,67 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
11021107 return NULL ;
11031108}
11041109
1105- #undef UTF16_ERROR
1106-
/* PyUnicode_EncodeUTF16: encode `size` Py_UNICODE values from `s` as UTF-16
   bytes in a new string object created with PyString_FromStringAndSize().

   s         - input code units
   size      - number of units in s
   errors    - not referenced anywhere in the rewritten body
   byteorder - 0: write a BOM (U+FEFF) first and use native byte order;
               -1: force little-endian; 1: force big-endian

   Returns the string object, or NULL if the allocation fails.

   The rewritten ("+") version writes the output byte by byte through an
   unsigned char pointer.  Byte order is selected once by choosing the
   offsets ihi/ilo, replacing the old Py_UCS2 stores and `doswap` flag. */
1107- PyObject * PyUnicode_EncodeUTF16 (const Py_UNICODE * s ,
1108- int size ,
1109- const char * errors ,
1110- int byteorder )
1110+ PyObject *
1111+ PyUnicode_EncodeUTF16 (const Py_UNICODE * s ,
1112+ int size ,
1113+ const char * errors ,
1114+ int byteorder )
11111115{
11121116 PyObject * v ;
1113- Py_UCS2 * p ;
1114- char * q ;
1115- int i , pairs , doswap = 1 ;
1117+ unsigned char * p ;
1118+ int i , pairs ;
1119+ /* Offsets from p for storing byte pairs in the right order. */
1120+ #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1121+ int ihi = 1 , ilo = 0 ;
1122+ #else
1123+ int ihi = 0 , ilo = 1 ;
1124+ #endif
1125+
/* STORECHAR writes one 16-bit unit at p as two bytes (high byte at
   p[ihi], low byte at p[ilo]) and then advances p by 2.  It is local to
   this function and is #undef'd just before the closing brace. */
1126+ #define STORECHAR (CH ) \
1127+ do { \
1128+ p[ihi] = ((CH) >> 8) & 0xff; \
1129+ p[ilo] = (CH) & 0xff; \
1130+ p += 2; \
1131+ } while(0)
11161132
/* Count values >= 0x10000: each is emitted as a surrogate pair and so
   needs one extra 16-bit unit in the output. */
11171133 for (i = pairs = 0 ; i < size ; i ++ )
11181134 if (s [i ] >= 0x10000 )
11191135 pairs ++ ;
/* Output length in bytes: two per unit, plus one extra unit for the BOM
   when byteorder == 0. */
11201136 v = PyString_FromStringAndSize (NULL ,
1121- sizeof ( Py_UCS2 ) * (size + pairs + (byteorder == 0 )));
1137+ 2 * (size + pairs + (byteorder == 0 )));
11221138 if (v == NULL )
11231139 return NULL ;
11241140
1125- q = PyString_AS_STRING (v );
1126- p = (Py_UCS2 * )q ;
1141+ p = (unsigned char * )PyString_AS_STRING (v );
/* The BOM is stored before the offsets are overridden below, i.e. with
   the native defaults; this branch only runs when byteorder == 0. */
11271142 if (byteorder == 0 )
1128- * p ++ = 0xFEFF ;
1143+ STORECHAR ( 0xFEFF ) ;
/* Empty input: return what has been written so far (just the BOM when
   byteorder == 0). */
11291144 if (size == 0 )
11301145 return v ;
1131- if (byteorder == 0 ||
1132- #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1133- byteorder == -1
1134- #else
1135- byteorder == 1
1136- #endif
1137- )
1138- doswap = 0 ;
1146+
1147+ if (byteorder == -1 ) {
1148+ /* force LE */
1149+ ihi = 1 ;
1150+ ilo = 0 ;
1151+ }
1152+ else if (byteorder == 1 ) {
1153+ /* force BE */
1154+ ihi = 0 ;
1155+ ilo = 1 ;
1156+ }
1157+
11391158 while (size -- > 0 ) {
11401159 Py_UNICODE ch = * s ++ ;
11411160 Py_UNICODE ch2 = 0 ;
/* Split a value >= 0x10000 into a high surrogate (0xD800 | upper bits)
   kept in ch and a low surrogate (0xDC00 | low 10 bits) in ch2.  ch2
   stays 0 otherwise and doubles as the "pair present" flag below. */
11421161 if (ch >= 0x10000 ) {
1143- ch2 = 0xDC00 |((ch - 0x10000 ) & 0x3FF );
1144- ch = 0xD800 |((ch - 0x10000 )>>10 );
1145- }
1146- if (doswap ){
1147- * p ++ = (ch >> 8 ) | (ch << 8 );
1148- if (ch2 )
1149- * p ++ = (ch2 >> 8 ) | (ch2 << 8 );
1150- }else {
1151- * p ++ = ch ;
1152- if (ch2 )
1153- * p ++ = ch2 ;
1162+ ch2 = 0xDC00 | ((ch - 0x10000 ) & 0x3FF );
1163+ ch = 0xD800 | ((ch - 0x10000 ) >> 10 );
11541164 }
1165+ STORECHAR (ch );
1166+ if (ch2 )
1167+ STORECHAR (ch2 );
11551168 }
11561169 return v ;
1170+ #undef STORECHAR
11571171}
11581172
11591173PyObject * PyUnicode_AsUTF16String (PyObject * unicode )
0 commit comments