Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0ba70cc

Browse files
committed
Support using UCS-4 as the Py_UNICODE type:
Add configure option --enable-unicode. Add config.h macros Py_USING_UNICODE, PY_UNICODE_TYPE, Py_UNICODE_SIZE, SIZEOF_WCHAR_T. Define Py_UCS2. Encode and decode large UTF-8 characters into single Py_UNICODE values for wide Unicode types; likewise for UTF-16. Remove test whether sizeof Py_UNICODE is two.
1 parent ff1cc90 commit 0ba70cc

7 files changed

Lines changed: 666 additions & 472 deletions

File tree

Include/unicodeobject.h

Lines changed: 23 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -60,28 +60,26 @@ Copyright (c) Corporation for National Research Initiatives.
6060

6161
/* experimental UCS-4 support. enable at your own risk! */
6262
#undef USE_UCS4_STORAGE
63-
64-
/*
65-
* Use this typedef when you need to represent a UTF-16 surrogate pair
66-
* as single unsigned integer.
67-
*/
68-
#if SIZEOF_INT >= 4
69-
typedef unsigned int Py_UCS4;
70-
#elif SIZEOF_LONG >= 4
71-
typedef unsigned long Py_UCS4;
72-
#endif
63+
#if Py_UNICODE_SIZE == 4
64+
#define USE_UCS4_STORAGE
65+
#endif
7366

7467
/* Set these flags if the platform has "wchar.h", "wctype.h" and the
7568
wchar_t type is a 16-bit unsigned type */
7669
/* #define HAVE_WCHAR_H */
7770
/* #define HAVE_USABLE_WCHAR_T */
7871

7972
/* Defaults for various platforms */
80-
#ifndef HAVE_USABLE_WCHAR_T
73+
#ifndef PY_UNICODE_TYPE
8174

8275
/* Windows has a usable wchar_t type (unless we're using UCS-4) */
8376
# if defined(MS_WIN32) && !defined(USE_UCS4_STORAGE)
8477
# define HAVE_USABLE_WCHAR_T
78+
# define PY_UNICODE_TYPE wchar_t
79+
# endif
80+
81+
# if defined(USE_UCS4_STORAGE)
82+
# define PY_UNICODE_TYPE Py_UCS4
8583
# endif
8684

8785
#endif
@@ -104,28 +102,23 @@ typedef unsigned long Py_UCS4;
104102
# include "wchar.h"
105103
#endif
106104

107-
#ifdef HAVE_USABLE_WCHAR_T
108-
109-
/* If the compiler defines wchar_t as a 16-bit unsigned type we can
110-
use the compiler type directly. Works fine with all modern Windows
111-
platforms. */
112-
113-
typedef wchar_t Py_UNICODE;
114-
115-
#else
116-
117-
/* Use if you have a standard ANSI compiler, without wchar_t support.
118-
If a short is not 16 bits on your platform, you have to fix the
119-
typedef below, or the module initialization code will complain. */
120-
121-
#ifdef USE_UCS4_STORAGE
122-
typedef Py_UCS4 Py_UNICODE;
123-
#else
124-
typedef unsigned short Py_UNICODE;
105+
/*
106+
* Use this typedef when you need to represent a UTF-16 surrogate pair
107+
* as a single unsigned integer.
108+
*/
109+
#if SIZEOF_INT >= 4
110+
typedef unsigned int Py_UCS4;
111+
#elif SIZEOF_LONG >= 4
112+
typedef unsigned long Py_UCS4;
125113
#endif
126114

127-
#endif
115+
#if SIZEOF_SHORT == 2
116+
typedef unsigned short Py_UCS2;
117+
#else
118+
#error Cannot find a two-byte type
119+
#endif
128120

121+
typedef PY_UNICODE_TYPE Py_UNICODE;
129122

130123
/* --- Internal Unicode Operations ---------------------------------------- */
131124

Objects/unicodeobject.c

Lines changed: 89 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -771,13 +771,17 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
771771
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
772772
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
773773
/* validate and convert to UTF-16 */
774-
if ((ch < 0x10000) || /* minimum value allowed for 4
774+
if ((ch < 0x10000) /* minimum value allowed for 4
775775
byte encoding */
776-
(ch > 0x10ffff)) { /* maximum value allowed for
776+
|| (ch > 0x10ffff)) /* maximum value allowed for
777777
UTF-16 */
778+
{
778779
errmsg = "illegal encoding";
779780
goto utf8Error;
780781
}
782+
#if Py_UNICODE_SIZE == 4
783+
*p++ = (Py_UNICODE)ch;
784+
#else
781785
/* compute and append the two surrogates: */
782786

783787
/* translate from 10000..10FFFF to 0..FFFF */
@@ -788,6 +792,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
788792

789793
/* low surrogate = bottom 10 bits added to DC00 */
790794
*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
795+
#endif
791796
break;
792797

793798
default:
@@ -878,7 +883,13 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
878883
*p++ = 0x80 | (ch & 0x3f);
879884
cbWritten += 2;
880885
}
881-
else {
886+
else if (ch < 0x10000) {
887+
#if Py_UNICODE_SIZE == 4
888+
*p++ = 0xe0 | (ch>>12);
889+
*p++ = 0x80 | ((ch>>6) & 0x3f);
890+
*p++ = 0x80 | (ch & 0x3f);
891+
cbWritten += 3;
892+
#else
882893
/* Check for high surrogate */
883894
if (0xD800 <= ch && ch <= 0xDBFF) {
884895
if (i != size) {
@@ -909,7 +920,14 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
909920
}
910921
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
911922
*p++ = (char)(0x80 | (ch & 0x3f));
912-
}
923+
#endif
924+
} else {
925+
*p++ = 0xf0 | (ch>>18);
926+
*p++ = 0x80 | ((ch>>12) & 0x3f);
927+
*p++ = 0x80 | ((ch>>6) & 0x3f);
928+
*p++ = 0x80 | (ch & 0x3f);
929+
cbWritten += 4;
930+
}
913931
}
914932
*p = '\0';
915933
if (_PyString_Resize(&v, p - q))
@@ -935,7 +953,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
935953
/* --- UTF-16 Codec ------------------------------------------------------- */
936954

937955
static
938-
int utf16_decoding_error(const Py_UNICODE **source,
956+
int utf16_decoding_error(const Py_UCS2 **source,
939957
Py_UNICODE **dest,
940958
const char *errors,
941959
const char *details)
@@ -973,12 +991,12 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
973991
{
974992
PyUnicodeObject *unicode;
975993
Py_UNICODE *p;
976-
const Py_UNICODE *q, *e;
994+
const Py_UCS2 *q, *e;
977995
int bo = 0;
978996
const char *errmsg = "";
979997

980998
/* size should be an even number */
981-
if (size % sizeof(Py_UNICODE) != 0) {
999+
if (size % sizeof(Py_UCS2) != 0) {
9821000
if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
9831001
return NULL;
9841002
/* The remaining input chars are ignored if we fall through
@@ -995,8 +1013,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
9951013

9961014
/* Unpack UTF-16 encoded data */
9971015
p = unicode->str;
998-
q = (Py_UNICODE *)s;
999-
e = q + (size / sizeof(Py_UNICODE));
1016+
q = (Py_UCS2 *)s;
1017+
e = q + (size / sizeof(Py_UCS2));
10001018

10011019
if (byteorder)
10021020
bo = *byteorder;
@@ -1026,7 +1044,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
10261044
}
10271045

10281046
while (q < e) {
1029-
register Py_UNICODE ch = *q++;
1047+
register Py_UCS2 ch = *q++;
10301048

10311049
/* Swap input bytes if needed. (This assumes
10321050
sizeof(Py_UCS2) == 2 !) */
@@ -1048,17 +1066,33 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
10481066
goto utf16Error;
10491067
}
10501068
if (0xDC00 <= *q && *q <= 0xDFFF) {
1051-
q++;
1052-
if (0xD800 <= *q && *q <= 0xDBFF) {
1069+
Py_UCS2 ch2 = *q++;
1070+
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1071+
if (bo == 1)
1072+
ch = (ch >> 8) | (ch << 8);
1073+
#else
1074+
if (bo == -1)
1075+
ch = (ch >> 8) | (ch << 8);
1076+
#endif
1077+
if (0xD800 <= ch && ch <= 0xDBFF) {
1078+
#if Py_UNICODE_SIZE == 2
10531079
/* This is valid data (a UTF-16 surrogate pair), but
10541080
we are not able to store this information since our
10551081
Py_UNICODE type only has 16 bits... this might
10561082
change someday, even though it's unlikely. */
10571083
errmsg = "code pairs are not supported";
10581084
goto utf16Error;
1059-
}
1060-
else
1085+
#else
1086+
*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
10611087
continue;
1088+
#endif
1089+
1090+
}
1091+
else {
1092+
errmsg = "illegal UTF-16 surrogate";
1093+
goto utf16Error;
1094+
}
1095+
10621096
}
10631097
errmsg = "illegal encoding";
10641098
/* Fall through to report the error */
@@ -1090,17 +1124,20 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
10901124
int byteorder)
10911125
{
10921126
PyObject *v;
1093-
Py_UNICODE *p;
1127+
Py_UCS2 *p;
10941128
char *q;
1129+
int i, pairs, doswap = 1;
10951130

1096-
/* We don't create UTF-16 pairs... */
1131+
for (i = pairs = 0; i < size; i++)
1132+
if (s[i] >= 0x10000)
1133+
pairs++;
10971134
v = PyString_FromStringAndSize(NULL,
1098-
sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1135+
sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
10991136
if (v == NULL)
11001137
return NULL;
11011138

11021139
q = PyString_AS_STRING(v);
1103-
p = (Py_UNICODE *)q;
1140+
p = (Py_UCS2 *)q;
11041141
if (byteorder == 0)
11051142
*p++ = 0xFEFF;
11061143
if (size == 0)
@@ -1112,12 +1149,24 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
11121149
byteorder == 1
11131150
#endif
11141151
)
1115-
Py_UNICODE_COPY(p, s, size);
1116-
else
1117-
while (size-- > 0) {
1118-
Py_UNICODE ch = *s++;
1152+
doswap = 0;
1153+
while (size-- > 0) {
1154+
Py_UNICODE ch = *s++;
1155+
Py_UNICODE ch2 = 0;
1156+
if (ch >= 0x10000) {
1157+
ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
1158+
ch = 0xD800|((ch-0x10000)>>10);
1159+
}
1160+
if (doswap){
11191161
*p++ = (ch >> 8) | (ch << 8);
1162+
if (ch2)
1163+
*p++ = (ch2 >> 8) | (ch2 << 8);
1164+
}else{
1165+
*p++ = ch;
1166+
if(ch2)
1167+
*p++ = ch2;
11201168
}
1169+
}
11211170
return v;
11221171
}
11231172

@@ -1271,10 +1320,14 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12711320
/* UCS-2 character */
12721321
*p++ = (Py_UNICODE) chr;
12731322
else if (chr <= 0x10ffff) {
1274-
/* UCS-4 character. store as two surrogate characters */
1323+
/* UCS-4 character. Either store directly, or as surrogate pair. */
1324+
#if Py_UNICODE_SIZE == 4
1325+
*p++ = chr;
1326+
#else
12751327
chr -= 0x10000L;
12761328
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
12771329
*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1330+
#endif
12781331
} else {
12791332
if (unicodeescape_decoding_error(
12801333
&s, &x, errors,
@@ -1383,6 +1436,19 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
13831436
*p++ = '\\';
13841437
*p++ = (char) ch;
13851438
}
1439+
/* Map 21-bit characters to '\U00xxxxxx' */
1440+
else if (ch >= 0x10000) {
1441+
*p++ = '\\';
1442+
*p++ = 'U';
1443+
*p++ = hexdigit[(ch >> 28) & 0xf];
1444+
*p++ = hexdigit[(ch >> 24) & 0xf];
1445+
*p++ = hexdigit[(ch >> 20) & 0xf];
1446+
*p++ = hexdigit[(ch >> 16) & 0xf];
1447+
*p++ = hexdigit[(ch >> 12) & 0xf];
1448+
*p++ = hexdigit[(ch >> 8) & 0xf];
1449+
*p++ = hexdigit[(ch >> 4) & 0xf];
1450+
*p++ = hexdigit[ch & 15];
1451+
}
13861452
/* Map 16-bit characters to '\uxxxx' */
13871453
else if (ch >= 256) {
13881454
*p++ = '\\';
@@ -5281,13 +5347,6 @@ void _PyUnicode_Init(void)
52815347
{
52825348
int i;
52835349

5284-
/* Doublecheck the configuration... */
5285-
#ifndef USE_UCS4_STORAGE
5286-
if (sizeof(Py_UNICODE) != 2)
5287-
Py_FatalError("Unicode configuration error: "
5288-
"sizeof(Py_UNICODE) != 2 bytes");
5289-
#endif
5290-
52915350
/* Init the implementation */
52925351
unicode_freelist = NULL;
52935352
unicode_freelist_size = 0;

Python/bltinmodule.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,16 @@ builtin_unichr(PyObject *self, PyObject *args)
324324
s[0] = (Py_UNICODE) x;
325325
return PyUnicode_FromUnicode(s, 1);
326326
} else {
327+
#if Py_UNICODE_SIZE == 2
327328
/* UCS-4 character. store as two surrogate characters */
328329
x -= 0x10000L;
329330
s[0] = 0xD800 + (Py_UNICODE) (x >> 10);
330331
s[1] = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
331332
return PyUnicode_FromUnicode(s, 2);
333+
#endif
332334
}
335+
s[0] = (Py_UNICODE)x;
336+
return PyUnicode_FromUnicode(s, 1);
333337
}
334338

335339
static char unichr_doc[] =

acconfig.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,15 @@
104104
/* Define if the compiler provides a wchar.h header file. */
105105
#undef HAVE_WCHAR_H
106106

107+
/* Define if you want to have a Unicode type. */
108+
#undef Py_USING_UNICODE
109+
110+
/* Define as the integral type used for Unicode representation. */
111+
#undef PY_UNICODE_TYPE
112+
113+
/* Define as the size of the unicode type. */
114+
#undef Py_UNICODE_SIZE
115+
107116
/* Define if malloc(0) returns a NULL pointer */
108117
#undef MALLOC_ZERO_RETURNS_NULL
109118

config.h.in

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,15 @@
163163
/* Define if the compiler provides a wchar.h header file. */
164164
#undef HAVE_WCHAR_H
165165

166+
/* Define if you want to have a Unicode type. */
167+
#undef Py_USING_UNICODE
168+
169+
/* Define as the integral type used for Unicode representation. */
170+
#undef PY_UNICODE_TYPE
171+
172+
/* Define as the size of the unicode type. */
173+
#undef Py_UNICODE_SIZE
174+
166175
/* Define if malloc(0) returns a NULL pointer */
167176
#undef MALLOC_ZERO_RETURNS_NULL
168177

@@ -284,6 +293,9 @@
284293
/* The number of bytes in a void *. */
285294
#undef SIZEOF_VOID_P
286295

296+
/* The number of bytes in a wchar_t. */
297+
#undef SIZEOF_WCHAR_T
298+
287299
/* Define if you have the _getpty function. */
288300
#undef HAVE__GETPTY
289301

0 commit comments

Comments
 (0)