Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 324ac65

Browse files
committed
#5127: Even on narrow unicode builds, the C functions that access the Unicode
Database (Py_UNICODE_TOLOWER, Py_UNICODE_ISDECIMAL, and others) now accept and return characters from the full Unicode range (Py_UCS4).
The differences from Python code are few:
- unicodedata.numeric(), unicodedata.decimal() and unicodedata.digit() now return the correct value for large code points
- repr() may consider more characters as printable.
1 parent 36e778e commit 324ac65

7 files changed

Lines changed: 69 additions & 232 deletions

File tree

Include/unicodeobject.h

Lines changed: 22 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -221,24 +221,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
221221
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
222222
# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
223223
# define _PyUnicode_Init _PyUnicodeUCS2_Init
224-
# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
225-
# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
226-
# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
227-
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
228-
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
229-
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
230-
# define _PyUnicode_IsPrintable _PyUnicodeUCS2_IsPrintable
231-
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
232-
# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
233-
# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
234-
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
235-
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
236-
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
237-
# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
238-
# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
239-
# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
240-
# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
241-
# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
242224

243225
#else
244226

@@ -322,24 +304,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
322304
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
323305
# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
324306
# define _PyUnicode_Init _PyUnicodeUCS4_Init
325-
# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
326-
# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
327-
# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
328-
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
329-
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
330-
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
331-
# define _PyUnicode_IsPrintable _PyUnicodeUCS4_IsPrintable
332-
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
333-
# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
334-
# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
335-
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
336-
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
337-
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
338-
# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
339-
# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
340-
# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
341-
# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
342-
# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
343307

344308

345309
#endif
@@ -351,7 +315,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
351315
configure Python using --with-wctype-functions. This reduces the
352316
interpreter's code size. */
353317

354-
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
318+
#if defined(Py_UNICODE_WIDE) && defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
355319

356320
#include <wctype.h>
357321

@@ -1542,75 +1506,75 @@ PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
15421506
*/
15431507

15441508
PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1545-
Py_UNICODE ch /* Unicode character */
1509+
Py_UCS4 ch /* Unicode character */
15461510
);
15471511

15481512
PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1549-
Py_UNICODE ch /* Unicode character */
1513+
Py_UCS4 ch /* Unicode character */
15501514
);
15511515

15521516
PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1553-
Py_UNICODE ch /* Unicode character */
1517+
Py_UCS4 ch /* Unicode character */
15541518
);
15551519

15561520
PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1557-
Py_UNICODE ch /* Unicode character */
1521+
Py_UCS4 ch /* Unicode character */
15581522
);
15591523

15601524
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1561-
Py_UNICODE ch /* Unicode character */
1525+
Py_UCS4 ch /* Unicode character */
15621526
);
15631527

15641528
PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1565-
const Py_UNICODE ch /* Unicode character */
1529+
const Py_UCS4 ch /* Unicode character */
15661530
);
15671531

15681532
PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1569-
const Py_UNICODE ch /* Unicode character */
1533+
const Py_UCS4 ch /* Unicode character */
15701534
);
15711535

1572-
PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1573-
Py_UNICODE ch /* Unicode character */
1536+
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1537+
Py_UCS4 ch /* Unicode character */
15741538
);
15751539

1576-
PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1577-
Py_UNICODE ch /* Unicode character */
1540+
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1541+
Py_UCS4 ch /* Unicode character */
15781542
);
15791543

1580-
PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1581-
Py_UNICODE ch /* Unicode character */
1544+
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1545+
Py_UCS4 ch /* Unicode character */
15821546
);
15831547

15841548
PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1585-
Py_UNICODE ch /* Unicode character */
1549+
Py_UCS4 ch /* Unicode character */
15861550
);
15871551

15881552
PyAPI_FUNC(int) _PyUnicode_ToDigit(
1589-
Py_UNICODE ch /* Unicode character */
1553+
Py_UCS4 ch /* Unicode character */
15901554
);
15911555

15921556
PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1593-
Py_UNICODE ch /* Unicode character */
1557+
Py_UCS4 ch /* Unicode character */
15941558
);
15951559

15961560
PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1597-
Py_UNICODE ch /* Unicode character */
1561+
Py_UCS4 ch /* Unicode character */
15981562
);
15991563

16001564
PyAPI_FUNC(int) _PyUnicode_IsDigit(
1601-
Py_UNICODE ch /* Unicode character */
1565+
Py_UCS4 ch /* Unicode character */
16021566
);
16031567

16041568
PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1605-
Py_UNICODE ch /* Unicode character */
1569+
Py_UCS4 ch /* Unicode character */
16061570
);
16071571

16081572
PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1609-
Py_UNICODE ch /* Unicode character */
1573+
Py_UCS4 ch /* Unicode character */
16101574
);
16111575

16121576
PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1613-
Py_UNICODE ch /* Unicode character */
1577+
Py_UCS4 ch /* Unicode character */
16141578
);
16151579

16161580
PyAPI_FUNC(size_t) Py_UNICODE_strlen(

Lib/test/test_unicode.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1353,6 +1353,10 @@ def __repr__(self):
13531353
self.assertEqual(repr(s1()), '\\n')
13541354
self.assertEqual(repr(s2()), '\\n')
13551355

1356+
def test_printable_repr(self):
1357+
self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
1358+
self.assertEqual(repr('\U00011000'), "'\\U00011000'") # nonprintable
1359+
13561360
def test_expandtabs_overflows_gracefully(self):
13571361
# This test only affects 32-bit platforms because expandtabs can only take
13581362
# an int as the max value, not a 64-bit C long. If expandtabs is changed

Lib/test/test_unicodedata.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,12 @@ def test_linebreak_7643(self):
294294
self.assertEqual(len(lines), 1,
295295
r"\u%.4x should not be a linebreak" % i)
296296

297+
def test_UCS4(self):
298+
# unicodedata should work with code points outside the BMP
299+
# even on a narrow Unicode build
300+
self.assertEqual(self.db.category(u"\U0001012A"), "No")
301+
self.assertEqual(self.db.numeric(u"\U0001012A"), 9000)
302+
297303
def test_main():
298304
test.support.run_unittest(
299305
UnicodeMiscTest,

Misc/NEWS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ What's New in Python 3.2 Alpha 2?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #5127: The C functions that access the Unicode Database now accept and
16+
return characters from the full Unicode range, even on narrow unicode builds
17+
(Py_UNICODE_TOLOWER, Py_UNICODE_ISDECIMAL, and others). A visible difference
18+
in Python is that unicodedata.numeric() now returns the correct value for
19+
large code points, and repr() may consider more characters as printable.
20+
1521
- Issue #9425: Create PyModule_GetFilenameObject() function to get the filename
1622
as a unicode object, instead of a byte string. Function needed to support
1723
unencodable filenames. Deprecate PyModule_GetFilename() in favor of the new

Objects/unicodectype.c

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@
2626
#define NUMERIC_MASK 0x1000
2727

2828
typedef struct {
29-
const Py_UNICODE upper;
30-
const Py_UNICODE lower;
31-
const Py_UNICODE title;
29+
const Py_UCS4 upper;
30+
const Py_UCS4 lower;
31+
const Py_UCS4 title;
3232
const unsigned char decimal;
3333
const unsigned char digit;
3434
const unsigned short flags;
@@ -37,15 +37,13 @@ typedef struct {
3737
#include "unicodetype_db.h"
3838

3939
static const _PyUnicode_TypeRecord *
40-
gettyperecord(Py_UNICODE code)
40+
gettyperecord(Py_UCS4 code)
4141
{
4242
int index;
4343

44-
#ifdef Py_UNICODE_WIDE
4544
if (code >= 0x110000)
4645
index = 0;
4746
else
48-
#endif
4947
{
5048
index = index1[(code>>SHIFT)];
5149
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
@@ -57,7 +55,7 @@ gettyperecord(Py_UNICODE code)
5755
/* Returns the titlecase Unicode character corresponding to ch or just
5856
ch if no titlecase mapping is known. */
5957

60-
Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
58+
Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
6159
{
6260
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
6361
int delta = ctype->title;
@@ -74,7 +72,7 @@ Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
7472
/* Returns 1 for Unicode characters having the category 'Lt', 0
7573
otherwise. */
7674

77-
int _PyUnicode_IsTitlecase(Py_UNICODE ch)
75+
int _PyUnicode_IsTitlecase(Py_UCS4 ch)
7876
{
7977
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
8078

@@ -84,7 +82,7 @@ int _PyUnicode_IsTitlecase(Py_UNICODE ch)
8482
/* Returns 1 for Unicode characters having the XID_Start property, 0
8583
otherwise. */
8684

87-
int _PyUnicode_IsXidStart(Py_UNICODE ch)
85+
int _PyUnicode_IsXidStart(Py_UCS4 ch)
8886
{
8987
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
9088

@@ -94,7 +92,7 @@ int _PyUnicode_IsXidStart(Py_UNICODE ch)
9492
/* Returns 1 for Unicode characters having the XID_Continue property,
9593
0 otherwise. */
9694

97-
int _PyUnicode_IsXidContinue(Py_UNICODE ch)
95+
int _PyUnicode_IsXidContinue(Py_UCS4 ch)
9896
{
9997
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
10098

@@ -104,14 +102,14 @@ int _PyUnicode_IsXidContinue(Py_UNICODE ch)
104102
/* Returns the integer decimal (0-9) for Unicode characters having
105103
this property, -1 otherwise. */
106104

107-
int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
105+
int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
108106
{
109107
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
110108

111109
return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
112110
}
113111

114-
int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
112+
int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
115113
{
116114
if (_PyUnicode_ToDecimalDigit(ch) < 0)
117115
return 0;
@@ -121,14 +119,14 @@ int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
121119
/* Returns the integer digit (0-9) for Unicode characters having
122120
this property, -1 otherwise. */
123121

124-
int _PyUnicode_ToDigit(Py_UNICODE ch)
122+
int _PyUnicode_ToDigit(Py_UCS4 ch)
125123
{
126124
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
127125

128126
return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
129127
}
130128

131-
int _PyUnicode_IsDigit(Py_UNICODE ch)
129+
int _PyUnicode_IsDigit(Py_UCS4 ch)
132130
{
133131
if (_PyUnicode_ToDigit(ch) < 0)
134132
return 0;
@@ -138,7 +136,7 @@ int _PyUnicode_IsDigit(Py_UNICODE ch)
138136
/* Returns the numeric value as double for Unicode characters having
139137
this property, -1.0 otherwise. */
140138

141-
int _PyUnicode_IsNumeric(Py_UNICODE ch)
139+
int _PyUnicode_IsNumeric(Py_UCS4 ch)
142140
{
143141
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
144142

@@ -158,7 +156,7 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
158156
* Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
159157
* Zs (Separator, Space) other than ASCII space('\x20').
160158
*/
161-
int _PyUnicode_IsPrintable(Py_UNICODE ch)
159+
int _PyUnicode_IsPrintable(Py_UCS4 ch)
162160
{
163161
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
164162

@@ -170,7 +168,7 @@ int _PyUnicode_IsPrintable(Py_UNICODE ch)
170168
/* Returns 1 for Unicode characters having the category 'Ll', 0
171169
otherwise. */
172170

173-
int _PyUnicode_IsLowercase(Py_UNICODE ch)
171+
int _PyUnicode_IsLowercase(Py_UCS4 ch)
174172
{
175173
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
176174

@@ -180,7 +178,7 @@ int _PyUnicode_IsLowercase(Py_UNICODE ch)
180178
/* Returns 1 for Unicode characters having the category 'Lu', 0
181179
otherwise. */
182180

183-
int _PyUnicode_IsUppercase(Py_UNICODE ch)
181+
int _PyUnicode_IsUppercase(Py_UCS4 ch)
184182
{
185183
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
186184

@@ -190,7 +188,7 @@ int _PyUnicode_IsUppercase(Py_UNICODE ch)
190188
/* Returns the uppercase Unicode character corresponding to ch or just
191189
ch if no uppercase mapping is known. */
192190

193-
Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
191+
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
194192
{
195193
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
196194
int delta = ctype->upper;
@@ -204,7 +202,7 @@ Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
204202
/* Returns the lowercase Unicode character corresponding to ch or just
205203
ch if no lowercase mapping is known. */
206204

207-
Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
205+
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
208206
{
209207
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
210208
int delta = ctype->lower;
@@ -218,7 +216,7 @@ Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
218216
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
219217
'Lo' or 'Lm', 0 otherwise. */
220218

221-
int _PyUnicode_IsAlpha(Py_UNICODE ch)
219+
int _PyUnicode_IsAlpha(Py_UCS4 ch)
222220
{
223221
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
224222

@@ -230,27 +228,27 @@ int _PyUnicode_IsAlpha(Py_UNICODE ch)
230228
/* Export the interfaces using the wchar_t type for portability
231229
reasons: */
232230

233-
int _PyUnicode_IsLowercase(Py_UNICODE ch)
231+
int _PyUnicode_IsLowercase(Py_UCS4 ch)
234232
{
235233
return iswlower(ch);
236234
}
237235

238-
int _PyUnicode_IsUppercase(Py_UNICODE ch)
236+
int _PyUnicode_IsUppercase(Py_UCS4 ch)
239237
{
240238
return iswupper(ch);
241239
}
242240

243-
Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
241+
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
244242
{
245243
return towlower(ch);
246244
}
247245

248-
Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
246+
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
249247
{
250248
return towupper(ch);
251249
}
252250

253-
int _PyUnicode_IsAlpha(Py_UNICODE ch)
251+
int _PyUnicode_IsAlpha(Py_UCS4 ch)
254252
{
255253
return iswalpha(ch);
256254
}

0 commit comments

Comments
 (0)