Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit b2bf01d

Browse files
committed
use full unicode mappings for upper/lower/title case (#12736)
Also broaden the category of characters that count as lowercase/uppercase.
1 parent 9007f72 commit b2bf01d

11 files changed

Lines changed: 4596 additions & 1719 deletions

File tree

Doc/c-api/unicode.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,16 +318,25 @@ These APIs can be used for fast direct character conversions:
318318
319319
Return the character *ch* converted to lower case.
320320
321+
.. deprecated:: 3.3
322+
This function uses simple case mappings.
323+
321324
322325
.. c:function:: Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)
323326
324327
Return the character *ch* converted to upper case.
325328
329+
.. deprecated:: 3.3
330+
This function uses simple case mappings.
331+
326332
327333
.. c:function:: Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)
328334
329335
Return the character *ch* converted to title case.
330336
337+
.. deprecated:: 3.3
338+
This function uses simple case mappings.
339+
331340
332341
.. c:function:: int Py_UNICODE_TODECIMAL(Py_UNICODE ch)
333342

Doc/library/stdtypes.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1360,7 +1360,8 @@ functions based on regular expressions.
13601360
.. method:: str.swapcase()
13611361

13621362
Return a copy of the string with uppercase characters converted to lowercase and
1363-
vice versa.
1363+
vice versa. Note that it is not necessarily true that
1364+
``s.swapcase().swapcase() == s``.
13641365

13651366

13661367
.. method:: str.title()

Include/unicodeobject.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2008,6 +2008,29 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
20082008
Py_UCS4 ch /* Unicode character */
20092009
);
20102010

2011+
PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2012+
Py_UCS4 ch, /* Unicode character */
2013+
Py_UCS4 *res
2014+
);
2015+
2016+
PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2017+
Py_UCS4 ch, /* Unicode character */
2018+
Py_UCS4 *res
2019+
);
2020+
2021+
PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2022+
Py_UCS4 ch, /* Unicode character */
2023+
Py_UCS4 *res
2024+
);
2025+
2026+
PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
2027+
const Py_UCS4 ch /* Unicode character */
2028+
);
2029+
2030+
PyAPI_FUNC(int) _PyUnicode_IsCased(
2031+
const Py_UCS4 ch /* Unicode character */
2032+
);
2033+
20112034
PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
20122035
Py_UCS4 ch /* Unicode character */
20132036
);

Lib/test/string_tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -669,7 +669,7 @@ def test_capitalize(self):
669669

670670
# check that titlecased chars are lowered correctly
671671
# \u1ffc is the titlecased char
672-
self.checkequal('\u1ffc\u1ff3\u1ff3\u1ff3',
672+
self.checkequal('\u03a9\u0399\u1ff3\u1ff3\u1ff3',
673673
'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
674674
# check with cased non-letter chars
675675
self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',

Lib/test/test_unicode.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,8 @@ def test_fixup(s):
369369
def test_islower(self):
370370
string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
371371
self.checkequalnofix(False, '\u1FFc', 'islower')
372+
self.assertFalse('\u2167'.islower())
373+
self.assertTrue('\u2177'.islower())
372374
# non-BMP, uppercase
373375
self.assertFalse('\U00010401'.islower())
374376
self.assertFalse('\U00010427'.islower())
@@ -383,6 +385,8 @@ def test_isupper(self):
383385
string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
384386
if not sys.platform.startswith('java'):
385387
self.checkequalnofix(False, '\u1FFc', 'isupper')
388+
self.assertTrue('\u2167'.isupper())
389+
self.assertFalse('\u2177'.isupper())
386390
# non-BMP, uppercase
387391
self.assertTrue('\U00010401'.isupper())
388392
self.assertTrue('\U00010427'.isupper())
@@ -548,6 +552,18 @@ def test_lower(self):
548552
'\U0001044F\U0001044F')
549553
self.assertEqual('X\U00010427x\U0001044F'.lower(),
550554
'x\U0001044Fx\U0001044F')
555+
self.assertEqual('\ufb01'.lower(), '\ufb01')
556+
self.assertEqual('\u0130'.lower(), '\u0069\u0307')
557+
# Special case for GREEK CAPITAL LETTER SIGMA U+03A3
558+
self.assertEqual('\u03a3'.lower(), '\u03c3')
559+
self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
560+
self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
561+
self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
562+
self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
563+
self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
564+
self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
565+
self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
566+
self.assertEqual('\u2177'.lower(), '\u2177')
551567

552568
def test_upper(self):
553569
string_tests.CommonTest.test_upper(self)
@@ -558,6 +574,13 @@ def test_upper(self):
558574
'\U00010427\U00010427')
559575
self.assertEqual('X\U00010427x\U0001044F'.upper(),
560576
'X\U00010427X\U00010427')
577+
self.assertEqual('\ufb01'.upper(), 'FI')
578+
self.assertEqual('\u0130'.upper(), '\u0130')
579+
self.assertEqual('\u03a3'.upper(), '\u03a3')
580+
self.assertEqual('ß'.upper(), 'SS')
581+
self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
582+
self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
583+
self.assertEqual('\u2177'.upper(), '\u2167')
561584

562585
def test_capitalize(self):
563586
string_tests.CommonTest.test_capitalize(self)
@@ -570,6 +593,11 @@ def test_capitalize(self):
570593
'\U00010427\U0001044F')
571594
self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
572595
'X\U0001044Fx\U0001044F')
596+
self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
597+
exp = '\u0399\u0308\u0300\u0069\u0307'
598+
self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
599+
self.assertEqual('\ufb01nnish'.capitalize(), 'FInnish')
600+
self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
573601

574602
def test_title(self):
575603
string_tests.MixinStrUnicodeUserStringTest.test_title(self)
@@ -584,6 +612,9 @@ def test_title(self):
584612
'\U00010427\U0001044F \U00010427\U0001044F')
585613
self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
586614
'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
615+
self.assertEqual('\ufb01NNISH'.title(), 'Finnish')
616+
self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
617+
self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
587618

588619
def test_swapcase(self):
589620
string_tests.CommonTest.test_swapcase(self)
@@ -597,6 +628,19 @@ def test_swapcase(self):
597628
'\U00010427\U0001044F')
598629
self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
599630
'x\U0001044FX\U00010427')
631+
self.assertEqual('\ufb01'.swapcase(), 'FI')
632+
self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
633+
# Special case for GREEK CAPITAL LETTER SIGMA U+03A3
634+
self.assertEqual('\u03a3'.swapcase(), '\u03c3')
635+
self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
636+
self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
637+
self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
638+
self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
639+
self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
640+
self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
641+
self.assertEqual('\u03a3'.swapcase(), '\u03c3')
642+
self.assertEqual('ß'.swapcase(), 'SS')
643+
self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
600644

601645
def test_contains(self):
602646
# Testing Unicode contains method

Lib/test/test_unicodedata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
class UnicodeMethodsTest(unittest.TestCase):
2222

2323
# update this, if the database changes
24-
expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'
24+
expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'
2525

2626
def test_method_checksum(self):
2727
h = hashlib.sha1()

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #12736: Use full unicode case mappings for upper, lower, and title case.
14+
1315
- Issue #12760: Add a create mode to open(). Patch by David Townshend.
1416

1517
- Issue #13738: Simplify implementation of bytes.lower() and bytes.upper().

Objects/unicodectype.c

Lines changed: 75 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121
#define XID_START_MASK 0x100
2222
#define XID_CONTINUE_MASK 0x200
2323
#define PRINTABLE_MASK 0x400
24-
#define NODELTA_MASK 0x800
25-
#define NUMERIC_MASK 0x1000
24+
#define NUMERIC_MASK 0x800
25+
#define CASE_IGNORABLE_MASK 0x1000
26+
#define CASED_MASK 0x2000
27+
#define EXTENDED_CASE_MASK 0x4000
2628

2729
typedef struct {
2830
const Py_UCS4 upper;
@@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code)
5759
Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
5860
{
5961
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
60-
int delta = ctype->title;
6162

62-
if (ctype->flags & NODELTA_MASK)
63-
return delta;
64-
65-
if (delta >= 32768)
66-
delta -= 65536;
67-
68-
return ch + delta;
63+
return ctype->title ? ctype->title : ch;
6964
}
7065

7166
/* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
188183
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
189184
{
190185
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
191-
int delta = ctype->upper;
192-
if (ctype->flags & NODELTA_MASK)
193-
return delta;
194-
if (delta >= 32768)
195-
delta -= 65536;
196-
return ch + delta;
186+
187+
if (ctype->flags & EXTENDED_CASE_MASK)
188+
return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
189+
return ctype->upper ? ctype->upper : ch;
197190
}
198191

199192
/* Returns the lowercase Unicode characters corresponding to ch or just
@@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
202195
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
203196
{
204197
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
205-
int delta = ctype->lower;
206-
if (ctype->flags & NODELTA_MASK)
207-
return delta;
208-
if (delta >= 32768)
209-
delta -= 65536;
210-
return ch + delta;
198+
199+
if (ctype->flags & EXTENDED_CASE_MASK)
200+
return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
201+
return ctype->lower ? ctype->lower : ch;
202+
}
203+
204+
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
205+
{
206+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
207+
208+
if (ctype->flags & EXTENDED_CASE_MASK) {
209+
int index = ctype->lower & 0xFFFFFF;
210+
int n = ctype->lower >> 24;
211+
int i;
212+
for (i = 0; i < n; i++)
213+
res[i] = _PyUnicode_ExtendedCase[index + i];
214+
return n;
215+
}
216+
res[0] = ctype->lower ? ctype->lower : ch;
217+
return 1;
218+
}
219+
220+
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
221+
{
222+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
223+
224+
if (ctype->flags & EXTENDED_CASE_MASK) {
225+
int index = ctype->title & 0xFFFFFF;
226+
int n = ctype->title >> 24;
227+
int i;
228+
for (i = 0; i < n; i++)
229+
res[i] = _PyUnicode_ExtendedCase[index + i];
230+
return n;
231+
}
232+
res[0] = ctype->title ? ctype->title : ch;
233+
return 1;
234+
}
235+
236+
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
237+
{
238+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
239+
240+
if (ctype->flags & EXTENDED_CASE_MASK) {
241+
int index = ctype->upper & 0xFFFFFF;
242+
int n = ctype->upper >> 24;
243+
int i;
244+
for (i = 0; i < n; i++)
245+
res[i] = _PyUnicode_ExtendedCase[index + i];
246+
return n;
247+
}
248+
res[0] = ctype->upper ? ctype->upper : ch;
249+
return 1;
250+
}
251+
252+
int _PyUnicode_IsCased(Py_UCS4 ch)
253+
{
254+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
255+
256+
return (ctype->flags & CASED_MASK) != 0;
257+
}
258+
259+
int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
260+
{
261+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
262+
263+
return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
211264
}
212265

213266
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',

0 commit comments

Comments
 (0)