Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1b08b30

Browse files
committed
Merged revisions 71894 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk ........ r71894 | walter.doerwald | 2009-04-25 16:03:16 +0200 (Sa, 25 Apr 2009) | 4 lines Issue #5828 (Invalid behavior of unicode.lower): Fixed bogus logic in makeunicodedata.py and regenerated the Unicode database (This fixes u'\u1d79'.lower() == '\x00'). ........
1 parent 939f9c8 commit 1b08b30

4 files changed

Lines changed: 41 additions & 25 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
class UnicodeMethodsTest(unittest.TestCase):
2121

2222
# update this, if the database changes
23-
expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e'
23+
expectedchecksum = 'b7db9b5f1d804976fa921d2009cbef6f025620c1'
2424

2525
def test_method_checksum(self):
2626
h = hashlib.sha1()
@@ -258,6 +258,19 @@ def test_ucd_510(self):
258258
# the upper-case mapping: as delta, or as absolute value
259259
self.assert_("a".upper()=='A')
260260
self.assert_("\u1d79".upper()=='\ua77d')
261+
self.assert_(".".upper()=='.')
262+
263+
def test_bug_5828(self):
264+
self.assertEqual("\u1d79".lower(), "\u1d79")
265+
# Only U+0000 should have U+0000 as its upper/lower/titlecase variant
266+
self.assertEqual(
267+
[
268+
c for c in range(sys.maxunicode+1)
269+
if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
270+
],
271+
[0]
272+
)
273+
261274

262275
def test_main():
263276
test.support.run_unittest(

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,10 @@ Library
104104
- Issue #2703: SimpleXMLRPCDispatcher.__init__: Provide default values for
105105
new arguments introduced in 2.5.
106106

107+
- Issue #5828 (Invalid behavior of unicode.lower): Fixed bogus logic in
108+
makeunicodedata.py and regenerated the Unicode database (This fixes
109+
u'\u1d79'.lower() == '\x00').
110+
107111
Extension Modules
108112
-----------------
109113

Objects/unicodetype_db.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -127,7 +127,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
127127
{0, 0, 0, 0, 8, 1540},
128128
{0, 0, 0, 0, 9, 1540},
129129
{0, 0, 0, 0, 0, 1792},
130-
{42877, 0, 42877, 0, 0, 3849},
130+
{42877, 7545, 42877, 0, 0, 3849},
131131
{3814, 0, 3814, 0, 0, 1801},
132132
{65477, 0, 65477, 0, 0, 1801},
133133
{0, 57921, 0, 0, 0, 1921},
@@ -174,7 +174,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
174174
{0, 54787, 0, 0, 0, 1921},
175175
{0, 54753, 0, 0, 0, 1921},
176176
{58272, 0, 58272, 0, 0, 1801},
177-
{0, 7545, 0, 0, 0, 3969},
177+
{42877, 7545, 42877, 0, 0, 3969},
178178
{0, 40, 0, 0, 0, 1921},
179179
{65496, 0, 65496, 0, 0, 1801},
180180
};

Tools/unicode/makeunicodedata.py

Lines changed: 21 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -383,33 +383,32 @@ def makeunicodetype(unicode, trace):
383383
flags |= XID_CONTINUE_MASK
384384
# use delta predictor for upper/lower/title if it fits
385385
if record[12]:
386-
upper = int(record[12], 16) - char
387-
if -32768 <= upper <= 32767 and delta:
388-
upper = upper & 0xffff
389-
else:
390-
upper += char
391-
delta = False
386+
upper = int(record[12], 16)
392387
else:
393-
upper = 0
388+
upper = char
394389
if record[13]:
395-
lower = int(record[13], 16) - char
396-
if -32768 <= lower <= 32767 and delta:
397-
lower = lower & 0xffff
398-
else:
399-
lower += char
400-
delta = False
390+
lower = int(record[13], 16)
401391
else:
402-
lower = 0
392+
lower = char
403393
if record[14]:
404-
title = int(record[14], 16) - char
405-
if -32768 <= lower <= 32767 and delta:
406-
title = title & 0xffff
407-
else:
408-
title += char
409-
delta = False
394+
title = int(record[14], 16)
395+
else:
396+
# UCD.html says that a missing title char means that
397+
# it defaults to the uppercase character, not to the
398+
# character itself. Apparently, in the current UCD (5.x)
399+
# this feature is never used
400+
title = upper
401+
upper_d = upper - char
402+
lower_d = lower - char
403+
title_d = title - char
404+
if -32768 <= upper_d <= 32767 and \
405+
-32768 <= lower_d <= 32767 and \
406+
-32768 <= title_d <= 32767:
407+
# use deltas
408+
upper = upper_d & 0xffff
409+
lower = lower_d & 0xffff
410+
title = title_d & 0xffff
410411
else:
411-
title = 0
412-
if not delta:
413412
flags |= NODELTA_MASK
414413
# decimal digit, integer digit
415414
decimal = 0

0 commit comments

Comments
 (0)