Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 5d98ec7

Browse files
committed
Issue python#5828 (Invalid behavior of unicode.lower): Fixed bogus logic in
makeunicodedata.py and regenerated the Unicode database (this fixes u'\u1d79'.lower() incorrectly returning '\x00').
1 parent 140d9d6 commit 5d98ec7

4 files changed

Lines changed: 41 additions & 25 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
class UnicodeMethodsTest(unittest.TestCase):
2121

2222
# update this, if the database changes
23-
expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e'
23+
expectedchecksum = 'b7db9b5f1d804976fa921d2009cbef6f025620c1'
2424

2525
def test_method_checksum(self):
2626
h = hashlib.sha1()
@@ -257,6 +257,19 @@ def test_ucd_510(self):
257257
# the upper-case mapping: as delta, or as absolute value
258258
self.assert_(u"a".upper()==u'A')
259259
self.assert_(u"\u1d79".upper()==u'\ua77d')
260+
self.assert_(u".".upper()==u".")
261+
262+
def test_bug_5828(self):
263+
self.assertEqual(u"\u1d79".lower(), u"\u1d79")
264+
# Only U+0000 should have U+0000 as its upper/lower/titlecase variant
265+
self.assertEqual(
266+
[
267+
c for c in range(sys.maxunicode+1)
268+
if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
269+
],
270+
[0]
271+
)
272+
260273

261274
def test_main():
262275
test.test_support.run_unittest(

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,10 @@ Library
773773
- Issue #2703: SimpleXMLRPCDispatcher.__init__: Provide default values for
774774
new arguments introduced in 2.5.
775775

776+
- Issue #5828 (Invalid behavior of unicode.lower): Fixed bogus logic in
777+
makeunicodedata.py and regenerated the Unicode database (This fixes
778+
u'\u1d79'.lower() == '\x00').
779+
776780
Tools/Demos
777781
-----------
778782

Objects/unicodetype_db.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
118118
{0, 0, 0, 0, 7, 4},
119119
{0, 0, 0, 0, 8, 4},
120120
{0, 0, 0, 0, 9, 4},
121-
{42877, 0, 42877, 0, 0, 265},
121+
{42877, 7545, 42877, 0, 0, 265},
122122
{3814, 0, 3814, 0, 0, 9},
123123
{65477, 0, 65477, 0, 0, 9},
124124
{0, 57921, 0, 0, 0, 129},
@@ -159,7 +159,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
159159
{0, 54787, 0, 0, 0, 129},
160160
{0, 54753, 0, 0, 0, 129},
161161
{58272, 0, 58272, 0, 0, 9},
162-
{0, 7545, 0, 0, 0, 385},
162+
{42877, 7545, 42877, 0, 0, 385},
163163
{0, 40, 0, 0, 0, 129},
164164
{65496, 0, 65496, 0, 0, 9},
165165
};

Tools/unicode/makeunicodedata.py

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -371,33 +371,32 @@ def makeunicodetype(unicode, trace):
371371
flags |= UPPER_MASK
372372
# use delta predictor for upper/lower/title if it fits
373373
if record[12]:
374-
upper = int(record[12], 16) - char
375-
if -32768 <= upper <= 32767 and delta:
376-
upper = upper & 0xffff
377-
else:
378-
upper += char
379-
delta = False
374+
upper = int(record[12], 16)
380375
else:
381-
upper = 0
376+
upper = char
382377
if record[13]:
383-
lower = int(record[13], 16) - char
384-
if -32768 <= lower <= 32767 and delta:
385-
lower = lower & 0xffff
386-
else:
387-
lower += char
388-
delta = False
378+
lower = int(record[13], 16)
389379
else:
390-
lower = 0
380+
lower = char
391381
if record[14]:
392-
title = int(record[14], 16) - char
393-
if -32768 <= lower <= 32767 and delta:
394-
title = title & 0xffff
395-
else:
396-
title += char
397-
delta = False
382+
title = int(record[14], 16)
383+
else:
384+
# UCD.html says that a missing title char means that
385+
# it defaults to the uppercase character, not to the
386+
# character itself. Apparently, in the current UCD (5.x)
387+
# this feature is never used
388+
title = upper
389+
upper_d = upper - char
390+
lower_d = lower - char
391+
title_d = title - char
392+
if -32768 <= upper_d <= 32767 and \
393+
-32768 <= lower_d <= 32767 and \
394+
-32768 <= title_d <= 32767:
395+
# use deltas
396+
upper = upper_d & 0xffff
397+
lower = lower_d & 0xffff
398+
title = title_d & 0xffff
398399
else:
399-
title = 0
400-
if not delta:
401400
flags |= NODELTA_MASK
402401
# decimal digit, integer digit
403402
decimal = 0

0 commit comments

Comments
 (0)