Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ef7fe2e

Browse files
committed
Implement names for CJK unified ideographs. Add name to KeyError output.
Verify that the lookup for an existing name succeeds.
1 parent 8579efc commit ef7fe2e

4 files changed

Lines changed: 59 additions & 8 deletions

File tree

Lib/test/output/test_ucn

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ test_ucn
22
Testing General Unicode Character Name, and case insensitivity... done.
33
Testing name to code mapping.... done.
44
Testing hangul syllable names.... done.
5-
Testing code to name mapping for all characters.... done.
6-
Found 22728 characters in the unicode name database
5+
Testing names of CJK unified ideographs.... done.
6+
Testing code to name mapping for all BMP characters.... done.
7+
Found 50212 characters in the unicode name database
78
Testing misc. symbols for unicode character name expansion.... done.
89
Testing unicode character name expansion strict error handling.... done.

Lib/test/test_ucn.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,16 +80,28 @@
8080
raise AssertionError, "Found name for U+D7A4"
8181
print "done."
8282

83-
print "Testing code to name mapping for all characters....",
83+
print "Testing names of CJK unified ideographs....",
84+
exec r"""
85+
verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400")
86+
verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5")
87+
verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
88+
verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
89+
verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
90+
verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
91+
"""
92+
print "done."
93+
94+
print "Testing code to name mapping for all BMP characters....",
8495
count = 0
85-
for code in range(65536):
96+
for code in range(0x10000):
8697
try:
8798
char = unichr(code)
8899
name = unicodedata.name(char)
89-
verify(unicodedata.lookup(name) == char)
90-
count += 1
91100
except (KeyError, ValueError):
92101
pass
102+
else:
103+
verify(unicodedata.lookup(name) == char)
104+
count += 1
93105
print "done."
94106

95107
print "Found", count, "characters in the unicode name database"

Misc/NEWS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ Extension modules
318318
is now named bsddb185.
319319

320320
- unicodedata was updated to Unicode 3.2. It now also supports names
321-
for Hangul syllables.
321+
for Hangul syllables and CJK unified ideographs.
322322

323323
- resource.getrlimit() now returns longs instead of ints.
324324

Modules/unicodedata.c

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,16 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
348348
return 1;
349349
}
350350

351+
if ((0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
352+
(0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
353+
(0x20000 <= code && code <= 0x2A6D6)) {/* CJK Ideograph Extension B */
354+
if (buflen < 28)
355+
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
356+
return 0;
357+
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
358+
return 1;
359+
}
360+
351361
if (code >= 0x110000)
352362
return 0;
353363

@@ -449,6 +459,30 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
449459
*code = SBase + (L*VCount+V)*TCount + T;
450460
return 1;
451461
}
462+
/* Otherwise, it's an illegal syllable name. */
463+
return 0;
464+
}
465+
466+
/* Check for unified ideographs. */
467+
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
468+
/* Four or five hexdigits must follow. */
469+
v = 0;
470+
name += 22;
471+
namelen -= 22;
472+
if (namelen != 4 && namelen != 5)
473+
return 0;
474+
while (namelen--) {
475+
v *= 16;
476+
if (*name >= '0' && *name <= '9')
477+
v += *name - '0';
478+
else if (*name >= 'A' && *name <= 'F')
479+
v += *name - 'A' + 10;
480+
else
481+
return 0;
482+
name++;
483+
}
484+
*code = v;
485+
return 1;
452486
}
453487

454488
/* the following is the same as python's dictionary lookup, with
@@ -535,7 +569,11 @@ unicodedata_lookup(PyObject* self, PyObject* args)
535569
return NULL;
536570

537571
if (!_getcode(name, namelen, &code)) {
538-
PyErr_SetString(PyExc_KeyError, "undefined character name");
572+
char fmt[] = "undefined character name '%s'";
573+
char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
574+
sprintf(buf, fmt, name);
575+
PyErr_SetString(PyExc_KeyError, buf);
576+
PyMem_FREE(buf);
539577
return NULL;
540578
}
541579

0 commit comments

Comments
 (0)