Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 5cbc71e

Browse files
committed
Issue #10459: Update CJK character names to Unicode 6.0.
1 parent 249d7e3 commit 5cbc71e

4 files changed

Lines changed: 33 additions & 7 deletions

File tree

Lib/test/test_ucn.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,13 @@ def test_cjk_unified_ideographs(self):
8888
self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
8989
self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
9090
self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
91-
self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", "\u9fa5")
91+
self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
9292
self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
9393
self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
94+
self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
95+
self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
96+
self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
97+
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
9498

9599
def test_bmp_characters(self):
96100
import unicodedata

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ Core and Builtins
3232
Library
3333
-------
3434

35+
- Issue #10459: Update CJK character names to Unicode 6.0.
36+
3537
- Issue #4493: urllib.request adds '/' in front of path components which do not
3638
start with '/'. Common behavior exhibited by browsers and other clients.
3739

Modules/unicodedata.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -866,13 +866,16 @@ static char *hangul_syllables[][3] = {
866866
{ 0, 0, "H" }
867867
};
868868

869+
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
869870
static int
870871
is_unified_ideograph(Py_UCS4 code)
871872
{
872-
return (
873-
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
874-
(0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
875-
(0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
873+
return
874+
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
875+
(0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
876+
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
877+
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
878+
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
876879
}
877880

878881
static int

Tools/unicode/makeunicodedata.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,15 @@
7070
NODELTA_MASK = 0x800
7171
NUMERIC_MASK = 0x1000
7272

73+
# these ranges need to match unicodedata.c:is_unified_ideograph
74+
cjk_ranges = [
75+
('3400', '4DB5'),
76+
('4E00', '9FCB'),
77+
('20000', '2A6D6'),
78+
('2A700', '2B734'),
79+
('2B740', '2B81D')
80+
]
81+
7382
def maketables(trace=0):
7483

7584
print("--- Reading", UNICODE_DATA % "", "...")
@@ -81,7 +90,7 @@ def maketables(trace=0):
8190

8291
for version in old_versions:
8392
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
84-
old_unicode = UnicodeData(version)
93+
old_unicode = UnicodeData(version, cjk_check=False)
8594
print(len(list(filter(None, old_unicode.table))), "characters")
8695
merge_old_version(version, unicode, old_unicode)
8796

@@ -804,7 +813,8 @@ class UnicodeData:
804813

805814
def __init__(self, version,
806815
linebreakprops=False,
807-
expand=1):
816+
expand=1,
817+
cjk_check=True):
808818
self.changed = []
809819
file = open_data(UNICODE_DATA, version)
810820
table = [None] * 0x110000
@@ -816,6 +826,8 @@ def __init__(self, version,
816826
char = int(s[0], 16)
817827
table[char] = s
818828

829+
cjk_ranges_found = []
830+
819831
# expand first-last ranges
820832
if expand:
821833
field = None
@@ -826,12 +838,17 @@ def __init__(self, version,
826838
s[1] = ""
827839
field = s
828840
elif s[1][-5:] == "Last>":
841+
if s[1].startswith("<CJK Ideograph"):
842+
cjk_ranges_found.append((field[0],
843+
s[0]))
829844
s[1] = ""
830845
field = None
831846
elif field:
832847
f2 = field[:]
833848
f2[0] = "%X" % i
834849
table[i] = f2
850+
if cjk_check and cjk_ranges != cjk_ranges_found:
851+
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
835852

836853
# public attributes
837854
self.filename = UNICODE_DATA % ''

0 commit comments

Comments
 (0)