7070NODELTA_MASK = 0x800
7171NUMERIC_MASK = 0x1000
7272
# Expected CJK unified-ideograph ranges, as (first, last) pairs of
# upper-case hexadecimal code-point strings.  They are kept as strings
# (not ints) because they are compared for equality against the
# (first, last) code-point fields collected from the "<CJK Ideograph..."
# First/Last range entries while UnicodeData expands ranges; a mismatch
# raises ValueError("CJK ranges deviate: ...").
#
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
]
81+
7382def maketables (trace = 0 ):
7483
7584 print ("--- Reading" , UNICODE_DATA % "" , "..." )
@@ -81,7 +90,7 @@ def maketables(trace=0):
8190
8291 for version in old_versions :
8392 print ("--- Reading" , UNICODE_DATA % ("-" + version ), "..." )
84- old_unicode = UnicodeData (version )
93+ old_unicode = UnicodeData (version , cjk_check = False )
8594 print (len (list (filter (None , old_unicode .table ))), "characters" )
8695 merge_old_version (version , unicode , old_unicode )
8796
@@ -804,7 +813,8 @@ class UnicodeData:
804813
805814 def __init__ (self , version ,
806815 linebreakprops = False ,
807- expand = 1 ):
816+ expand = 1 ,
817+ cjk_check = True ):
808818 self .changed = []
809819 file = open_data (UNICODE_DATA , version )
810820 table = [None ] * 0x110000
@@ -816,6 +826,8 @@ def __init__(self, version,
816826 char = int (s [0 ], 16 )
817827 table [char ] = s
818828
829+ cjk_ranges_found = []
830+
819831 # expand first-last ranges
820832 if expand :
821833 field = None
@@ -826,12 +838,17 @@ def __init__(self, version,
826838 s [1 ] = ""
827839 field = s
828840 elif s [1 ][- 5 :] == "Last>" :
841+ if s [1 ].startswith ("<CJK Ideograph" ):
842+ cjk_ranges_found .append ((field [0 ],
843+ s [0 ]))
829844 s [1 ] = ""
830845 field = None
831846 elif field :
832847 f2 = field [:]
833848 f2 [0 ] = "%X" % i
834849 table [i ] = f2
850+ if cjk_check and cjk_ranges != cjk_ranges_found :
851+ raise ValueError ("CJK ranges deviate: have %r" % cjk_ranges_found )
835852
836853 # public attributes
837854 self .filename = UNICODE_DATA % ''
0 commit comments