# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables .rs and normalization_tests.rs files into git.
2021import collections
2122import urllib .request
2223
5758 'Cc' : ['C' ], 'Cf' : ['C' ], 'Cs' : ['C' ], 'Co' : ['C' ], 'Cn' : ['C' ],
5859}
5960
# Hangul syllable composition constants, from Unicode 9.0.0 Section 3.12,
# "Conjoining Jamo Behavior":
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE = 0xAC00   # first precomposed Hangul syllable
L_COUNT = 19      # leading consonants (choseong)
V_COUNT = 21      # vowels (jungseong)
T_COUNT = 28      # trailing consonants (jongseong), incl. "none"
S_COUNT = L_COUNT * V_COUNT * T_COUNT  # total precomposed syllables
65+
6066class UnicodeData (object ):
6167 def __init__ (self ):
6268 self ._load_unicode_data ()
@@ -66,6 +72,9 @@ def __init__(self):
6672 self .canon_comp = self ._compute_canonical_comp ()
6773 self .canon_fully_decomp , self .compat_fully_decomp = self ._compute_fully_decomposed ()
6874
75+ self .cjk_compat_variants_fully_decomp = {}
76+ self ._load_cjk_compat_ideograph_variants ()
77+
6978 def stats (name , table ):
7079 count = sum (len (v ) for v in table .values ())
7180 print ("%s: %d chars => %d decomposed chars" % (name , len (table ), count ))
@@ -75,6 +84,7 @@ def stats(name, table):
7584 stats ("Compatible decomp" , self .compat_decomp )
7685 stats ("Canonical fully decomp" , self .canon_fully_decomp )
7786 stats ("Compatible fully decomp" , self .compat_fully_decomp )
87+ stats ("CJK Compat Variants fully decomp" , self .cjk_compat_variants_fully_decomp )
7888
7989 self .ss_leading , self .ss_trailing = self ._compute_stream_safe_tables ()
8090
@@ -83,6 +93,7 @@ def _fetch(self, filename):
8393 return resp .read ().decode ('utf-8' )
8494
8595 def _load_unicode_data (self ):
96+ self .name_to_char_int = {}
8697 self .combining_classes = {}
8798 self .compat_decomp = {}
8899 self .canon_decomp = {}
@@ -95,6 +106,9 @@ def _load_unicode_data(self):
95106 char , category , cc , decomp = pieces [0 ], pieces [2 ], pieces [3 ], pieces [5 ]
96107 char_int = int (char , 16 )
97108
109+ name = pieces [1 ].strip ()
110+ self .name_to_char_int [name ] = char_int
111+
98112 if cc != '0' :
99113 self .combining_classes [char_int ] = cc
100114
@@ -106,6 +120,41 @@ def _load_unicode_data(self):
106120 if category == 'M' or 'M' in expanded_categories .get (category , []):
107121 self .general_category_mark .append (char_int )
108122
def _load_cjk_compat_ideograph_variants(self):
    """Populate ``cjk_compat_variants_fully_decomp`` from StandardizedVariants.txt.

    Each standardized variation sequence whose description field names a
    CJK Compatibility Ideograph maps that ideograph to the sequence's own
    code points, which are asserted to already be fully normalized.
    """
    for raw_line in self._fetch("StandardizedVariants.txt").splitlines():
        # Drop any trailing '#' comment; skip blank/comment-only lines.
        entry = raw_line.split('#', 1)[0].strip()
        if not entry:
            continue

        seq_field, name_field, shaping = entry.split(';')
        name_field = name_field.strip()

        # Don't use variations that only apply in particular shaping environments.
        if shaping:
            continue

        # Only entries whose description field is the name of a
        # CJK Compatibility Ideograph are of interest here.
        if name_field not in self.name_to_char_int:
            continue
        if not name_field.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
            continue

        target = self.name_to_char_int[name_field]

        # Sanity-check the assumptions the normalizer relies on.
        assert target not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
        assert target not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
        assert len(self.canon_decomp[target]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
        # If we ever need to handle Hangul here, we'll need to handle it separately.
        assert not (S_BASE <= target < S_BASE + S_COUNT)

        parts = [int(cp, 16) for cp in seq_field.split()]
        for cp in parts:
            assert cp not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
            assert cp not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
        self.cjk_compat_variants_fully_decomp[target] = parts
109158 def _load_norm_props (self ):
110159 props = collections .defaultdict (list )
111160
@@ -178,11 +227,6 @@ def _compute_fully_decomposed(self):
178227 The upshot is that decomposition code is very simple and easy to inline
179228 at mild code size cost.
180229 """
181- # Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
182- # http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
183- S_BASE , L_COUNT , V_COUNT , T_COUNT = 0xAC00 , 19 , 21 , 28
184- S_COUNT = L_COUNT * V_COUNT * T_COUNT
185-
186230 def _decompose (char_int , compatible ):
187231 # 7-bit ASCII never decomposes
188232 if char_int <= 0x7f :
@@ -320,8 +364,8 @@ def gen_composition_table(canon_comp, out):
320364 out .write (" }\n " )
321365 out .write ("}\n " )
322366
323- def gen_decomposition_tables (canon_decomp , compat_decomp , out ):
324- tables = [(canon_decomp , 'canonical' ), (compat_decomp , 'compatibility' )]
367+ def gen_decomposition_tables (canon_decomp , compat_decomp , cjk_compat_variants_decomp , out ):
368+ tables = [(canon_decomp , 'canonical' ), (compat_decomp , 'compatibility' ), ( cjk_compat_variants_decomp , 'cjk_compat_variants' ) ]
325369 for table , name in tables :
326370 gen_mph_data (name + '_decomposed' , table , "(u32, &'static [char])" ,
327371 lambda k : "(0x{:x}, &[{}])" .format (k ,
@@ -491,7 +535,7 @@ def minimal_perfect_hash(d):
491535 gen_composition_table (data .canon_comp , out )
492536 out .write ("\n " )
493537
494- gen_decomposition_tables (data .canon_fully_decomp , data .compat_fully_decomp , out )
538+ gen_decomposition_tables (data .canon_fully_decomp , data .compat_fully_decomp , data . cjk_compat_variants_fully_decomp , out )
495539
496540 gen_combining_mark (data .general_category_mark , out )
497541 out .write ("\n " )
0 commit comments