88""" #"
99
1010import unittest
11+ import unicodedata
1112
1213from test import support
14+ from http .client import HTTPException
15+ from test .test_normalization import check_version
1316
1417class UnicodeNamesTest (unittest .TestCase ):
1518
@@ -59,8 +62,6 @@ def test_general(self):
5962 )
6063
6164 def test_ascii_letters (self ):
62- import unicodedata
63-
6465 for char in "" .join (map (chr , range (ord ("a" ), ord ("z" )))):
6566 name = "LATIN SMALL LETTER %s" % char .upper ()
6667 code = unicodedata .lookup (name )
@@ -81,7 +82,6 @@ def test_hangul_syllables(self):
8182 self .checkletter ("HANGUL SYLLABLE HWEOK" , "\ud6f8 " )
8283 self .checkletter ("HANGUL SYLLABLE HIH" , "\ud7a3 " )
8384
84- import unicodedata
8585 self .assertRaises (ValueError , unicodedata .name , "\ud7a4 " )
8686
8787 def test_cjk_unified_ideographs (self ):
@@ -97,23 +97,97 @@ def test_cjk_unified_ideographs(self):
9797 self .checkletter ("CJK UNIFIED IDEOGRAPH-2B81D" , "\U0002B81D " )
9898
9999 def test_bmp_characters (self ):
100- import unicodedata
101- count = 0
102100 for code in range (0x10000 ):
103101 char = chr (code )
104102 name = unicodedata .name (char , None )
105103 if name is not None :
106104 self .assertEqual (unicodedata .lookup (name ), char )
107- count += 1
108105
109106 def test_misc_symbols (self ):
110107 self .checkletter ("PILCROW SIGN" , "\u00b6 " )
111108 self .checkletter ("REPLACEMENT CHARACTER" , "\uFFFD " )
112109 self .checkletter ("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK" , "\uFF9F " )
113110 self .checkletter ("FULLWIDTH LATIN SMALL LETTER A" , "\uFF41 " )
114111
112+ def test_aliases (self ):
113+ # Check that the aliases defined in the NameAliases.txt file work.
114+ # This should be updated when new aliases are added or the file
115+ # should be downloaded and parsed instead. See #12753.
116+ aliases = [
117+ ('LATIN CAPITAL LETTER GHA' , 0x01A2 ),
118+ ('LATIN SMALL LETTER GHA' , 0x01A3 ),
119+ ('KANNADA LETTER LLLA' , 0x0CDE ),
120+ ('LAO LETTER FO FON' , 0x0E9D ),
121+ ('LAO LETTER FO FAY' , 0x0E9F ),
122+ ('LAO LETTER RO' , 0x0EA3 ),
123+ ('LAO LETTER LO' , 0x0EA5 ),
124+ ('TIBETAN MARK BKA- SHOG GI MGO RGYAN' , 0x0FD0 ),
125+ ('YI SYLLABLE ITERATION MARK' , 0xA015 ),
126+ ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET' , 0xFE18 ),
127+ ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS' , 0x1D0C5 )
128+ ]
129+ for alias , codepoint in aliases :
130+ self .checkletter (alias , chr (codepoint ))
131+ name = unicodedata .name (chr (codepoint ))
132+ self .assertNotEqual (name , alias )
133+ self .assertEqual (unicodedata .lookup (alias ),
134+ unicodedata .lookup (name ))
135+ with self .assertRaises (KeyError ):
136+ unicodedata .ucd_3_2_0 .lookup (alias )
137+
138+ def test_aliases_names_in_pua_range (self ):
139+ # We are storing aliases in the PUA 15, but their names shouldn't leak
140+ for cp in range (0xf0000 , 0xf0100 ):
141+ with self .assertRaises (ValueError ) as cm :
142+ unicodedata .name (chr (cp ))
143+ self .assertEqual (str (cm .exception ), 'no such name' )
144+
145+ def test_named_sequences_names_in_pua_range (self ):
146+ # We are storing named seq in the PUA 15, but their names shouldn't leak
147+ for cp in range (0xf0100 , 0xf0fff ):
148+ with self .assertRaises (ValueError ) as cm :
149+ unicodedata .name (chr (cp ))
150+ self .assertEqual (str (cm .exception ), 'no such name' )
151+
152+ def test_named_sequences_sample (self ):
153+ # Check a few named sequences. See #12753.
154+ sequences = [
155+ ('LATIN SMALL LETTER R WITH TILDE' , '\u0072 \u0303 ' ),
156+ ('TAMIL SYLLABLE SAI' , '\u0BB8 \u0BC8 ' ),
157+ ('TAMIL SYLLABLE MOO' , '\u0BAE \u0BCB ' ),
158+ ('TAMIL SYLLABLE NNOO' , '\u0BA3 \u0BCB ' ),
159+ ('TAMIL CONSONANT KSS' , '\u0B95 \u0BCD \u0BB7 \u0BCD ' ),
160+ ]
161+ for seqname , codepoints in sequences :
162+ self .assertEqual (unicodedata .lookup (seqname ), codepoints )
163+ with self .assertRaises (SyntaxError ):
164+ self .checkletter (seqname , None )
165+ with self .assertRaises (KeyError ):
166+ unicodedata .ucd_3_2_0 .lookup (seqname )
167+
def test_named_sequences_full(self):
    # Check all the named sequences listed in NamedSequences.txt for the
    # Unicode version this build of unicodedata reports.
    url = "http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" % (
        unicodedata.unidata_version,)
    try:
        testdata = support.open_urlresource(
            url, encoding="utf-8", check=check_version)
    except (IOError, HTTPException):
        # No data file available: skip rather than fail.
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(testdata.close)
    for raw_line in testdata:
        entry = raw_line.strip()
        # Only non-blank, non-comment lines carry "name;hex hex ..." data.
        if entry and not entry.startswith('#'):
            seqname, hex_fields = entry.split(';')
            expected = ''.join(
                map(chr, (int(field, 16) for field in hex_fields.split())))
            self.assertEqual(unicodedata.lookup(seqname), expected)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)
189+
115190 def test_errors (self ):
116- import unicodedata
117191 self .assertRaises (TypeError , unicodedata .name )
118192 self .assertRaises (TypeError , unicodedata .name , 'xx' )
119193 self .assertRaises (TypeError , unicodedata .lookup )
0 commit comments