1- import re , unicodedata , sys
1+ import re , sys
2+ from unicodedata import ucd_3_2_0 as unicodedata
23
34if sys .maxunicode == 65535 :
45 raise RuntimeError ("need UCS-4 Python" )
@@ -37,16 +38,20 @@ def compact_set(l):
3738 tuple .append ((prev ,prev + span + 1 ))
3839 else :
3940 single .append (prev )
40- tuple = " + " .join (["list(range(%d,%d))" % t for t in tuple ])
41+ if not single and len (tuple ) == 1 :
42+ tuple = "range(%d,%d)" % tuple [0 ]
43+ else :
44+ tuple = " + " .join ("list(range(%d,%d))" % t for t in tuple )
4145 if not single :
4246 return "set(%s)" % tuple
4347 if not tuple :
44- return "set(%s )" % repr (single )
45- return "set(%s + %s)" % (repr ( single ), tuple )
48+ return "set(%r )" % (single , )
49+ return "set(%r + %s)" % (single , tuple )
4650
4751############## Read the tables in the RFC #######################
4852
49- data = open ("rfc3454.txt" ).readlines ()
53+ with open ("rfc3454.txt" ) as f :
54+ data = f .readlines ()
5055
5156tables = []
5257curname = None
@@ -55,8 +60,7 @@ def compact_set(l):
5560 if not l :
5661 continue
5762 # Skip RFC page breaks
58- if l .startswith ("Hoffman & Blanchet" ) or \
59- l .startswith ("RFC 3454" ):
63+ if l .startswith (("Hoffman & Blanchet" , "RFC 3454" )):
6064 continue
6165 # Find start/end lines
6266 m = re .match ("----- (Start|End) Table ([A-Z](.[0-9])+) -----" , l )
@@ -71,6 +75,8 @@ def compact_set(l):
7175 else :
7276 if not curname :
7377 raise RuntimeError ("End without start" , l )
78+ if curname != m .group (2 ):
79+ raise RuntimeError ("Unexpected end" , l )
7480 curname = None
7581 continue
7682 if not curname :
@@ -113,10 +119,10 @@ def compact_set(l):
113119and mappings, for which a mapping function is provided.
114120\" \" \"
115121
116- import unicodedata
122+ from unicodedata import ucd_3_2_0 as unicodedata
117123""" )
118124
119- print ("assert unicodedata.unidata_version == %s " % repr (unicodedata .unidata_version ))
125+ print ("assert unicodedata.unidata_version == %r " % (unicodedata .unidata_version , ))
120126
121127# A.1 is the table of unassigned characters
122128# XXX Plane 15 PUA is listed as unassigned in Python.
@@ -173,15 +179,15 @@ def in_table_b1(code):
173179b3_exceptions = {}
174180
175181for k ,v in table_b2 .items ():
176- if map (ord , unichr (k ).lower ()) != v :
177- b3_exceptions [k ] = u "" .join (map (unichr ,v ))
182+ if list ( map (ord , chr (k ).lower () )) != v :
183+ b3_exceptions [k ] = "" .join (map (chr ,v ))
178184
179185b3 = sorted (b3_exceptions .items ())
180186
181187print ("""
182188b3_exceptions = {""" )
183- for i ,( k , v ) in enumerate (b3 ):
184- print ("0x%x:%s ," % ( k , repr ( v )) , end = ' ' )
189+ for i , kv in enumerate (b3 ):
190+ print ("0x%x:%a ," % kv , end = ' ' )
185191 if i % 4 == 3 :
186192 print ()
187193print ("}" )
@@ -224,7 +230,7 @@ def map_table_b2(a):
224230def map_table_b2(a):
225231 al = map_table_b3(a)
226232 b = unicodedata.normalize("NFKC", al)
227- bl = u "".join([map_table_b3(ch) for ch in b])
233+ bl = "".join([map_table_b3(ch) for ch in b])
228234 c = unicodedata.normalize("NFKC", bl)
229235 if b != c:
230236 return c
@@ -240,7 +246,7 @@ def map_table_b2(a):
240246
241247print ("""
242248def in_table_c11(code):
243- return code == u " "
249+ return code == " "
244250""" )
245251
246252# C.1.2 is the rest of all space characters
@@ -249,12 +255,12 @@ def in_table_c11(code):
249255assert name == "C.1.2"
250256
251257# table = set(table.keys())
252- # Zs = set(gen_category(["Zs"])) - set([ 0x20])
258+ # Zs = set(gen_category(["Zs"])) - { 0x20}
253259# assert Zs == table
254260
255261print ("""
256262def in_table_c12(code):
257- return unicodedata.category(code) == "Zs" and code != u " "
263+ return unicodedata.category(code) == "Zs" and code != " "
258264
259265def in_table_c11_c12(code):
260266 return unicodedata.category(code) == "Zs"
0 commit comments