3838UNIHAN = "Unihan%s.txt"
3939DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
4040DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
41+ LINE_BREAK = "LineBreak%s.txt"
4142
4243old_versions = ["3.2.0" ]
4344
5253
5354EASTASIANWIDTH_NAMES = [ "F" , "H" , "W" , "Na" , "A" , "N" ]
5455
56+ MANDATORY_LINE_BREAKS = [ "BK" , "CR" , "LF" , "NL" ]
57+
5558# note: should match definitions in Objects/unicodectype.c
5659ALPHA_MASK = 0x01
5760DECIMAL_MASK = 0x02
@@ -77,7 +80,8 @@ def maketables(trace=0):
7780 EASTASIAN_WIDTH % version ,
7881 UNIHAN % version ,
7982 DERIVED_CORE_PROPERTIES % version ,
80- DERIVEDNORMALIZATION_PROPS % version )
83+ DERIVEDNORMALIZATION_PROPS % version ,
84+ LINE_BREAK % version )
8185
8286 print (len (list (filter (None , unicode .table ))), "characters" )
8387
@@ -378,7 +382,7 @@ def makeunicodetype(unicode, trace):
378382 flags |= ALPHA_MASK
379383 if category == "Ll" :
380384 flags |= LOWER_MASK
381- if category == "Zl" or bidirectional == "B" :
385+ if 'Line_Break' in properties or bidirectional == "B" :
382386 flags |= LINEBREAK_MASK
383387 linebreaks .append (char )
384388 if category == "Zs" or bidirectional in ("WS" , "B" , "S" ):
@@ -537,8 +541,9 @@ def makeunicodetype(unicode, trace):
537541 print (file = fp )
538542
539543 # Generate code for _PyUnicode_IsLinebreak()
540- print ("/* Returns 1 for Unicode characters having the category 'Zl'," , file = fp )
541- print (" * 'Zp' or type 'B', 0 otherwise." , file = fp )
544+ print ("/* Returns 1 for Unicode characters having the line break" , file = fp )
545+ print (" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional" , file = fp )
546+ print (" * type 'B', 0 otherwise." , file = fp )
542547 print (" */" , file = fp )
543548 print ('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)' , file = fp )
544549 print ('{' , file = fp )
@@ -826,7 +831,8 @@ class UnicodeData:
826831 # derived-props] (17)
827832
828833 def __init__ (self , filename , exclusions , eastasianwidth , unihan ,
829- derivedprops , derivednormalizationprops = None , expand = 1 ):
834+ derivedprops , derivednormalizationprops = None , linebreakprops = None ,
835+ expand = 1 ):
830836 self .changed = []
831837 file = open (filename )
832838 table = [None ] * 0x110000
@@ -912,6 +918,19 @@ def __init__(self, filename, exclusions, eastasianwidth, unihan,
912918 # apply to unassigned code points; ignore them
913919 table [char ][- 1 ].add (p )
914920
921+ if linebreakprops :
922+ for s in open (linebreakprops ):
923+ s = s .partition ('#' )[0 ]
924+ s = [i .strip () for i in s .split (';' )]
925+ if len (s ) < 2 or s [1 ] not in MANDATORY_LINE_BREAKS :
926+ continue
927+ if '..' not in s [0 ]:
928+ first = last = int (s [0 ], 16 )
929+ else :
930+ first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
931+ for char in range (first , last + 1 ):
932+ table [char ][- 1 ].add ('Line_Break' )
933+
915934 if derivednormalizationprops :
916935 quickchecks = [0 ] * 0x110000 # default is Yes
917936 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC' .split ()
0 commit comments