Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 806d8cf

Browse files
committed
Merged revisions 79494,79496 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r79494 | florent.xicluna | 2010-03-30 10:24:06 +0200 (mar, 30 mar 2010) | 2 lines
#7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to Unicode Standard Annex #14.
........
r79496 | florent.xicluna | 2010-03-30 18:29:03 +0200 (mar, 30 mar 2010) | 2 lines
Highlight the change of behavior related to r79494. Now VT and FF are linebreaks.
........
1 parent 364129e commit 806d8cf

5 files changed

Lines changed: 52 additions & 12 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class UnicodeMethodsTest(unittest.TestCase):
2525

2626
def test_method_checksum(self):
2727
h = hashlib.sha1()
28-
for i in range(65536):
28+
for i in range(0x10000):
2929
char = chr(i)
3030
data = [
3131
# Predicates (single char)
@@ -284,6 +284,17 @@ def test_bug_4971(self):
284284
self.assertEqual("\u01c5".title(), "\u01c5")
285285
self.assertEqual("\u01c6".title(), "\u01c5")
286286

287+
def test_linebreak_7643(self):
288+
for i in range(0x10000):
289+
lines = (chr(i) + 'A').splitlines()
290+
if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
291+
0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
292+
self.assertEqual(len(lines), 2,
293+
r"\u%.4x should be a linebreak" % i)
294+
else:
295+
self.assertEqual(len(lines), 1,
296+
r"\u%.4x should not be a linebreak" % i)
297+
287298
def test_main():
288299
test.support.run_unittest(
289300
UnicodeMiscTest,

Misc/NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,11 @@ C-API
293293
Library
294294
-------
295295

296+
- Backwards incompatible change: Unicode codepoints line tabulation (0x0B) and
297+
form feed (0x0C) are now considered linebreaks, as specified in Unicode
298+
Standard Annex #14. See issue #7643.
299+
http://www.unicode.org/reports/tr14/
300+
296301
- Comparisons using one of <, <=, >, >= between a complex instance and
297302
a Fractions instance now raise TypeError instead of returning
298303
True/False. This makes Fraction <=> complex comparisons consistent with

Objects/unicodeobject.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,9 @@ static const char unicode_default_encoding[] = "utf-8";
126126
/* Fast detection of the most frequent whitespace characters */
127127
const unsigned char _Py_ascii_whitespace[] = {
128128
0, 0, 0, 0, 0, 0, 0, 0,
129-
/* case 0x0009: * HORIZONTAL TABULATION */
129+
/* case 0x0009: * CHARACTER TABULATION */
130130
/* case 0x000A: * LINE FEED */
131-
/* case 0x000B: * VERTICAL TABULATION */
131+
/* case 0x000B: * LINE TABULATION */
132132
/* case 0x000C: * FORM FEED */
133133
/* case 0x000D: * CARRIAGE RETURN */
134134
0, 1, 1, 1, 1, 1, 0, 0,
@@ -163,8 +163,10 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
163163
static unsigned char ascii_linebreak[] = {
164164
0, 0, 0, 0, 0, 0, 0, 0,
165165
/* 0x000A, * LINE FEED */
166+
/* 0x000B, * LINE TABULATION */
167+
/* 0x000C, * FORM FEED */
166168
/* 0x000D, * CARRIAGE RETURN */
167-
0, 0, 1, 0, 0, 1, 0, 0,
169+
0, 0, 1, 1, 1, 1, 0, 0,
168170
0, 0, 0, 0, 0, 0, 0, 0,
169171
/* 0x001C, * FILE SEPARATOR */
170172
/* 0x001D, * GROUP SEPARATOR */

Objects/unicodetype_db.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -694,7 +694,7 @@ static unsigned char index1[] = {
694694
};
695695

696696
static unsigned char index2[] = {
697-
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
697+
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
698698
1, 1, 1, 1, 3, 3, 3, 2, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
699699
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 5, 5, 5, 5, 5, 5, 16, 16, 16, 16,
700700
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
@@ -3395,13 +3395,16 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
33953395
#endif
33963396
}
33973397

3398-
/* Returns 1 for Unicode characters having the category 'Zl',
3399-
* 'Zp' or type 'B', 0 otherwise.
3398+
/* Returns 1 for Unicode characters having the line break
3399+
* property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
3400+
* type 'B', 0 otherwise.
34003401
*/
34013402
int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
34023403
{
34033404
switch (ch) {
34043405
case 0x000A:
3406+
case 0x000B:
3407+
case 0x000C:
34053408
case 0x000D:
34063409
case 0x001C:
34073410
case 0x001D:

Tools/unicode/makeunicodedata.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
UNIHAN = "Unihan%s.txt"
3939
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
4040
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
41+
LINE_BREAK = "LineBreak%s.txt"
4142

4243
old_versions = ["3.2.0"]
4344

@@ -52,6 +53,8 @@
5253

5354
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
5455

56+
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
57+
5558
# note: should match definitions in Objects/unicodectype.c
5659
ALPHA_MASK = 0x01
5760
DECIMAL_MASK = 0x02
@@ -77,7 +80,8 @@ def maketables(trace=0):
7780
EASTASIAN_WIDTH % version,
7881
UNIHAN % version,
7982
DERIVED_CORE_PROPERTIES % version,
80-
DERIVEDNORMALIZATION_PROPS % version)
83+
DERIVEDNORMALIZATION_PROPS % version,
84+
LINE_BREAK % version)
8185

8286
print(len(list(filter(None, unicode.table))), "characters")
8387

@@ -378,7 +382,7 @@ def makeunicodetype(unicode, trace):
378382
flags |= ALPHA_MASK
379383
if category == "Ll":
380384
flags |= LOWER_MASK
381-
if category == "Zl" or bidirectional == "B":
385+
if 'Line_Break' in properties or bidirectional == "B":
382386
flags |= LINEBREAK_MASK
383387
linebreaks.append(char)
384388
if category == "Zs" or bidirectional in ("WS", "B", "S"):
@@ -537,8 +541,9 @@ def makeunicodetype(unicode, trace):
537541
print(file=fp)
538542

539543
# Generate code for _PyUnicode_IsLinebreak()
540-
print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
541-
print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
544+
print("/* Returns 1 for Unicode characters having the line break", file=fp)
545+
print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
546+
print(" * type 'B', 0 otherwise.", file=fp)
542547
print(" */", file=fp)
543548
print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
544549
print('{', file=fp)
@@ -826,7 +831,8 @@ class UnicodeData:
826831
# derived-props] (17)
827832

828833
def __init__(self, filename, exclusions, eastasianwidth, unihan,
829-
derivedprops, derivednormalizationprops=None, expand=1):
834+
derivedprops, derivednormalizationprops=None, linebreakprops=None,
835+
expand=1):
830836
self.changed = []
831837
file = open(filename)
832838
table = [None] * 0x110000
@@ -912,6 +918,19 @@ def __init__(self, filename, exclusions, eastasianwidth, unihan,
912918
# apply to unassigned code points; ignore them
913919
table[char][-1].add(p)
914920

921+
if linebreakprops:
922+
for s in open(linebreakprops):
923+
s = s.partition('#')[0]
924+
s = [i.strip() for i in s.split(';')]
925+
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
926+
continue
927+
if '..' not in s[0]:
928+
first = last = int(s[0], 16)
929+
else:
930+
first, last = [int(c, 16) for c in s[0].split('..')]
931+
for char in range(first, last+1):
932+
table[char][-1].add('Line_Break')
933+
915934
if derivednormalizationprops:
916935
quickchecks = [0] * 0x110000 # default is Yes
917936
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()

0 commit comments

Comments
 (0)