Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 677bde2

Browse files
committed
Patch #626485: Support Unicode normalization.
1 parent 74a530d commit 677bde2

6 files changed

Lines changed: 1053 additions & 23 deletions

File tree

Doc/lib/libunicodedata.tex

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ \section{\module{unicodedata} ---
55
\modulesynopsis{Access the Unicode Database.}
66
\moduleauthor{Marc-Andre Lemburg}{[email protected]}
77
\sectionauthor{Marc-Andre Lemburg}{[email protected]}
8-
8+
\sectionauthor{Martin v. L\"owis}{[email protected]}
99

1010
\index{Unicode}
1111
\index{character}
@@ -14,10 +14,10 @@ \section{\module{unicodedata} ---
1414
This module provides access to the Unicode Character Database which
1515
defines character properties for all Unicode characters. The data in
1616
this database is based on the \file{UnicodeData.txt} file version
17-
3.0.0 which is publically available from \url{ftp://ftp.unicode.org/}.
17+
3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
1818

1919
The module uses the same names and symbols as defined by the
20-
UnicodeData File Format 3.0.0 (see
20+
UnicodeData File Format 3.2.0 (see
2121
\url{http://www.unicode.org/Public/UNIDATA/UnicodeData.html}). It
2222
defines the following functions:
2323

@@ -83,3 +83,37 @@ \section{\module{unicodedata} ---
8383
character \var{unichr} as string. An empty string is returned in case
8484
no such mapping is defined.
8585
\end{funcdesc}
86+
87+
\begin{funcdesc}{normalize}{form, unistr}
88+
89+
Return the normal form \var{form} for the Unicode string \var{unistr}.
90+
Valid values for \var{form} are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
91+
92+
The Unicode standard defines various normalization forms of a Unicode
93+
string, based on the definition of canonical equivalence and
94+
compatibility equivalence. In Unicode, several characters can be
95+
expressed in various ways. For example, the character U+00C7 (LATIN
96+
CAPITAL LETTER C WITH CEDILLA) can also be expressed as the sequence
97+
U+0043 (LATIN CAPITAL LETTER C) U+0327 (COMBINING CEDILLA).
98+
99+
For each character, there are two normal forms: normal form C and
100+
normal form D. Normal form D (NFD) is also known as canonical
101+
decomposition, and translates each character into its decomposed form.
102+
Normal form C (NFC) first applies a canonical decomposition, then
103+
composes pre-combined characters again.
104+
105+
In addition to these two forms, there are two additional normal forms
106+
based on compatibility equivalence. In Unicode, certain characters are
107+
supported which normally would be unified with other characters. For
108+
example, U+2160 (ROMAN NUMERAL ONE) is really the same thing as U+0049
109+
(LATIN CAPITAL LETTER I). However, it is supported in Unicode for
110+
compatibility with existing character sets (e.g. gb2312).
111+
112+
The normal form KD (NFKD) will apply the compatibility decomposition,
113+
i.e. replace all compatibility characters with their equivalents. The
114+
normal form KC (NFKC) first applies the compatibility decomposition,
115+
followed by the canonical composition.
116+
117+
\versionadded{2.3}
118+
\end{funcdesc}
119+

Lib/test/test_normalization.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from test.test_support import verbose, TestFailed, TestSkipped, verify
2+
import sys
3+
from unicodedata import normalize
4+
try:
5+
data = open("NormalizationTest.txt","r").readlines()
6+
except IOError:
7+
raise TestSkipped("NormalizationTest.txt not found, download from http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt")
8+
9+
class RangeError:
10+
pass
11+
12+
def NFC(str):
13+
return normalize("NFC", str)
14+
15+
def NFKC(str):
16+
return normalize("NFKC", str)
17+
18+
def NFD(str):
19+
return normalize("NFD", str)
20+
21+
def NFKD(str):
22+
return normalize("NFKD", str)
23+
24+
def unistr(data):
25+
data = [int(x, 16) for x in data.split(" ")]
26+
for x in data:
27+
if x > sys.maxunicode:
28+
raise RangeError
29+
return u"".join([unichr(x) for x in data])
30+
31+
part1_data = {}
32+
for line in data:
33+
if '#' in line:
34+
line = line.split('#')[0]
35+
line = line.strip()
36+
if not line:
37+
continue
38+
if line.startswith("@Part"):
39+
part = line
40+
continue
41+
try:
42+
c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
43+
except RangeError:
44+
# Skip unsupported characters
45+
continue
46+
47+
if verbose:
48+
print line
49+
50+
# Perform tests
51+
verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
52+
verify(c4 == NFC(c4) == NFC(c5), line)
53+
verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
54+
verify(c5 == NFD(c4) == NFD(c5), line)
55+
verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5), line)
56+
verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5), line)
57+
58+
# Record part 1 data
59+
if part == "@Part1":
60+
part1_data[c1] = 1
61+
62+
# Perform tests for all other data
63+
for c in range(sys.maxunicode+1):
64+
X = unichr(c)
65+
if X in part1_data:
66+
continue
67+
assert X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c
68+

Misc/NEWS

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,8 @@ Extension modules
317317
available in source code, but not built automatically anymore, and
318318
is now named bsddb185.
319319

320-
- unicodedata was updated to Unicode 3.2. In now also supports names
321-
for Hangul syllables and CJK unified ideographs.
320+
- unicodedata was updated to Unicode 3.2. It supports normalization
321+
and names for Hangul syllables and CJK unified ideographs.
322322

323323
- resource.getrlimit() now returns longs instead of ints.
324324

0 commit comments

Comments
 (0)