python
diff --git a/‎Doc/lib/libunicodedata.tex‎
Lines changed: 37 additions & 3 deletions b/‎Doc/lib/libunicodedata.tex‎
Lines changed: 37 additions & 3 deletions
diff --git a/‎Lib/test/test_normalization.py‎
Lines changed: 68 additions & 0 deletions b/‎Lib/test/test_normalization.py‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎Misc/NEWS‎
Lines changed: 2 additions & 2 deletions b/‎Misc/NEWS‎
Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@ \section{\module{unicodedata} ---
 \modulesynopsis{Access the Unicode Database.}
 \moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
 \sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
-
+\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
 
 \index{Unicode}
 \index{character}
@@ -14,10 +14,10 @@ \section{\module{unicodedata} ---
 This module provides access to the Unicode Character Database which
 defines character properties for all Unicode characters. The data in
 this database is based on the \file{UnicodeData.txt} file version
-3.0.0 which is publically available from \url{ftp://ftp.unicode.org/}.
+3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
 
 The module uses the same names and symbols as defined by the
-UnicodeData File Format 3.0.0 (see
+UnicodeData File Format 3.2.0 (see
 \url{http://www.unicode.org/Public/UNIDATA/UnicodeData.html}).  It
 defines the following functions:
 
@@ -83,3 +83,37 @@ \section{\module{unicodedata} ---
   character \var{unichr} as string. An empty string is returned in case
   no such mapping is defined.
 \end{funcdesc}
+
+\begin{funcdesc}{normalize}{form, unistr}
+
+Return the normal form \var{form} for the Unicode string \var{unistr}.
+Valid values for \var{form} are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
+
+The Unicode standard defines various normalization forms of a Unicode
+string, based on the definition of canonical equivalence and
+compatibility equivalence. In Unicode, several characters can be
+expressed in various ways. For example, the character U+00C7 (LATIN
+CAPITAL LETTER C WITH CEDILLA) can also be expressed as the sequence
+U+0043 (LATIN CAPITAL LETTER C) U+0327 (COMBINING CEDILLA).
+
+For each character, there are two normal forms: normal form C and
+normal form D. Normal form D (NFD) is also known as canonical
+decomposition, and translates each character into its decomposed form.
+Normal form C (NFC) first applies a canonical decomposition, then
+composes pre-combined characters again.
+
+In addition to these two forms, there are two additional normal forms
+based on compatibility equivalence. In Unicode, certain characters are
+supported which normally would be unified with other characters. For
+example, U+2160 (ROMAN NUMERAL ONE) is really the same thing as U+0049
+(LATIN CAPITAL LETTER I). However, it is supported in Unicode for
+compatibility with existing character sets (e.g. gb2312).
+
+The normal form KD (NFKD) will apply the compatibility decomposition,
+i.e. replace all compatibility characters with their equivalents. The
+normal form KC (NFKC) first applies the compatibility decomposition,
+followed by the canonical composition.
+
+\versionadded{2.3}
+\end{funcdesc}
+
@@ -0,0 +1,68 @@
+from test.test_support import verbose, TestFailed, TestSkipped, verify
+import sys
+from unicodedata import normalize
+try:
+    data = open("NormalizationTest.txt","r").readlines()
+except IOError:
+    raise TestSkipped("NormalizationTest.txt not found, download from http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt")
+
+class RangeError:
+    pass
+
+def NFC(str):
+    return normalize("NFC", str)
+
+def NFKC(str):
+    return normalize("NFKC", str)
+
+def NFD(str):
+    return normalize("NFD", str)
+
+def NFKD(str):
+    return normalize("NFKD", str)
+
+def unistr(data):
+    data = [int(x, 16) for x in data.split(" ")]
+    for x in data:
+        if x > sys.maxunicode:
+            raise RangeError
+    return u"".join([unichr(x) for x in data])
+
+part1_data = {}
+for line in data:
+    if '#' in line:
+        line = line.split('#')[0]
+    line = line.strip()
+    if not line:
+        continue
+    if line.startswith("@Part"):
+        part = line
+        continue
+    try:
+        c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
+    except RangeError:
+        # Skip lines that use code points above sys.maxunicode
+        continue
+
+    if verbose:
+        print line
+    
+    # Check each column against the expected NFC/NFD/NFKC/NFKD results
+    verify(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
+    verify(c4 ==  NFC(c4) ==  NFC(c5), line)
+    verify(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
+    verify(c5 ==  NFD(c4) ==  NFD(c5), line)
+    verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5), line)
+    verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5), line)
+
+    # Remember Part 1 source characters; the final loop skips them
+    if part == "@Part1":
+        part1_data[c1] = 1
+
+# Characters not listed in Part 1 must be unchanged by all four forms
+for c in range(sys.maxunicode+1):
+    X = unichr(c)
+    if X in part1_data:
+        continue
+    assert X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c
+    
@@ -317,8 +317,8 @@ Extension modules
   available in source code, but not built automatically anymore, and
   is now named bsddb185.
 
-- unicodedata was updated to Unicode 3.2. In now also supports names
-  for Hangul syllables and CJK unified ideographs.
+- unicodedata was updated to Unicode 3.2. It supports normalization
+  and names for Hangul syllables and CJK unified ideographs.
 
 - resource.getrlimit() now returns longs instead of ints.