Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 931b8aa

Browse files
committed
#12753: Add support for Unicode name aliases and named sequences.
1 parent 3764a96 commit 931b8aa

10 files changed

Lines changed: 18125 additions & 17194 deletions

File tree

Doc/library/unicodedata.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ following functions:
2929
Look up character by name. If a character with the given name is found, return
3030
the corresponding character. If not found, :exc:`KeyError` is raised.
3131

32+
.. versionchanged:: 3.3
33+
Support for name aliases [#]_ and named sequences [#]_ has been added.
34+
3235

3336
.. function:: name(chr[, default])
3437

@@ -160,3 +163,9 @@ Examples:
160163
>>> unicodedata.bidirectional('\u0660') # 'A'rabic, 'N'umber
161164
'AN'
162165

166+
167+
.. rubric:: Footnotes
168+
169+
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
170+
171+
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NamedSequences.txt

Doc/reference/lexical_analysis.rst

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -492,13 +492,13 @@ Escape sequences only recognized in string literals are:
492492
+-----------------+---------------------------------+-------+
493493
| Escape Sequence | Meaning | Notes |
494494
+=================+=================================+=======+
495-
| ``\N{name}`` | Character named *name* in the | |
495+
| ``\N{name}`` | Character named *name* in the | \(4) |
496496
| | Unicode database | |
497497
+-----------------+---------------------------------+-------+
498-
| ``\uxxxx`` | Character with 16-bit hex value | \(4) |
498+
| ``\uxxxx`` | Character with 16-bit hex value | \(5) |
499499
| | *xxxx* | |
500500
+-----------------+---------------------------------+-------+
501-
| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(5) |
501+
| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(6) |
502502
| | *xxxxxxxx* | |
503503
+-----------------+---------------------------------+-------+
504504

@@ -516,10 +516,14 @@ Notes:
516516
with the given value.
517517

518518
(4)
519+
.. versionchanged:: 3.3
520+
Support for name aliases [#]_ has been added.
521+
522+
(5)
519523
Individual code units which form parts of a surrogate pair can be encoded using
520524
this escape sequence. Exactly four hex digits are required.
521525

522-
(5)
526+
(6)
523527
Any Unicode character can be encoded this way, but characters outside the Basic
524528
Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is
525529
compiled to use 16-bit code units (the default). Exactly eight hex digits
@@ -706,3 +710,8 @@ The following printing ASCII characters are not used in Python. Their
706710
occurrence outside string literals and comments is an unconditional error::
707711

708712
$ ? `
713+
714+
715+
.. rubric:: Footnotes
716+
717+
.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt

Doc/whatsnew/3.3.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,12 @@ Some smaller changes made to the core Python language are:
179179

180180
* Stub
181181

182+
Added support for Unicode name aliases and named sequences.
183+
Both :func:`unicodedata.lookup()` and ``'\N{...}'`` now resolve name aliases,
184+
and :func:`unicodedata.lookup()` resolves named sequences too.
185+
186+
(Contributed by Ezio Melotti in :issue:`12753`)
187+
182188

183189
New, Improved, and Deprecated Modules
184190
=====================================

Include/ucnhash.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@ typedef struct {
1919
success, zero if not. Does not set Python exceptions.
2020
If self is NULL, data come from the default version of the database.
2121
If it is not NULL, it should be a unicodedata.ucd_X_Y_Z object */
22-
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);
22+
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
23+
int with_alias_and_seq);
2324

2425
/* Get character code for a given name. Same error handling
2526
as for getname. */
26-
int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);
27+
int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code,
28+
int with_named_seq);
2729

2830
} _PyUnicode_Name_CAPI;
2931

Lib/test/test_ucn.py

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88
"""#"
99

1010
import unittest
11+
import unicodedata
1112

1213
from test import support
14+
from http.client import HTTPException
15+
from test.test_normalization import check_version
1316

1417
class UnicodeNamesTest(unittest.TestCase):
1518

@@ -59,8 +62,6 @@ def test_general(self):
5962
)
6063

6164
def test_ascii_letters(self):
62-
import unicodedata
63-
6465
for char in "".join(map(chr, range(ord("a"), ord("z")))):
6566
name = "LATIN SMALL LETTER %s" % char.upper()
6667
code = unicodedata.lookup(name)
@@ -81,7 +82,6 @@ def test_hangul_syllables(self):
8182
self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
8283
self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
8384

84-
import unicodedata
8585
self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
8686

8787
def test_cjk_unified_ideographs(self):
@@ -97,23 +97,97 @@ def test_cjk_unified_ideographs(self):
9797
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
9898

9999
def test_bmp_characters(self):
100-
import unicodedata
101-
count = 0
102100
for code in range(0x10000):
103101
char = chr(code)
104102
name = unicodedata.name(char, None)
105103
if name is not None:
106104
self.assertEqual(unicodedata.lookup(name), char)
107-
count += 1
108105

109106
def test_misc_symbols(self):
110107
self.checkletter("PILCROW SIGN", "\u00b6")
111108
self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
112109
self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
113110
self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
114111

112+
def test_aliases(self):
113+
# Check that the aliases defined in the NameAliases.txt file work.
114+
# This should be updated when new aliases are added or the file
115+
# should be downloaded and parsed instead. See #12753.
116+
aliases = [
117+
('LATIN CAPITAL LETTER GHA', 0x01A2),
118+
('LATIN SMALL LETTER GHA', 0x01A3),
119+
('KANNADA LETTER LLLA', 0x0CDE),
120+
('LAO LETTER FO FON', 0x0E9D),
121+
('LAO LETTER FO FAY', 0x0E9F),
122+
('LAO LETTER RO', 0x0EA3),
123+
('LAO LETTER LO', 0x0EA5),
124+
('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
125+
('YI SYLLABLE ITERATION MARK', 0xA015),
126+
('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
127+
('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
128+
]
129+
for alias, codepoint in aliases:
130+
self.checkletter(alias, chr(codepoint))
131+
name = unicodedata.name(chr(codepoint))
132+
self.assertNotEqual(name, alias)
133+
self.assertEqual(unicodedata.lookup(alias),
134+
unicodedata.lookup(name))
135+
with self.assertRaises(KeyError):
136+
unicodedata.ucd_3_2_0.lookup(alias)
137+
138+
def test_aliases_names_in_pua_range(self):
139+
# We are storing aliases in the PUA 15, but their names shouldn't leak
140+
for cp in range(0xf0000, 0xf0100):
141+
with self.assertRaises(ValueError) as cm:
142+
unicodedata.name(chr(cp))
143+
self.assertEqual(str(cm.exception), 'no such name')
144+
145+
def test_named_sequences_names_in_pua_range(self):
146+
# We are storing named seq in the PUA 15, but their names shouldn't leak
147+
for cp in range(0xf0100, 0xf0fff):
148+
with self.assertRaises(ValueError) as cm:
149+
unicodedata.name(chr(cp))
150+
self.assertEqual(str(cm.exception), 'no such name')
151+
152+
def test_named_sequences_sample(self):
153+
# Check a few named sequences. See #12753.
154+
sequences = [
155+
('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
156+
('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
157+
('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
158+
('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
159+
('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
160+
]
161+
for seqname, codepoints in sequences:
162+
self.assertEqual(unicodedata.lookup(seqname), codepoints)
163+
with self.assertRaises(SyntaxError):
164+
self.checkletter(seqname, None)
165+
with self.assertRaises(KeyError):
166+
unicodedata.ucd_3_2_0.lookup(seqname)
167+
168+
def test_named_sequences_full(self):
169+
# Check all the named sequences
170+
url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
171+
unicodedata.unidata_version)
172+
try:
173+
testdata = support.open_urlresource(url, encoding="utf-8",
174+
check=check_version)
175+
except (IOError, HTTPException):
176+
self.skipTest("Could not retrieve " + url)
177+
self.addCleanup(testdata.close)
178+
for line in testdata:
179+
line = line.strip()
180+
if not line or line.startswith('#'):
181+
continue
182+
seqname, codepoints = line.split(';')
183+
codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
184+
self.assertEqual(unicodedata.lookup(seqname), codepoints)
185+
with self.assertRaises(SyntaxError):
186+
self.checkletter(seqname, None)
187+
with self.assertRaises(KeyError):
188+
unicodedata.ucd_3_2_0.lookup(seqname)
189+
115190
def test_errors(self):
116-
import unicodedata
117191
self.assertRaises(TypeError, unicodedata.name)
118192
self.assertRaises(TypeError, unicodedata.name, 'xx')
119193
self.assertRaises(TypeError, unicodedata.lookup)

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #12753: Add support for Unicode name aliases and named sequences.
14+
Both :func:`unicodedata.lookup()` and ``'\N{...}'`` now resolve aliases,
15+
and :func:`unicodedata.lookup()` resolves named sequences too.
16+
1317
- Issue #12170: The count(), find(), rfind(), index() and rindex() methods
1418
of bytes and bytearray objects now accept an integer between 0 and 255
1519
as their first argument. Patch by Petri Lehtinen.

Modules/unicodedata.c

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -926,9 +926,19 @@ is_unified_ideograph(Py_UCS4 code)
926926
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
927927
}
928928

929+
/* macros used to determine if the given codepoint is in the PUA range that
930+
* we are using to store aliases and named sequences */
931+
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
932+
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
933+
(cp < named_sequences_end))
934+
929935
static int
930-
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
936+
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
937+
int with_alias_and_seq)
931938
{
939+
/* Find the name associated with the given codepoint.
940+
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
941+
* that we are using for aliases and named sequences. */
932942
int offset;
933943
int i;
934944
int word;
@@ -937,7 +947,14 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
937947
if (code >= 0x110000)
938948
return 0;
939949

950+
/* XXX should we just skip all the codepoints in the PUAs here? */
951+
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
952+
return 0;
953+
940954
if (self && UCD_Check(self)) {
955+
/* in 3.2.0 there are no aliases and named sequences */
956+
if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
957+
return 0;
941958
const change_record *old = get_old_record(self, code);
942959
if (old->category_changed == 0) {
943960
/* unassigned */
@@ -1022,7 +1039,7 @@ _cmpname(PyObject *self, int code, const char* name, int namelen)
10221039
/* check if code corresponds to the given name */
10231040
int i;
10241041
char buffer[NAME_MAXLEN];
1025-
if (!_getucname(self, code, buffer, sizeof(buffer)))
1042+
if (!_getucname(self, code, buffer, sizeof(buffer), 1))
10261043
return 0;
10271044
for (i = 0; i < namelen; i++) {
10281045
if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
@@ -1052,8 +1069,28 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
10521069
}
10531070

10541071
static int
1055-
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
1072+
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1073+
{
1074+
/* check if named sequences are allowed */
1075+
if (!with_named_seq && IS_NAMED_SEQ(cp))
1076+
return 0;
1077+
/* if the codepoint is in the PUA range that we use for aliases,
1078+
* convert it to obtain the right codepoint */
1079+
if (IS_ALIAS(cp))
1080+
*code = name_aliases[cp-aliases_start];
1081+
else
1082+
*code = cp;
1083+
return 1;
1084+
}
1085+
1086+
static int
1087+
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1088+
int with_named_seq)
10561089
{
1090+
/* Return the codepoint associated with the given name.
1091+
* Name aliases are resolved too (unless self != NULL (i.e. we are using
1092+
* 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1093+
* using for the named sequence, and the caller must then convert it. */
10571094
unsigned int h, v;
10581095
unsigned int mask = code_size-1;
10591096
unsigned int i, incr;
@@ -1109,10 +1146,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
11091146
v = code_hash[i];
11101147
if (!v)
11111148
return 0;
1112-
if (_cmpname(self, v, name, namelen)) {
1113-
*code = v;
1114-
return 1;
1115-
}
1149+
if (_cmpname(self, v, name, namelen))
1150+
return _check_alias_and_seq(v, code, with_named_seq);
11161151
incr = (h ^ (h >> 3)) & mask;
11171152
if (!incr)
11181153
incr = mask;
@@ -1121,10 +1156,8 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
11211156
v = code_hash[i];
11221157
if (!v)
11231158
return 0;
1124-
if (_cmpname(self, v, name, namelen)) {
1125-
*code = v;
1126-
return 1;
1127-
}
1159+
if (_cmpname(self, v, name, namelen))
1160+
return _check_alias_and_seq(v, code, with_named_seq);
11281161
incr = incr << 1;
11291162
if (incr > mask)
11301163
incr = incr ^ code_poly;
@@ -1162,7 +1195,7 @@ unicodedata_name(PyObject* self, PyObject* args)
11621195
if (c == (Py_UCS4)-1)
11631196
return NULL;
11641197

1165-
if (!_getucname(self, c, name, sizeof(name))) {
1198+
if (!_getucname(self, c, name, sizeof(name), 0)) {
11661199
if (defobj == NULL) {
11671200
PyErr_SetString(PyExc_ValueError, "no such name");
11681201
return NULL;
@@ -1190,15 +1223,22 @@ unicodedata_lookup(PyObject* self, PyObject* args)
11901223

11911224
char* name;
11921225
int namelen;
1226+
unsigned int index;
11931227
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
11941228
return NULL;
11951229

1196-
if (!_getcode(self, name, namelen, &code)) {
1197-
PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1198-
name);
1230+
if (!_getcode(self, name, namelen, &code, 1)) {
1231+
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
11991232
return NULL;
12001233
}
1201-
1234+
// check if code is in the PUA range that we use for named sequences
1235+
// and convert it
1236+
if (IS_NAMED_SEQ(code)) {
1237+
index = code-named_sequences_start;
1238+
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1239+
named_sequences[index].seq,
1240+
named_sequences[index].seqlen);
1241+
}
12021242
return PyUnicode_FromOrdinal(code);
12031243
}
12041244

0 commit comments

Comments
 (0)