Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Prev Previous commit
Next Next commit
Merge branch 'main' into wip/tangut-ideographs
  • Loading branch information
serhiy-storchaka committed Feb 13, 2026
commit 2e1560cb99d28f1f6f99a8a3baf8a8d67b88cfbd
24 changes: 19 additions & 5 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,6 @@ def test_method_checksum(self):

class BaseUnicodeFunctionsTest:

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'

@requires_resource('cpu')
def test_function_checksum(self):
db = self.db
data = []
Expand Down Expand Up @@ -184,6 +179,25 @@ def test_name_inverse_lookup(self):
if looked_name is not None:
self.assertEqual(self.db.lookup(looked_name), char)

def test_no_names_in_pua(self):
puas = [*range(0xe000, 0xf8ff),
*range(0xf0000, 0xfffff),
*range(0x100000, 0x10ffff)]
for i in puas:
char = chr(i)
self.assertRaises(ValueError, self.db.name, char)

def test_lookup_nonexistant(self):
# just make sure that lookup can fail
for nonexistent in [
"LATIN SMLL LETR A",
"OPEN HANDS SIGHS",
"DREGS",
"HANDBUG",
"MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
"???",
]:
self.assertRaises(KeyError, self.db.lookup, nonexistent)

def test_digit(self):
self.assertEqual(self.db.digit('A', None), None)
Expand Down
25 changes: 9 additions & 16 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -1392,11 +1392,11 @@ _getucname(PyObject *self,
return 1;
}

/* get offset into phrasebook */
offset = phrasebook_offset1[(code>>phrasebook_shift)];
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
(code&((1<<phrasebook_shift)-1))];
if (!offset)
/* get position of codepoint in order of names in the dawg */
offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
(code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
return 0;

assert(buflen >= 0);
Expand Down Expand Up @@ -1472,7 +1472,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
}

/* Check for CJK unified ideographs. */
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
unsigned int v;
v = 0;
Expand All @@ -1497,7 +1497,6 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
return 1;
}


/* Check for Tangut ideographs. */
if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
/* Five hexdigits must follow. */
Expand All @@ -1522,15 +1521,9 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
return 1;
}


/* the following is the same as python's dictionary lookup, with
only minor changes. see the makeunicodedata script for more
details */

h = (unsigned int) _gethash(name, namelen, code_magic);
i = (~h) & mask;
v = code_hash[i];
if (!v)
assert(namelen >= 0);
int position = _lookup_dawg_packed(name, Py_SAFE_DOWNCAST(namelen, int, unsigned int));
if (position < 0) {
return 0;
}
*code = dawg_pos_to_codepoint[position];
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.