Merge branch 'main' into wip/tangut-ideographs

python · serhiy-storchaka · Feb 16, 2026 · Jul 26, 2023 · Jul 26, 2023 · Jul 26, 2023
commit 2e1560cb99d28f1f6f99a8a3baf8a8d67b88cfbd
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -105,11 +105,6 @@ def test_method_checksum(self):
 
 class BaseUnicodeFunctionsTest:
 
-    # Update this if the database changes. Make sure to do a full rebuild
-    # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'
-
-    @requires_resource('cpu')
     def test_function_checksum(self):
         db = self.db
         data = []
@@ -184,6 +179,25 @@ def test_name_inverse_lookup(self):
             if looked_name is not None:
                 self.assertEqual(self.db.lookup(looked_name), char)
 
+    def test_no_names_in_pua(self):
+        # Private-use blocks end at U+F8FF, U+FFFFD and U+10FFFD inclusive.
+        puas = [*range(0xe000, 0xf8ff + 1),
+                *range(0xf0000, 0xffffd + 1),
+                *range(0x100000, 0x10fffd + 1)]
+        for i in puas:
+            self.assertRaises(ValueError, self.db.name, chr(i))
+
+    def test_lookup_nonexistant(self):
+        # Sanity check: lookup() raises KeyError for names it does not know.
+        for bogus_name in (
+            "LATIN SMLL LETR A",
+            "OPEN HANDS SIGHS",
+            "DREGS",
+            "HANDBUG",
+            "MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
+            "???",
+        ):
+            self.assertRaises(KeyError, self.db.lookup, bogus_name)
 
     def test_digit(self):
         self.assertEqual(self.db.digit('A', None), None)

diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
@@ -1392,11 +1392,11 @@ _getucname(PyObject *self,
         return 1;
     }
 
-    /* get offset into phrasebook */
-    offset = phrasebook_offset1[(code>>phrasebook_shift)];
-    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
-                               (code&((1<<phrasebook_shift)-1))];
-    if (!offset)
+    /* get position of codepoint in order of names in the dawg */
+    offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
+    offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
+                               (code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
+    if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
         return 0;
 
     assert(buflen >= 0);
@@ -1472,7 +1472,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
     }
 
     /* Check for CJK unified ideographs. */
-    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
+    if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
         /* Four or five hexdigits must follow. */
         unsigned int v;
         v = 0;
@@ -1497,7 +1497,6 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
         return 1;
     }
 
-
     /* Check for Tangut ideographs. */
     if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
         /* Five hexdigits must follow. */
@@ -1522,15 +1521,9 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
         return 1;
     }
 
-
-    /* the following is the same as python's dictionary lookup, with
-       only minor changes.  see the makeunicodedata script for more
-       details */
-
-    h = (unsigned int) _gethash(name, namelen, code_magic);
-    i = (~h) & mask;
-    v = code_hash[i];
-    if (!v)
+    assert(namelen >= 0);
+    int position = _lookup_dawg_packed(name, Py_SAFE_DOWNCAST(namelen, int, unsigned int));
+    if (position < 0) {
         return 0;
     }
     *code = dawg_pos_to_codepoint[position];