From 9942ce65d7771117d7a542fb511496bfa45f6164 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 11 Dec 2022 19:33:23 -0500 Subject: [PATCH 1/5] Reduce hash collisions for code objects --- Lib/test/test_code.py | 26 +++++++++++++++++++++ Objects/codeobject.c | 53 +++++++++++++++++++++++++++---------------- 2 files changed, 59 insertions(+), 20 deletions(-) diff --git a/Lib/test/test_code.py b/Lib/test/test_code.py index 02ab8fbcdb0700..b13d5770abe8d2 100644 --- a/Lib/test/test_code.py +++ b/Lib/test/test_code.py @@ -465,6 +465,32 @@ def f(): self.assertNotEqual(code_b, code_d) self.assertNotEqual(code_c, code_d) + def test_code_hash_uses_firstlineno(self): + c1 = (lambda: 1).__code__ + c2 = (lambda: 1).__code__ + self.assertNotEqual(c1, c2) + self.assertNotEqual(hash(c1), hash(c2)) + c3 = c1.replace(co_firstlineno=17) + self.assertNotEqual(c1, c3) + self.assertNotEqual(hash(c1), hash(c3)) + + def test_code_hash_uses_order(self): + # Swapping posonlyargcount and kwonlyargcount should change the hash. + c = (lambda x, y, *, z=1, w=1: 1).__code__ + self.assertEqual(c.co_argcount, 2) + self.assertEqual(c.co_posonlyargcount, 0) + self.assertEqual(c.co_kwonlyargcount, 2) + swapped = c.replace(co_posonlyargcount=2, co_kwonlyargcount=0) + self.assertNotEqual(c, swapped) + self.assertNotEqual(hash(c), hash(swapped)) + + def test_code_hash_uses_bytecode(self): + c = (lambda x, y: x + y).__code__ + d = (lambda x, y: x * y).__code__ + c1 = c.replace(co_code=d.co_code) + self.assertNotEqual(c, c1) + self.assertNotEqual(hash(c), hash(c1)) + def isinterned(s): return s is sys.intern(('_' + s + '_')[1:-1]) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index c92c7deaf8086f..cbdfa73dade616 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -1834,28 +1834,41 @@ code_richcompare(PyObject *self, PyObject *other, int op) static Py_hash_t code_hash(PyCodeObject *co) { - Py_hash_t h, h0, h1, h2, h3; - h0 = PyObject_Hash(co->co_name); - if (h0 == -1) return -1; - h1 = PyObject_Hash(co->co_consts); - if (h1 == -1) return -1; - h2 = PyObject_Hash(co->co_names); - if (h2 == -1) return -1; - h3 = PyObject_Hash(co->co_localsplusnames); - if (h3 == -1) return -1; - Py_hash_t h4 = PyObject_Hash(co->co_linetable); - if (h4 == -1) { - return -1; + Py_uhash_t res = 20221211; + #define SCRAMBLE_IN(H) do { \ + res ^= (Py_uhash_t)(H); \ + res *= _PyHASH_MULTIPLIER; \ + } while (0) + #define SCRAMBLE_IN_OR_ERR(EXPR) do { \ + Py_hash_t h = (EXPR); \ + if (h == -1) { \ + return -1; \ + } \ + SCRAMBLE_IN(h); \ + } while (0) + + SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_name)); + SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_consts)); + SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_names)); + SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_localsplusnames)); + SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_linetable)); + SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_exceptiontable)); + SCRAMBLE_IN(co->co_argcount); + SCRAMBLE_IN(co->co_posonlyargcount); + SCRAMBLE_IN(co->co_kwonlyargcount); + SCRAMBLE_IN(co->co_flags); + SCRAMBLE_IN(co->co_firstlineno); + SCRAMBLE_IN(Py_SIZE(co)); + for (int i = 0; i < Py_SIZE(co); i++) { + _Py_CODEUNIT co_instr = _PyCode_CODE(co)[i]; + _Py_SET_OPCODE(co_instr, _PyOpcode_Deopt[_Py_OPCODE(co_instr)]); + SCRAMBLE_IN(co_instr); + i += _PyOpcode_Caches[_Py_OPCODE(co_instr)]; } - Py_hash_t h5 = PyObject_Hash(co->co_exceptiontable); - if (h5 == -1) { - return -1; + if (res == -1) { + return -2; } - h = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ - co->co_argcount ^ co->co_posonlyargcount ^ co->co_kwonlyargcount ^ - co->co_flags; - if (h == -1) h = -2; - return h; + return res; } From d54249cc8f63d3bfe3a7bd037e4214a4dc489500 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Mon, 12 Dec 2022 00:59:14 +0000 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst b/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst new file mode 100644 index 00000000000000..e7c7ed2fad0e35 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst @@ -0,0 +1 @@ +Improved the hashing algorithm for code objects, mitigating some hash collisions. From 3c38f5a0efb2b4dce2f406229cd4a1b7ba2b7c65 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 11 Dec 2022 20:21:20 -0500 Subject: [PATCH 3/5] don't compare unsigned to signed --- Objects/codeobject.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index cbdfa73dade616..fbe313589b3c9f 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -1834,10 +1834,10 @@ code_richcompare(PyObject *self, PyObject *other, int op) static Py_hash_t code_hash(PyCodeObject *co) { - Py_uhash_t res = 20221211; - #define SCRAMBLE_IN(H) do { \ - res ^= (Py_uhash_t)(H); \ - res *= _PyHASH_MULTIPLIER; \ + Py_uhash_t uhash = 20221211; + #define SCRAMBLE_IN(H) do { \ + uhash ^= (Py_uhash_t)(H); \ + uhash *= _PyHASH_MULTIPLIER; \ } while (0) #define SCRAMBLE_IN_OR_ERR(EXPR) do { \ Py_hash_t h = (EXPR); \ @@ -1865,10 +1865,10 @@ code_hash(PyCodeObject *co) SCRAMBLE_IN(co_instr); i += _PyOpcode_Caches[_Py_OPCODE(co_instr)]; } - if (res == -1) { + if ((Py_hash_t)uhash == -1) { return -2; } - return res; + return (Py_hash_t)uhash; } From dbb5c77578f361e2dc77847a9eed95bf87307e0d Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 11 Dec 2022 23:35:27 -0500 Subject: [PATCH 4/5] Disregard byte order --- Objects/codeobject.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index fbe313589b3c9f..6c33f81878534f 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -1860,10 +1860,10 @@ code_hash(PyCodeObject *co) SCRAMBLE_IN(co->co_firstlineno); SCRAMBLE_IN(Py_SIZE(co)); for (int i = 0; i < Py_SIZE(co); i++) { - _Py_CODEUNIT co_instr = _PyCode_CODE(co)[i]; - _Py_SET_OPCODE(co_instr, _PyOpcode_Deopt[_Py_OPCODE(co_instr)]); - SCRAMBLE_IN(co_instr); - i += _PyOpcode_Caches[_Py_OPCODE(co_instr)]; + int deop = _PyOpcode_Deopt[_Py_OPCODE(_PyCode_CODE(co)[i])]; + SCRAMBLE_IN(deop); + SCRAMBLE_IN(_Py_OPARG(_PyCode_CODE(co)[i])); + i += _PyOpcode_Caches[deop]; } if ((Py_hash_t)uhash == -1) { return -2; From 2014fb13e87b9885e650ab0b5d22af28ddf62237 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 18 Dec 2022 23:58:15 -0500 Subject: [PATCH 5/5] SCRAMBLE_IN_HASH macro --- Objects/codeobject.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 9c36d7972acca3..e174c6fee9cc24 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -1847,20 +1847,20 @@ code_hash(PyCodeObject *co) uhash ^= (Py_uhash_t)(H); \ uhash *= _PyHASH_MULTIPLIER; \ } while (0) - #define SCRAMBLE_IN_OR_ERR(EXPR) do { \ - Py_hash_t h = (EXPR); \ - if (h == -1) { \ - return -1; \ - } \ - SCRAMBLE_IN(h); \ + #define SCRAMBLE_IN_HASH(EXPR) do { \ + Py_hash_t h = PyObject_Hash(EXPR); \ + if (h == -1) { \ + return -1; \ + } \ + SCRAMBLE_IN(h); \ } while (0) - SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_name)); - SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_consts)); - SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_names)); - SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_localsplusnames)); - SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_linetable)); - SCRAMBLE_IN_OR_ERR(PyObject_Hash(co->co_exceptiontable)); + SCRAMBLE_IN_HASH(co->co_name); + SCRAMBLE_IN_HASH(co->co_consts); + SCRAMBLE_IN_HASH(co->co_names); + SCRAMBLE_IN_HASH(co->co_localsplusnames); + SCRAMBLE_IN_HASH(co->co_linetable); + SCRAMBLE_IN_HASH(co->co_exceptiontable); SCRAMBLE_IN(co->co_argcount); SCRAMBLE_IN(co->co_posonlyargcount); SCRAMBLE_IN(co->co_kwonlyargcount);