From b493e3578c8e367f854ebba575a98e8ab813fb23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:03:28 +0100 Subject: [PATCH 1/5] fix UBSan failures for `xmlparseobject` --- Modules/pyexpat.c | 116 +++++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 9931ca2a8d4749..c13aae550e92b4 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -92,6 +92,8 @@ typedef struct { PyObject **handlers; } xmlparseobject; +#define xmlparseobject_CAST(op) ((xmlparseobject *)(op)) + #include "clinic/pyexpat.c.h" #define CHARACTER_DATA_BUFFER_SIZE 8192 @@ -1240,30 +1242,34 @@ newxmlparseobject(pyexpat_state *state, const char *encoding, } static int -xmlparse_traverse(xmlparseobject *op, visitproc visit, void *arg) +xmlparse_traverse(PyObject *op, visitproc visit, void *arg) { - for (int i = 0; handler_info[i].name != NULL; i++) { - Py_VISIT(op->handlers[i]); + xmlparseobject *self = xmlparseobject_CAST(op); + for (size_t i = 0; handler_info[i].name != NULL; i++) { + Py_VISIT(self->handlers[i]); } Py_VISIT(Py_TYPE(op)); return 0; } static int -xmlparse_clear(xmlparseobject *op) +xmlparse_clear(PyObject *op) { - clear_handlers(op, 0); - Py_CLEAR(op->intern); + xmlparseobject *self = xmlparseobject_CAST(op); + clear_handlers(self, 0); + Py_CLEAR(self->intern); return 0; } static void -xmlparse_dealloc(xmlparseobject *self) +xmlparse_dealloc(PyObject *op) { + xmlparseobject *self = xmlparseobject_CAST(op); PyObject_GC_UnTrack(self); - (void)xmlparse_clear(self); - if (self->itself != NULL) + (void)xmlparse_clear(op); + if (self->itself != NULL) { XML_ParserFree(self->itself); + } self->itself = NULL; if (self->handlers != NULL) { @@ -1281,19 +1287,24 @@ xmlparse_dealloc(xmlparseobject *self) static PyObject * -xmlparse_handler_getter(xmlparseobject *self, struct HandlerInfo *hi) +xmlparse_handler_getter(PyObject *op, void *closure) { + xmlparseobject *self = xmlparseobject_CAST(op); + struct HandlerInfo *hi = (struct HandlerInfo *)closure; assert((hi - handler_info) < (Py_ssize_t)Py_ARRAY_LENGTH(handler_info)); int handlernum = (int)(hi - handler_info); PyObject *result = self->handlers[handlernum]; - if (result == NULL) + if (result == NULL) { result = Py_None; + } return Py_NewRef(result); } static int -xmlparse_handler_setter(xmlparseobject *self, PyObject *v, struct HandlerInfo *hi) +xmlparse_handler_setter(PyObject *op, PyObject *v, void *closure) { + xmlparseobject *self = xmlparseobject_CAST(op); + struct HandlerInfo *hi = (struct HandlerInfo *)closure; assert((hi - handler_info) < (Py_ssize_t)Py_ARRAY_LENGTH(handler_info)); int handlernum = (int)(hi - handler_info); if (v == NULL) { @@ -1323,8 +1334,9 @@ xmlparse_handler_setter(xmlparseobject *self, PyObject *v, struct HandlerInfo *h otherwise would, but that's really an odd case. A more elaborate system of handlers and state could remove the C handler more effectively. */ - if (handlernum == CharacterData && self->in_callback) + if (handlernum == CharacterData && self->in_callback) { c_handler = noop_character_data_handler; + } v = NULL; } else if (v != NULL) { @@ -1336,11 +1348,12 @@ xmlparse_handler_setter(xmlparseobject *self, PyObject *v, struct HandlerInfo *h return 0; } -#define INT_GETTER(name) \ - static PyObject * \ - xmlparse_##name##_getter(xmlparseobject *self, void *closure) \ - { \ - return PyLong_FromLong((long) XML_Get##name(self->itself)); \ +#define INT_GETTER(name) \ + static PyObject * \ + xmlparse_##name##_getter(PyObject *op, void *Py_UNUSED(closure)) \ + { \ + xmlparseobject *self = xmlparseobject_CAST(op); \ + return PyLong_FromLong((long)XML_Get##name(self->itself)); \ } INT_GETTER(ErrorCode) INT_GETTER(ErrorLineNumber) @@ -1353,21 +1366,24 @@ INT_GETTER(CurrentByteIndex) #undef INT_GETTER static PyObject * -xmlparse_buffer_text_getter(xmlparseobject *self, void *closure) +xmlparse_buffer_text_getter(PyObject *op, void *Py_UNUSED(closure)) { + xmlparseobject *self = xmlparseobject_CAST(op); return PyBool_FromLong(self->buffer != NULL); } static int -xmlparse_buffer_text_setter(xmlparseobject *self, PyObject *v, void *closure) +xmlparse_buffer_text_setter(PyObject *op, PyObject *v, void *Py_UNUSED(closure)) { + xmlparseobject *self = xmlparseobject_CAST(op); if (v == NULL) { PyErr_SetString(PyExc_RuntimeError, "Cannot delete attribute"); return -1; } int b = PyObject_IsTrue(v); - if (b < 0) + if (b < 0) { return -1; + } if (b) { if (self->buffer == NULL) { self->buffer = PyMem_Malloc(self->buffer_size); @@ -1379,8 +1395,9 @@ xmlparse_buffer_text_setter(xmlparseobject *self, PyObject *v, void *closure) } } else if (self->buffer != NULL) { - if (flush_character_buffer(self) < 0) + if (flush_character_buffer(self) < 0) { return -1; + } PyMem_Free(self->buffer); self->buffer = NULL; } @@ -1388,14 +1405,16 @@ xmlparse_buffer_text_setter(xmlparseobject *self, PyObject *v, void *closure) } static PyObject * -xmlparse_buffer_size_getter(xmlparseobject *self, void *closure) +xmlparse_buffer_size_getter(PyObject *op, void *Py_UNUSED(closure)) { - return PyLong_FromLong((long) self->buffer_size); + xmlparseobject *self = xmlparseobject_CAST(op); + return PyLong_FromLong(self->buffer_size); } static int -xmlparse_buffer_size_setter(xmlparseobject *self, PyObject *v, void *closure) +xmlparse_buffer_size_setter(PyObject *op, PyObject *v, void *Py_UNUSED(closure)) { + xmlparseobject *self = xmlparseobject_CAST(op); if (v == NULL) { PyErr_SetString(PyExc_RuntimeError, "Cannot delete attribute"); return -1; @@ -1408,8 +1427,9 @@ xmlparse_buffer_size_setter(xmlparseobject *self, PyObject *v, void *closure) new_buffer_size = PyLong_AsLong(v); if (new_buffer_size <= 0) { - if (!PyErr_Occurred()) + if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, "buffer_size must be greater than zero"); + } return -1; } @@ -1444,68 +1464,78 @@ xmlparse_buffer_size_setter(xmlparseobject *self, PyObject *v, void *closure) } static PyObject * -xmlparse_buffer_used_getter(xmlparseobject *self, void *closure) +xmlparse_buffer_used_getter(PyObject *op, void *Py_UNUSED(closure)) { - return PyLong_FromLong((long) self->buffer_used); + xmlparseobject *self = xmlparseobject_CAST(op); + return PyLong_FromLong(self->buffer_used); } static PyObject * -xmlparse_namespace_prefixes_getter(xmlparseobject *self, void *closure) +xmlparse_namespace_prefixes_getter(PyObject *op, void *Py_UNUSED(closure)) { + xmlparseobject *self = xmlparseobject_CAST(op); return PyBool_FromLong(self->ns_prefixes); } static int -xmlparse_namespace_prefixes_setter(xmlparseobject *self, PyObject *v, void *closure) +xmlparse_namespace_prefixes_setter(PyObject *op, PyObject *v, void *Py_UNUSED(closure)) { + xmlparseobject *self = xmlparseobject_CAST(op); if (v == NULL) { PyErr_SetString(PyExc_RuntimeError, "Cannot delete attribute"); return -1; } int b = PyObject_IsTrue(v); - if (b < 0) + if (b < 0) { return -1; + } self->ns_prefixes = b; XML_SetReturnNSTriplet(self->itself, self->ns_prefixes); return 0; } static PyObject * -xmlparse_ordered_attributes_getter(xmlparseobject *self, void *closure) +xmlparse_ordered_attributes_getter(PyObject *op, void *Py_UNUSED(closure)) { + xmlparseobject *self = xmlparseobject_CAST(op); return PyBool_FromLong(self->ordered_attributes); } static int -xmlparse_ordered_attributes_setter(xmlparseobject *self, PyObject *v, void *closure) +xmlparse_ordered_attributes_setter(PyObject *op, PyObject *v, void *Py_UNUSED(closure)) { + xmlparseobject *self = xmlparseobject_CAST(op); if (v == NULL) { PyErr_SetString(PyExc_RuntimeError, "Cannot delete attribute"); return -1; } int b = PyObject_IsTrue(v); - if (b < 0) + if (b < 0) { return -1; + } self->ordered_attributes = b; return 0; } static PyObject * -xmlparse_specified_attributes_getter(xmlparseobject *self, void *closure) +xmlparse_specified_attributes_getter(PyObject *op, void *Py_UNUSED(closure)) { - return PyBool_FromLong((long) self->specified_attributes); + xmlparseobject *self = xmlparseobject_CAST(op); + return PyBool_FromLong(self->specified_attributes); } static int -xmlparse_specified_attributes_setter(xmlparseobject *self, PyObject *v, void *closure) +xmlparse_specified_attributes_setter(PyObject *op, PyObject *v, void *Py_UNUSED(closure)) { + xmlparseobject *self = xmlparseobject_CAST(op); if (v == NULL) { PyErr_SetString(PyExc_RuntimeError, "Cannot delete attribute"); return -1; } int b = PyObject_IsTrue(v); - if (b < 0) + if (b < 0) { return -1; + } self->specified_attributes = b; return 0; } @@ -1516,10 +1546,10 @@ static PyMemberDef xmlparse_members[] = { }; #define XMLPARSE_GETTER_DEF(name) \ - {#name, (getter)xmlparse_##name##_getter, NULL, NULL}, + {#name, xmlparse_##name##_getter, NULL, NULL}, #define XMLPARSE_GETTER_SETTER_DEF(name) \ - {#name, (getter)xmlparse_##name##_getter, \ - (setter)xmlparse_##name##_setter, NULL}, + {#name, xmlparse_##name##_getter, \ + xmlparse_##name##_setter, NULL}, static PyGetSetDef xmlparse_getsetlist[] = { XMLPARSE_GETTER_DEF(ErrorCode) @@ -1655,8 +1685,8 @@ static int init_handler_descrs(pyexpat_state *state) for (i = 0; handler_info[i].name != NULL; i++) { struct HandlerInfo *hi = &handler_info[i]; hi->getset.name = hi->name; - hi->getset.get = (getter)xmlparse_handler_getter; - hi->getset.set = (setter)xmlparse_handler_setter; + hi->getset.get = xmlparse_handler_getter; + hi->getset.set = xmlparse_handler_setter; hi->getset.closure = &handler_info[i]; PyObject *descr = PyDescr_NewGetSet(state->xml_parse_type, &hi->getset); From 3aa3f987e6313c6819567c2df1a86ad89d10b76a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:05:22 +0100 Subject: [PATCH 2/5] suppress unused return values --- Modules/pyexpat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index c13aae550e92b4..40e69ed2adb687 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -2149,7 +2149,7 @@ pyexpat_clear(PyObject *module) static void pyexpat_free(void *module) { - pyexpat_clear((PyObject *)module); + (void)pyexpat_clear((PyObject *)module); } static PyModuleDef_Slot pyexpat_slots[] = { From 579fc9e495dfff1e2880a1f5db172b8125281d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:36:26 +0100 Subject: [PATCH 3/5] fix UBSan failures for XML handler setters --- Modules/pyexpat.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 40e69ed2adb687..e1e2f1ffb80a6f 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -98,18 +98,28 @@ typedef struct { #define CHARACTER_DATA_BUFFER_SIZE 8192 -typedef void (*xmlhandlersetter)(XML_Parser self, void *meth); -typedef void* xmlhandler; +typedef const void* xmlhandler; +typedef void (*xmlhandlersetter)(XML_Parser self, xmlhandler handler); struct HandlerInfo { const char *name; - xmlhandlersetter setter; + // To avoid UBSan failures, we cannot use the 'xmlhandlersetter' type, + // as the signature of the setters will not be compatible. However, we + // can safely convert the pointers since a setter function will be passed + // a correct handler. + const void *setter; xmlhandler handler; PyGetSetDef getset; }; static struct HandlerInfo handler_info[64]; +#define CALL_XML_HANDLER_SETTER(HANDLER_INFO, XML_PARSER, XML_HANDLER) \ + do { \ + xmlhandlersetter setter = (xmlhandlersetter)(HANDLER_INFO).setter; \ + setter((XML_PARSER), (XML_HANDLER)); \ + } while (0) + /* Set an integer attribute on the error object; return true on success, * false on an exception. */ @@ -981,8 +991,6 @@ pyexpat_xmlparser_ExternalEntityParserCreate_impl(xmlparseobject *self, /*[clinic end generated code: output=01d4472b49cb3f92 input=ec70c6b9e6e9619a]*/ { xmlparseobject *new_parser; - int i; - pyexpat_state *state = PyType_GetModuleState(cls); new_parser = PyObject_GC_New(xmlparseobject, state->xml_parse_type); @@ -1017,6 +1025,7 @@ pyexpat_xmlparser_ExternalEntityParserCreate_impl(xmlparseobject *self, XML_SetUserData(new_parser->itself, (void *)new_parser); /* allocate and clear handlers first */ + size_t i; for (i = 0; handler_info[i].name != NULL; i++) /* do nothing */; @@ -1028,12 +1037,12 @@ pyexpat_xmlparser_ExternalEntityParserCreate_impl(xmlparseobject *self, clear_handlers(new_parser, 1); /* then copy handlers from self */ - for (i = 0; handler_info[i].name != NULL; i++) { + for (size_t i = 0; handler_info[i].name != NULL; i++) { PyObject *handler = self->handlers[i]; if (handler != NULL) { new_parser->handlers[i] = Py_NewRef(handler); - handler_info[i].setter(new_parser->itself, - handler_info[i].handler); + struct HandlerInfo info = handler_info[i]; + CALL_XML_HANDLER_SETTER(info, new_parser->itself, info.handler); } } @@ -1344,7 +1353,7 @@ xmlparse_handler_setter(PyObject *op, PyObject *v, void *closure) c_handler = handler_info[handlernum].handler; } Py_XSETREF(self->handlers[handlernum], v); - handler_info[handlernum].setter(self->itself, c_handler); + CALL_XML_HANDLER_SETTER(handler_info[handlernum], self->itself, c_handler); return 0; } @@ -2180,14 +2189,13 @@ PyInit_pyexpat(void) static void clear_handlers(xmlparseobject *self, int initial) { - int i = 0; - - for (; handler_info[i].name != NULL; i++) { - if (initial) + for (size_t i = 0; handler_info[i].name != NULL; i++) { + if (initial) { self->handlers[i] = NULL; + } else { Py_CLEAR(self->handlers[i]); - handler_info[i].setter(self->itself, NULL); + CALL_XML_HANDLER_SETTER(handler_info[i], self->itself, NULL); } } } @@ -2195,7 +2203,7 @@ clear_handlers(xmlparseobject *self, int initial) static struct HandlerInfo handler_info[] = { #define HANDLER_INFO(name) \ - {#name, (xmlhandlersetter)XML_Set##name, (xmlhandler)my_##name}, + {#name, XML_Set##name, my_##name}, HANDLER_INFO(StartElementHandler) HANDLER_INFO(EndElementHandler) From 53dc42b56eefabc79dd737f0291c03254d74b1f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 6 Feb 2025 12:28:59 +0100 Subject: [PATCH 4/5] rewrite `RC_HANDLER` with better semantics --- Modules/pyexpat.c | 92 ++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index e1e2f1ffb80a6f..ba97aa87657f43 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -324,7 +324,7 @@ flush_character_buffer(xmlparseobject *self) static void my_CharacterDataHandler(void *userData, const XML_Char *data, int len) { - xmlparseobject *self = (xmlparseobject *) userData; + xmlparseobject *self = xmlparseobject_CAST(userData); if (PyErr_Occurred()) return; @@ -357,7 +357,7 @@ static void my_StartElementHandler(void *userData, const XML_Char *name, const XML_Char *atts[]) { - xmlparseobject *self = (xmlparseobject *)userData; + xmlparseobject *self = xmlparseobject_CAST(userData); if (have_handler(self, StartElement)) { PyObject *container, *rv, *args; @@ -442,45 +442,57 @@ my_StartElementHandler(void *userData, } } -#define RC_HANDLER(RC, NAME, PARAMS, INIT, PARAM_FORMAT, CONVERSION, \ - RETURN, GETUSERDATA) \ -static RC \ -my_##NAME##Handler PARAMS {\ - xmlparseobject *self = GETUSERDATA ; \ - PyObject *args = NULL; \ - PyObject *rv = NULL; \ - INIT \ -\ - if (have_handler(self, NAME)) { \ - if (PyErr_Occurred()) \ - return RETURN; \ - if (flush_character_buffer(self) < 0) \ - return RETURN; \ - args = Py_BuildValue PARAM_FORMAT ;\ - if (!args) { flag_error(self); return RETURN;} \ - self->in_callback = 1; \ - rv = call_with_frame(#NAME,__LINE__, \ - self->handlers[NAME], args, self); \ - self->in_callback = 0; \ - Py_DECREF(args); \ - if (rv == NULL) { \ - flag_error(self); \ - return RETURN; \ - } \ - CONVERSION \ - Py_DECREF(rv); \ - } \ - return RETURN; \ +#define RC_HANDLER(RETURN_TYPE, NAME, PARAMS, \ + INIT, PARSE_FORMAT, CONVERSION, \ + RETURN_VARIABLE, GETUSERDATA) \ +static RETURN_TYPE \ +my_ ## NAME ## Handler PARAMS { \ + xmlparseobject *self = GETUSERDATA; \ + INIT \ + if (!have_handler(self, NAME)) { \ + return RETURN_VARIABLE; \ + } \ + if (PyErr_Occurred()) { \ + return RETURN_VARIABLE; \ + } \ + if (flush_character_buffer(self) < 0) { \ + return RETURN_VARIABLE; \ + } \ + PyObject *args = Py_BuildValue PARSE_FORMAT; \ + if (args == NULL) { \ + flag_error(self); \ + return RETURN_VARIABLE; \ + } \ + self->in_callback = 1; \ + PyObject *rv = call_with_frame( \ + #NAME, __LINE__, \ + self->handlers[NAME], args, self); \ + self->in_callback = 0; \ + Py_DECREF(args); \ + if (rv == NULL) { \ + flag_error(self); \ + return RETURN_VARIABLE; \ + } \ + CONVERSION \ + Py_DECREF(rv); \ + return RETURN_VARIABLE; \ } -#define VOID_HANDLER(NAME, PARAMS, PARAM_FORMAT) \ - RC_HANDLER(void, NAME, PARAMS, ;, PARAM_FORMAT, ;, ;,\ - (xmlparseobject *)userData) - -#define INT_HANDLER(NAME, PARAMS, PARAM_FORMAT)\ - RC_HANDLER(int, NAME, PARAMS, int rc=0;, PARAM_FORMAT, \ - rc = PyLong_AsLong(rv);, rc, \ - (xmlparseobject *)userData) +#define VOID_HANDLER(NAME, PARAMS, PARAM_FORMAT) \ + RC_HANDLER( \ + void, NAME, PARAMS, \ + /* no init */, PARAM_FORMAT, \ + /* no conversion */, /* no return var */, \ + xmlparseobject_CAST(userData) \ + ) + +#define INT_HANDLER(NAME, PARAMS, PARAM_FORMAT) \ + RC_HANDLER( \ + int, NAME, PARAMS, \ + int rc = 0;, PARAM_FORMAT, \ + rc = PyLong_AsLong(rv);, rc, \ + xmlparseobject_CAST(userData) \ + ) VOID_HANDLER(EndElement, (void *userData, const XML_Char *name), @@ -561,7 +573,7 @@ my_ElementDeclHandler(void *userData, const XML_Char *name, XML_Content *model) { - xmlparseobject *self = (xmlparseobject *)userData; + xmlparseobject *self = xmlparseobject_CAST(userData); PyObject *args = NULL; if (have_handler(self, ElementDecl)) { From 46ccb79a71b392b705b1ac344294b029ee73af44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 22 Feb 2025 10:42:45 +0100 Subject: [PATCH 5/5] address Petr's review --- Modules/pyexpat.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 1a384af25dbc9f..1ba644d3396f7f 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -98,16 +98,12 @@ typedef struct { #define CHARACTER_DATA_BUFFER_SIZE 8192 -typedef const void* xmlhandler; +typedef const void *xmlhandler; typedef void (*xmlhandlersetter)(XML_Parser self, xmlhandler handler); struct HandlerInfo { const char *name; - // To avoid UBSan failures, we cannot use the 'xmlhandlersetter' type, - // as the signature of the setters will not be compatible. However, we - // can safely convert the pointers since a setter function will be passed - // a correct handler. - const void *setter; + xmlhandlersetter setter; xmlhandler handler; PyGetSetDef getset; }; @@ -2215,8 +2211,11 @@ clear_handlers(xmlparseobject *self, int initial) static struct HandlerInfo handler_info[] = { + // The cast to `xmlhandlersetter` is needed as the signature of XML + // handler functions is not compatible with `xmlhandlersetter` since + // their second parameter is narrower than a `const void *`. #define HANDLER_INFO(name) \ - {#name, XML_Set##name, my_##name}, + {#name, (xmlhandlersetter)XML_Set##name, my_##name}, HANDLER_INFO(StartElementHandler) HANDLER_INFO(EndElementHandler)