python · encukou · Dec 4, 2024 · Aug 26, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/Doc/c-api/exceptions.rst b/Doc/c-api/exceptions.rst
@@ -853,12 +853,23 @@ The following functions are used to create and modify Unicode exceptions from C.
    *\*start*.  *start* must not be ``NULL``.  Return ``0`` on success, ``-1`` on
    failure.
 
+   If the :attr:`UnicodeError.object` is an empty sequence, the resulting
+   *start* is ``0``. Otherwise, it is clipped to ``[0, len(object) - 1]``.
+
+   .. seealso:: :attr:`UnicodeError.start`
+
 .. c:function:: int PyUnicodeDecodeError_SetStart(PyObject *exc, Py_ssize_t start)
                 int PyUnicodeEncodeError_SetStart(PyObject *exc, Py_ssize_t start)
                 int PyUnicodeTranslateError_SetStart(PyObject *exc, Py_ssize_t start)
 
-   Set the *start* attribute of the given exception object to *start*.  Return
-   ``0`` on success, ``-1`` on failure.
+   Set the *start* attribute of the given exception object to *start*.
+   Return ``0`` on success, ``-1`` on failure.
+
+   .. note::
+
+      While passing a negative *start* does not raise an exception,
+      the corresponding getters will not consider it as a relative
+      offset.
 
 .. c:function:: int PyUnicodeDecodeError_GetEnd(PyObject *exc, Py_ssize_t *end)
                 int PyUnicodeEncodeError_GetEnd(PyObject *exc, Py_ssize_t *end)
@@ -868,13 +879,18 @@ The following functions are used to create and modify Unicode exceptions from C.
    *\*end*.  *end* must not be ``NULL``.  Return ``0`` on success, ``-1`` on
    failure.
 
+   If the :attr:`UnicodeError.object` is an empty sequence, the resulting
+   *end* is ``0``. Otherwise, it is clipped to ``[1, len(object)]``.
+
 .. c:function:: int PyUnicodeDecodeError_SetEnd(PyObject *exc, Py_ssize_t end)
                 int PyUnicodeEncodeError_SetEnd(PyObject *exc, Py_ssize_t end)
                 int PyUnicodeTranslateError_SetEnd(PyObject *exc, Py_ssize_t end)
 
    Set the *end* attribute of the given exception object to *end*.  Return ``0``
    on success, ``-1`` on failure.
 
+   .. seealso:: :attr:`UnicodeError.end`
+
 .. c:function:: PyObject* PyUnicodeDecodeError_GetReason(PyObject *exc)
                 PyObject* PyUnicodeEncodeError_GetReason(PyObject *exc)
                 PyObject* PyUnicodeTranslateError_GetReason(PyObject *exc)

diff --git a/Doc/library/exceptions.rst b/Doc/library/exceptions.rst
@@ -644,10 +644,16 @@ The following exceptions are the exceptions that are usually raised.
 
        The first index of invalid data in :attr:`object`.
 
+       This value should not be negative, as it is interpreted as an
+       absolute offset, but this constraint is not enforced at runtime.
+
    .. attribute:: end
 
        The index after the last invalid data in :attr:`object`.
 
+       This value should not be negative, as it is interpreted as an
+       absolute offset, but this constraint is not enforced at runtime.
+
 
 .. exception:: UnicodeEncodeError
 

diff --git a/Lib/test/test_capi/test_exceptions.py b/Lib/test/test_capi/test_exceptions.py
@@ -415,6 +415,156 @@ def test_err_formatunraisable(self):
         # CRASHES formatunraisable(NULL, NULL)
 
 
+class TestUnicodeTranslateError(UnicodeTranslateError):
+    # UnicodeTranslateError takes 4 arguments instead of 5, so this
+    # subclass accepts (and discards) a leading 'encoding' argument to match
+    # the UnicodeEncodeError and UnicodeDecodeError constructor signature.
+    def __init__(self, encoding, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+
+class TestUnicodeError(unittest.TestCase):
+
+    def _check_no_crash(self, exc):
+        # ensure that the __str__() method does not crash
+        _ = str(exc)
+
+    def test_unicode_encode_error_get_start(self):
+        get_start = _testcapi.unicode_encode_get_start
+        self._test_unicode_error_get_start('x', UnicodeEncodeError, get_start)
+
+    def test_unicode_decode_error_get_start(self):
+        get_start = _testcapi.unicode_decode_get_start
+        self._test_unicode_error_get_start(b'x', UnicodeDecodeError, get_start)
+
+    def test_unicode_translate_error_get_start(self):
+        get_start = _testcapi.unicode_translate_get_start
+        self._test_unicode_error_get_start('x', TestUnicodeTranslateError, get_start)
+
+    def _test_unicode_error_get_start(self, literal, exc_type, get_start):
+        for obj_len, start, c_start in [
+            # normal cases
+            (5, 0, 0),
+            (5, 1, 1),
+            (5, 2, 2),
+            # out of range start is clamped to max(0, obj_len - 1)
+            (0, 0, 0),
+            (0, 1, 0),
+            (0, 10, 0),
+            (5, 5, 4),
+            (5, 10, 4),
+            # negative values are allowed but clipped in the getter
+            (0, -1, 0),
+            (1, -1, 0),
+            (2, -1, 0),
+            (2, -2, 0),
+        ]:
+            obj = literal * obj_len
+            with self.subTest(obj, exc_type=exc_type, start=start):
+                exc = exc_type('utf-8', obj, start, obj_len, 'reason')
+                self.assertEqual(get_start(exc), c_start)
+                self._check_no_crash(exc)
+
+    def test_unicode_encode_error_set_start(self):
+        set_start = _testcapi.unicode_encode_set_start
+        self._test_unicode_error_set_start('x', UnicodeEncodeError, set_start)
+
+    def test_unicode_decode_error_set_start(self):
+        set_start = _testcapi.unicode_decode_set_start
+        self._test_unicode_error_set_start(b'x', UnicodeDecodeError, set_start)
+
+    def test_unicode_translate_error_set_start(self):
+        set_start = _testcapi.unicode_translate_set_start
+        self._test_unicode_error_set_start('x', TestUnicodeTranslateError, set_start)
+
+    def _test_unicode_error_set_start(self, literal, exc_type, set_start):
+        obj_len = 5
+        obj = literal * obj_len
+        for new_start in range(-2 * obj_len, 2 * obj_len):
+            with self.subTest('C-API', obj=obj, exc_type=exc_type, new_start=new_start):
+                exc = exc_type('utf-8', obj, 0, obj_len, 'reason')
+                # arbitrary value is allowed in the C API setter
+                set_start(exc, new_start)
+                self.assertEqual(exc.start, new_start)
+                self._check_no_crash(exc)
+
+            with self.subTest('Py-API', obj=obj, exc_type=exc_type, new_start=new_start):
+                exc = exc_type('utf-8', obj, 0, obj_len, 'reason')
+                # arbitrary value is allowed in the attribute setter
+                exc.start = new_start
+                self.assertEqual(exc.start, new_start)
+                self._check_no_crash(exc)
+
+    def test_unicode_encode_error_get_end(self):
+        get_end = _testcapi.unicode_encode_get_end
+        self._test_unicode_error_get_end('x', UnicodeEncodeError, get_end)
+
+    def test_unicode_decode_error_get_end(self):
+        get_end = _testcapi.unicode_decode_get_end
+        self._test_unicode_error_get_end(b'x', UnicodeDecodeError, get_end)
+
+    def test_unicode_translate_error_get_end(self):
+        get_end = _testcapi.unicode_translate_get_end
+        self._test_unicode_error_get_end('x', TestUnicodeTranslateError, get_end)
+
+    def _test_unicode_error_get_end(self, literal, exc_type, get_end):
+        for obj_len, end, c_end in [
+            # normal cases (note that an end of 0 is clipped up to 1)
+            (5, 0, 1),
+            (5, 1, 1),
+            (5, 2, 2),
+            # out-of-range clipped in [MIN(1, OBJLEN), MAX(MIN(1, OBJLEN), OBJLEN)]
+            (0, 0, 0),
+            (0, 1, 0),
+            (0, 10, 0),
+            (1, 1, 1),
+            (1, 2, 1),
+            (5, 5, 5),
+            (5, 5, 5),
+            (5, 10, 5),
+            # negative values are allowed but clipped in the getter
+            (0, -1, 0),
+            (1, -1, 1),
+            (2, -1, 1),
+            (2, -2, 1),
+        ]:
+            obj = literal * obj_len
+            with self.subTest(obj, exc_type=exc_type, end=end):
+                exc = exc_type('utf-8', obj, 0, end, 'reason')
+                self.assertEqual(get_end(exc), c_end)
+                self._check_no_crash(exc)
+
+    def test_unicode_encode_error_set_end(self):
+        set_end = _testcapi.unicode_encode_set_end
+        self._test_unicode_error_set_end('x', UnicodeEncodeError, set_end)
+
+    def test_unicode_decode_error_set_end(self):
+        set_end = _testcapi.unicode_decode_set_end
+        self._test_unicode_error_set_end(b'x', UnicodeDecodeError, set_end)
+
+    def test_unicode_translate_error_set_end(self):
+        set_end = _testcapi.unicode_translate_set_end
+        self._test_unicode_error_set_end('x', TestUnicodeTranslateError, set_end)
+
+    def _test_unicode_error_set_end(self, literal, exc_type, set_end):
+        obj_len = 5
+        obj = literal * obj_len
+        for new_end in range(-2 * obj_len, 2 * obj_len):
+            with self.subTest('C-API', obj=obj, exc_type=exc_type, new_end=new_end):
+                exc = exc_type('utf-8', obj, 0, obj_len, 'reason')
+                # arbitrary value is allowed in the C API setter
+                set_end(exc, new_end)
+                self.assertEqual(exc.end, new_end)
+                self._check_no_crash(exc)
+
+            with self.subTest('Py-API', obj=obj, exc_type=exc_type, new_end=new_end):
+                exc = exc_type('utf-8', obj, 0, obj_len, 'reason')
+                # arbitrary value is allowed in the attribute setter
+                exc.end = new_end
+                self.assertEqual(exc.end, new_end)
+                self._check_no_crash(exc)
+
+
 class Test_PyUnstable_Exc_PrepReraiseStar(ExceptionIsLikeMixin, unittest.TestCase):
 
     def setUp(self):

diff --git a/Misc/NEWS.d/next/C_API/2024-08-27-09-07-56.gh-issue-123378.JJ6n_u.rst b/Misc/NEWS.d/next/C_API/2024-08-27-09-07-56.gh-issue-123378.JJ6n_u.rst
@@ -0,0 +1,6 @@
+Ensure that the value of :attr:`UnicodeEncodeError.start <UnicodeError.start>`
+retrieved by :c:func:`PyUnicodeEncodeError_GetStart` lies in
+``[0, max(0, objlen - 1)]`` where *objlen* is the length of
+:attr:`UnicodeEncodeError.object <UnicodeError.object>`. Similar
+arguments apply to :exc:`UnicodeDecodeError` and :exc:`UnicodeTranslateError`
+and their corresponding C interface. Patch by Bénédikt Tran.
diff --git a/Misc/NEWS.d/next/C_API/2024-12-02-16-10-36.gh-issue-123378.Q6YRwe.rst b/Misc/NEWS.d/next/C_API/2024-12-02-16-10-36.gh-issue-123378.Q6YRwe.rst
@@ -0,0 +1,6 @@
+Ensure that the value of :attr:`UnicodeEncodeError.end <UnicodeError.end>`
+retrieved by :c:func:`PyUnicodeEncodeError_GetEnd` lies in ``[min(1, objlen),
+max(min(1, objlen), objlen)]`` where *objlen* is the length of
+:attr:`UnicodeEncodeError.object <UnicodeError.object>`. Similar arguments
+apply to :exc:`UnicodeDecodeError` and :exc:`UnicodeTranslateError` and their
+corresponding C interface. Patch by Bénédikt Tran.