From 9984581a70cdbb9a63424d9fd71e7cdc97262950 Mon Sep 17 00:00:00 2001
From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>
Date: Fri, 14 Feb 2025 17:16:47 +0000
Subject: [PATCH 1/2] gh-82045: Correct and deduplicate "isprintable" docs;
 add test. (GH-130118)

We had the definition of what makes a character "printable" documented
in three places, giving two different definitions.

The definition in the comment on `_PyUnicode_IsPrintable` was inverted;
correct that.

With that correction, the two definitions turn out to be equivalent --
but to confirm that, you have to go look up, or happen to know, that
those are the only five "Other" categories and only three "Separator"
categories in the Unicode character database. That makes it hard for
the reader to tell whether they really are the same, or if there's some
subtle difference in the intended semantics.

Fix that by cutting the C API docs' and the C comment's copies of the
subtle details, in favor of referring to the Python-level docs. That
ensures it's explicit that these are all meant to agree, and also lets
us concentrate improvements to the wording in one place.

Speaking of which, borrow some ideas from the C comment, along with
other tweaks, to hopefully add a bit more clarity to that one
newly-centralized copy in the docs.

Also add a thorough test that the implementation agrees with this
definition.

Author: Greg Price Co-authored-by: Greg Price (cherry picked from commit 3402e133ef26736296c07992266a82b181a5d532) --- Doc/c-api/unicode.rst | 9 ++------- Doc/library/stdtypes.rst | 20 +++++++++++++------- Lib/test/test_str.py | 9 +++++++++ Objects/clinic/unicodeobject.c.h | 7 +++---- Objects/unicodectype.c | 16 ++++------------ Objects/unicodeobject.c | 7 +++---- 6 files changed, 34 insertions(+), 34 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 64c16056ece9df..31a801f412850c 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -256,13 +256,8 @@ the Python configuration. .. c:function:: int Py_UNICODE_ISPRINTABLE(Py_UCS4 ch) - Return ``1`` or ``0`` depending on whether *ch* is a printable character. - Nonprintable characters are those characters defined in the Unicode character - database as "Other" or "Separator", excepting the ASCII space (0x20) which is - considered printable. (Note that printable characters in this context are - those which should not be escaped when :func:`repr` is invoked on a string. - It has no bearing on the handling of strings written to :data:`sys.stdout` or - :data:`sys.stderr`.) + Return ``1`` or ``0`` depending on whether *ch* is a printable character, + in the sense of :meth:`str.isprintable`. These APIs can be used for fast direct character conversions: diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index b5ba8060cb45a1..cb9624ab757330 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1876,13 +1876,19 @@ expression support in the :mod:`re` module). .. method:: str.isprintable() - Return ``True`` if all characters in the string are printable or the string is - empty, ``False`` otherwise. Nonprintable characters are those characters defined - in the Unicode character database as "Other" or "Separator", excepting the - ASCII space (0x20) which is considered printable. 
(Note that printable - characters in this context are those which should not be escaped when - :func:`repr` is invoked on a string. It has no bearing on the handling of - strings written to :data:`sys.stdout` or :data:`sys.stderr`.) + Return true if all characters in the string are printable, false if it + contains at least one non-printable character. + + Here "printable" means the character is suitable for :func:`repr` to use in + its output; "non-printable" means that :func:`repr` on built-in types will + hex-escape the character. It has no bearing on the handling of strings + written to :data:`sys.stdout` or :data:`sys.stderr`. + + The printable characters are those which in the Unicode character database + (see :mod:`unicodedata`) have a general category in group Letter, Mark, + Number, Punctuation, or Symbol (L, M, N, P, or S); plus the ASCII space 0x20. + Nonprintable characters are those in group Separator or Other (Z or C), + except the ASCII space. .. method:: str.isspace() diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index c4f59224a6fe6f..46673cc56adb35 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -853,6 +853,15 @@ def test_isprintable(self): self.assertTrue('\U0001F46F'.isprintable()) self.assertFalse('\U000E0020'.isprintable()) + @support.requires_resource('cpu') + def test_isprintable_invariant(self): + for codepoint in range(sys.maxunicode + 1): + char = chr(codepoint) + category = unicodedata.category(char) + self.assertEqual(char.isprintable(), + category[0] not in ('C', 'Z') + or char == ' ') + def test_surrogates(self): for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index 78e14b0021d006..af026a168ae18a 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -703,10 +703,9 @@ PyDoc_STRVAR(unicode_isprintable__doc__, "isprintable($self, /)\n" "--\n" "\n" -"Return True 
if the string is printable, False otherwise.\n" +"Return True if all characters in the string are printable, False otherwise.\n" "\n" -"A string is printable if all of its characters are considered printable in\n" -"repr() or if it is empty."); +"A character is printable if repr() may use it in its output."); #define UNICODE_ISPRINTABLE_METHODDEF \ {"isprintable", (PyCFunction)unicode_isprintable, METH_NOARGS, unicode_isprintable__doc__}, @@ -1888,4 +1887,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=9fee62bd337f809b input=a9049054013a1b77]*/ +/*[clinic end generated code: output=db37497bf38a2c17 input=a9049054013a1b77]*/ diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index aa5c5b2a4ad2eb..7cd0dca3d13545 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -142,18 +142,10 @@ int _PyUnicode_IsNumeric(Py_UCS4 ch) return (ctype->flags & NUMERIC_MASK) != 0; } -/* Returns 1 for Unicode characters to be hex-escaped when repr()ed, - 0 otherwise. - All characters except those characters defined in the Unicode character - database as following categories are considered printable. - * Cc (Other, Control) - * Cf (Other, Format) - * Cs (Other, Surrogate) - * Co (Other, Private Use) - * Cn (Other, Not Assigned) - * Zl Separator, Line ('\u2028', LINE SEPARATOR) - * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) - * Zs (Separator, Space) other than ASCII space('\x20'). +/* Returns 1 for Unicode characters that repr() may use in its output, + and 0 for characters to be hex-escaped. + + See documentation of `str.isprintable` for details. 
*/ int _PyUnicode_IsPrintable(Py_UCS4 ch) { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 434cb5ffb61c0e..a00125345b2dd5 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12016,15 +12016,14 @@ unicode_isidentifier_impl(PyObject *self) /*[clinic input] str.isprintable as unicode_isprintable -Return True if the string is printable, False otherwise. +Return True if all characters in the string are printable, False otherwise. -A string is printable if all of its characters are considered printable in -repr() or if it is empty. +A character is printable if repr() may use it in its output. [clinic start generated code]*/ static PyObject * unicode_isprintable_impl(PyObject *self) -/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/ +/*[clinic end generated code: output=3ab9626cd32dd1a0 input=4e56bcc6b06ca18c]*/ { Py_ssize_t i, length; int kind; From ef054904d140f752a29f8e9088f836dcda3b8079 Mon Sep 17 00:00:00 2001 From: stan Date: Fri, 14 Feb 2025 17:49:52 +0000 Subject: [PATCH 2/2] Fix clinic errors --- Objects/clinic/unicodeobject.c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index af026a168ae18a..1db304e7063d4c 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -1887,4 +1887,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=db37497bf38a2c17 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=b7d75c4898e8198d input=a9049054013a1b77]*/