Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 9c12106

Browse files
committed
Change PyUnicode_FromString[AndSize] to expect UTF-8.
1 parent 64ce505 commit 9c12106

3 files changed

Lines changed: 25 additions & 20 deletions

File tree

Doc/api/concrete.tex

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -996,10 +996,11 @@ \subsection{Unicode Objects \label{unicodeObjects}}
996996
\var{u} is \NULL{}.
997997
\end{cfuncdesc}
998998

999-
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
999+
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromStringAndSize}{const char *u,
1000+
Py_ssize_t size}
10001001
Create a Unicode Object from the char buffer \var{u}.
1001-
\var{u} must be 0-terminated, the bytes will be interpreted as
1002-
being latin-1 encoded. \var{u} may also be \NULL{} which causes the
1002+
The bytes will be interpreted as being UTF-8 encoded.
1003+
\var{u} may also be \NULL{} which causes the
10031004
contents to be undefined. It is the user's responsibility to fill
10041005
in the needed data. The buffer is copied into the new object.
10051006
If the buffer is not \NULL{}, the return value might be a shared object.
@@ -1008,6 +1009,12 @@ \subsection{Unicode Objects \label{unicodeObjects}}
10081009
\versionadded{3.0}
10091010
\end{cfuncdesc}
10101011

1012+
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
1013+
Create a Unicode object from a UTF-8 encoded null-terminated
1014+
char buffer \var{u}.
1015+
\versionadded{3.0}
1016+
\end{cfuncdesc}
1017+
10111018
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromFormat}{const char *format, ...}
10121019
Take a C \cfunction{printf()}-style \var{format} string and a
10131020
variable number of arguments, calculate the size of the resulting

Objects/bytesobject.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2724,11 +2724,13 @@ PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
27242724
static PyObject *
27252725
bytes_reduce(PyBytesObject *self)
27262726
{
2727-
return Py_BuildValue("(O(s#s))",
2728-
Py_Type(self),
2729-
self->ob_bytes == NULL ? "" : self->ob_bytes,
2730-
Py_Size(self),
2731-
"latin-1");
2727+
PyObject *latin1;
2728+
if (self->ob_bytes)
2729+
latin1 = PyUnicode_DecodeLatin1(self->ob_bytes,
2730+
Py_Size(self), NULL);
2731+
else
2732+
latin1 = PyUnicode_FromString("");
2733+
return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1");
27322734
}
27332735

27342736
static PySequenceMethods bytes_as_sequence = {

Objects/unicodeobject.c

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
427427
{
428428
PyUnicodeObject *unicode;
429429
/* If the Unicode data is known at construction time, we can apply
430-
some optimizations which share commonly used objects. */
430+
some optimizations which share commonly used objects.
431+
Also, this means the input must be UTF-8, so fall back to the
432+
UTF-8 decoder at the end. */
431433
if (u != NULL) {
432434

433435
/* Optimization for empty strings */
@@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
436438
return (PyObject *)unicode_empty;
437439
}
438440

439-
/* Single characters are shared when using this constructor */
440-
if (size == 1) {
441+
/* Single characters are shared when using this constructor.
442+
Restrict to ASCII, since the input must be UTF-8. */
443+
if (size == 1 && Py_CHARMASK(*u) < 128) {
441444
unicode = unicode_latin1[Py_CHARMASK(*u)];
442445
if (!unicode) {
443446
unicode = _PyUnicode_New(1);
@@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
449452
Py_INCREF(unicode);
450453
return (PyObject *)unicode;
451454
}
455+
456+
return PyUnicode_DecodeUTF8(u, size, NULL);
452457
}
453458

454459
unicode = _PyUnicode_New(size);
455460
if (!unicode)
456461
return NULL;
457462

458-
/* Copy the Unicode data into the new object */
459-
if (u != NULL) {
460-
Py_UNICODE *p = unicode->str;
461-
while (size--)
462-
*p++ = Py_CHARMASK(*u++);
463-
/* Don't need to write trailing 0 because
464-
that's already done by _PyUnicode_New */
465-
}
466-
467463
return (PyObject *)unicode;
468464
}
469465

0 commit comments

Comments
 (0)