Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit be78eaf

Browse files
author
Victor Stinner
committed
PyUnicode_CopyCharacters() checks for buffer and character overflow
It now returns the number of written characters on success.
1 parent fb5f5f2 commit be78eaf

2 files changed

Lines changed: 92 additions & 85 deletions

File tree

Include/unicodeobject.h

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -519,10 +519,22 @@ PyAPI_FUNC(int) _PyUnicode_Ready(
519519
#endif
520520

521521
/* Copy characters from one unicode object into another; this function performs
522-
character conversion when nessesary and falls back to memcpy if possible.
523-
Return -1 and raise an exception on error, return 0 on success. */
522+
character conversion when necessary and falls back to memcpy if possible.
523+
524+
Fail if 'to' is smaller than how_many or smaller than len(from)-from_start,
525+
or if kind(from[from_start:from_start+how_many]) > kind(to).
526+
527+
Return the number of written characters, or return -1 and raise an exception
528+
on error.
529+
530+
Pseudo-code:
531+
532+
how_many = min(how_many, len(from) - from_start)
533+
to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
534+
return how_many
535+
*/
524536
#ifndef Py_LIMITED_API
525-
PyAPI_FUNC(int) PyUnicode_CopyCharacters(
537+
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
526538
PyObject *to,
527539
Py_ssize_t to_start,
528540
PyObject *from,

Objects/unicodeobject.c

Lines changed: 77 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -606,13 +606,13 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
606606
}
607607
#endif
608608

609-
int
609+
Py_ssize_t
610610
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
611611
PyObject *from, Py_ssize_t from_start,
612612
Py_ssize_t how_many)
613613
{
614-
int from_kind;
615-
int to_kind;
614+
unsigned int from_kind;
615+
unsigned int to_kind;
616616

617617
assert(PyUnicode_Check(from));
618618
assert(PyUnicode_Check(to));
@@ -622,94 +622,89 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
622622
if (PyUnicode_READY(to))
623623
return -1;
624624

625+
how_many = PY_MIN(PyUnicode_GET_LENGTH(from), how_many);
626+
if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
627+
PyErr_Format(PyExc_ValueError,
628+
"Cannot write %zi characters at %zi "
629+
"in a string of %zi characters",
630+
how_many, to_start, PyUnicode_GET_LENGTH(to));
631+
return -1;
632+
}
633+
625634
from_kind = PyUnicode_KIND(from);
626635
to_kind = PyUnicode_KIND(to);
627636

628637
if (from_kind == to_kind) {
629-
const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(to);
630-
Py_MEMCPY(PyUnicode_1BYTE_DATA(to) + (to_start * char_size),
631-
PyUnicode_1BYTE_DATA(from) + (from_start * char_size),
632-
how_many * char_size);
633-
return 0;
634-
}
638+
/* fast path */
639+
Py_MEMCPY((char*)PyUnicode_DATA(to)
640+
+ PyUnicode_KIND_SIZE(to_kind, to_start),
641+
(char*)PyUnicode_DATA(from)
642+
+ PyUnicode_KIND_SIZE(from_kind, from_start),
643+
PyUnicode_KIND_SIZE(to_kind, how_many));
644+
return how_many;
645+
}
646+
647+
if (from_kind > to_kind) {
648+
/* slow path to check for character overflow */
649+
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
650+
void *from_data = PyUnicode_DATA(from);
651+
void *to_data = PyUnicode_DATA(to);
652+
Py_UCS4 ch, maxchar;
653+
Py_ssize_t i;
654+
int overflow;
635655

636-
switch (from_kind) {
637-
case PyUnicode_1BYTE_KIND:
638-
switch (to_kind) {
639-
case PyUnicode_2BYTE_KIND:
640-
_PyUnicode_CONVERT_BYTES(
641-
unsigned char, Py_UCS2,
642-
PyUnicode_1BYTE_DATA(from) + from_start,
643-
PyUnicode_1BYTE_DATA(from) + from_start + how_many,
644-
PyUnicode_2BYTE_DATA(to) + to_start
645-
);
646-
break;
647-
case PyUnicode_4BYTE_KIND:
648-
_PyUnicode_CONVERT_BYTES(
649-
unsigned char, Py_UCS4,
650-
PyUnicode_1BYTE_DATA(from) + from_start,
651-
PyUnicode_1BYTE_DATA(from) + from_start + how_many,
652-
PyUnicode_4BYTE_DATA(to) + to_start
653-
);
654-
break;
655-
default:
656-
goto invalid_state;
657-
}
658-
break;
659-
case PyUnicode_2BYTE_KIND:
660-
switch (to_kind) {
661-
case PyUnicode_1BYTE_KIND:
662-
_PyUnicode_CONVERT_BYTES(
663-
Py_UCS2, unsigned char,
664-
PyUnicode_2BYTE_DATA(from) + from_start,
665-
PyUnicode_2BYTE_DATA(from) + from_start + how_many,
666-
PyUnicode_1BYTE_DATA(to) + to_start
667-
);
668-
break;
669-
case PyUnicode_4BYTE_KIND:
670-
_PyUnicode_CONVERT_BYTES(
671-
Py_UCS2, Py_UCS4,
672-
PyUnicode_2BYTE_DATA(from) + from_start,
673-
PyUnicode_2BYTE_DATA(from) + from_start + how_many,
674-
PyUnicode_4BYTE_DATA(to) + to_start
675-
);
676-
break;
677-
default:
678-
goto invalid_state;
679-
}
680-
break;
681-
case PyUnicode_4BYTE_KIND:
682-
switch (to_kind) {
683-
case PyUnicode_1BYTE_KIND:
684-
_PyUnicode_CONVERT_BYTES(
685-
Py_UCS4, unsigned char,
686-
PyUnicode_4BYTE_DATA(from) + from_start,
687-
PyUnicode_4BYTE_DATA(from) + from_start + how_many,
688-
PyUnicode_1BYTE_DATA(to) + to_start
689-
);
690-
break;
691-
case PyUnicode_2BYTE_KIND:
692-
_PyUnicode_CONVERT_BYTES(
693-
Py_UCS4, Py_UCS2,
694-
PyUnicode_4BYTE_DATA(from) + from_start,
695-
PyUnicode_4BYTE_DATA(from) + from_start + how_many,
696-
PyUnicode_2BYTE_DATA(to) + to_start
697-
);
656+
maxchar = 0;
657+
for (i=0; i < how_many; i++) {
658+
ch = PyUnicode_READ(from_kind, from_data, from_start + i);
659+
if (ch > maxchar) {
660+
maxchar = ch;
661+
if (maxchar > to_maxchar) {
662+
overflow = 1;
698663
break;
699-
default:
700-
goto invalid_state;
664+
}
701665
}
702-
break;
703-
default:
704-
goto invalid_state;
666+
PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
667+
}
668+
if (!overflow)
669+
return how_many;
670+
}
671+
else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
672+
{
673+
_PyUnicode_CONVERT_BYTES(
674+
Py_UCS1, Py_UCS2,
675+
PyUnicode_1BYTE_DATA(from) + from_start,
676+
PyUnicode_1BYTE_DATA(from) + from_start + how_many,
677+
PyUnicode_2BYTE_DATA(to) + to_start
678+
);
679+
return how_many;
680+
}
681+
else if (from_kind == PyUnicode_1BYTE_KIND
682+
&& to_kind == PyUnicode_4BYTE_KIND)
683+
{
684+
_PyUnicode_CONVERT_BYTES(
685+
Py_UCS1, Py_UCS4,
686+
PyUnicode_1BYTE_DATA(from) + from_start,
687+
PyUnicode_1BYTE_DATA(from) + from_start + how_many,
688+
PyUnicode_4BYTE_DATA(to) + to_start
689+
);
690+
return how_many;
691+
}
692+
else if (from_kind == PyUnicode_2BYTE_KIND
693+
&& to_kind == PyUnicode_4BYTE_KIND)
694+
{
695+
_PyUnicode_CONVERT_BYTES(
696+
Py_UCS2, Py_UCS4,
697+
PyUnicode_2BYTE_DATA(from) + from_start,
698+
PyUnicode_2BYTE_DATA(from) + from_start + how_many,
699+
PyUnicode_4BYTE_DATA(to) + to_start
700+
);
701+
return how_many;
705702
}
706-
return 0;
707-
708-
invalid_state:
709703
PyErr_Format(PyExc_ValueError,
710-
"Impossible kind state (from=%i, to=%i) "
711-
"in PyUnicode_CopyCharacters",
712-
from_kind, to_kind);
704+
"Cannot copy UCS%u characters "
705+
"into a string of UCS%u characters",
706+
1 << (from_kind - 1),
707+
1 << (to_kind -1));
713708
return -1;
714709
}
715710

0 commit comments

Comments
 (0)