Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 910337b

Browse files
author
Victor Stinner
committed
Add _PyUnicode_CheckConsistency() macro to help debugging
* Document Unicode string states
* Use _PyUnicode_CheckConsistency() to ensure that objects are always consistent.
1 parent 4fae54c commit 910337b

2 files changed

Lines changed: 144 additions & 37 deletions

File tree

Include/unicodeobject.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,52 @@ extern "C" {
206206
immediately follow the structure. utf8_length and wstr_length can be found
207207
in the length field; the utf8 pointer is equal to the data pointer. */
208208
typedef struct {
209+
/* Unicode strings can be in 4 states:
210+
211+
- compact ascii:
212+
213+
* structure = PyASCIIObject
214+
* kind = PyUnicode_1BYTE_KIND
215+
* compact = 1
216+
* ascii = 1
217+
* ready = 1
218+
* utf8 = data
219+
220+
- compact:
221+
222+
* structure = PyCompactUnicodeObject
223+
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
224+
PyUnicode_4BYTE_KIND
225+
* compact = 1
226+
* ready = 1
227+
* (ascii = 0)
228+
229+
- string created by the legacy API (not ready):
230+
231+
* structure = PyUnicodeObject
232+
* kind = PyUnicode_WCHAR_KIND
233+
* compact = 0
234+
* ready = 0
235+
* wstr is not NULL
236+
* data.any is NULL
237+
* utf8 is NULL
238+
* interned = SSTATE_NOT_INTERNED
239+
* (ascii = 0)
240+
241+
- string created by the legacy API (ready):
242+
243+
* structure = PyUnicodeObject
244+
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
245+
PyUnicode_4BYTE_KIND
246+
* compact = 0
247+
* ready = 1
248+
* data.any is not NULL
249+
* (ascii = 0)
250+
251+
A string created by the legacy API becomes ready once it is passed to
252+
PyUnicode_READY().
253+
254+
See also _PyUnicode_CheckConsistency(). */
209255
PyObject_HEAD
210256
Py_ssize_t length; /* Number of code points in the string */
211257
Py_hash_t hash; /* Hash value; -1 if not set */

Objects/unicodeobject.c

Lines changed: 98 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -89,33 +89,24 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
8989
extern "C" {
9090
#endif
9191

92-
/* Generic helper macro to convert characters of different types.
93-
from_type and to_type have to be valid type names, begin and end
94-
are pointers to the source characters which should be of type
95-
"from_type *". to is a pointer of type "to_type *" and points to the
96-
buffer where the result characters are written to. */
97-
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
98-
do { \
99-
const from_type *iter_; to_type *to_; \
100-
for (iter_ = (begin), to_ = (to_type *)(to); \
101-
iter_ < (end); \
102-
++iter_, ++to_) { \
103-
*to_ = (to_type)*iter_; \
104-
} \
105-
} while (0)
92+
#ifdef Py_DEBUG
93+
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
94+
#else
95+
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
96+
#endif
10697

10798
#define _PyUnicode_UTF8(op) \
10899
(((PyCompactUnicodeObject*)(op))->utf8)
109100
#define PyUnicode_UTF8(op) \
110-
(assert(PyUnicode_Check(op)), \
101+
(assert(_PyUnicode_CHECK(op)), \
111102
assert(PyUnicode_IS_READY(op)), \
112103
PyUnicode_IS_COMPACT_ASCII(op) ? \
113104
((char*)((PyASCIIObject*)(op) + 1)) : \
114105
_PyUnicode_UTF8(op))
115106
#define _PyUnicode_UTF8_LENGTH(op) \
116107
(((PyCompactUnicodeObject*)(op))->utf8_length)
117108
#define PyUnicode_UTF8_LENGTH(op) \
118-
(assert(PyUnicode_Check(op)), \
109+
(assert(_PyUnicode_CHECK(op)), \
119110
assert(PyUnicode_IS_READY(op)), \
120111
PyUnicode_IS_COMPACT_ASCII(op) ? \
121112
((PyASCIIObject*)(op))->length : \
@@ -125,22 +116,42 @@ extern "C" {
125116
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
126117
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
127118
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
128-
#define _PyUnicode_KIND(op) \
129-
(assert(PyUnicode_Check(op)), \
119+
#define _PyUnicode_KIND(op) \
120+
(assert(_PyUnicode_CHECK(op)), \
130121
((PyASCIIObject *)(op))->state.kind)
131-
#define _PyUnicode_GET_LENGTH(op) \
132-
(assert(PyUnicode_Check(op)), \
122+
#define _PyUnicode_GET_LENGTH(op) \
123+
(assert(_PyUnicode_CHECK(op)), \
133124
((PyASCIIObject *)(op))->length)
134125
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
135126

127+
#undef PyUnicode_READY
128+
#define PyUnicode_READY(op) \
129+
(assert(_PyUnicode_CHECK(op)), \
130+
(PyUnicode_IS_READY(op) ? \
131+
0 : _PyUnicode_Ready((PyObject *)(op))))
132+
136133
/* true if the Unicode object has an allocated UTF-8 memory block
137134
(not shared with other data) */
138-
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
139-
(assert(PyUnicode_Check(op)), \
140-
(!PyUnicode_IS_COMPACT_ASCII(op) \
141-
&& _PyUnicode_UTF8(op) \
135+
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
136+
(assert(_PyUnicode_CHECK(op)), \
137+
(!PyUnicode_IS_COMPACT_ASCII(op) \
138+
&& _PyUnicode_UTF8(op) \
142139
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
143140

141+
/* Generic helper macro to convert characters of different types.
142+
from_type and to_type have to be valid type names, begin and end
143+
are pointers to the source characters which should be of type
144+
"from_type *". to is a pointer of type "to_type *" and points to the
145+
buffer where the result characters are written to. */
146+
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
147+
do { \
148+
const from_type *iter_; to_type *to_; \
149+
for (iter_ = (begin), to_ = (to_type *)(to); \
150+
iter_ < (end); \
151+
++iter_, ++to_) { \
152+
*to_ = (to_type)*iter_; \
153+
} \
154+
} while (0)
144155

145156
/* The Unicode string has been modified: reset the hash */
146157
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
@@ -250,6 +261,57 @@ PyUnicode_GetMax(void)
250261
#endif
251262
}
252263

264+
#ifdef Py_DEBUG
265+
static int
266+
_PyUnicode_CheckConsistency(void *op)
267+
{
268+
PyASCIIObject *ascii;
269+
unsigned int kind;
270+
271+
assert(PyUnicode_Check(op));
272+
273+
ascii = (PyASCIIObject *)op;
274+
kind = ascii->state.kind;
275+
276+
if (ascii->state.ascii == 1) {
277+
assert(kind == PyUnicode_1BYTE_KIND);
278+
assert(ascii->state.compact == 1);
279+
assert(ascii->state.ready == 1);
280+
}
281+
else if (ascii->state.compact == 1) {
282+
assert(kind == PyUnicode_1BYTE_KIND
283+
|| kind == PyUnicode_2BYTE_KIND
284+
|| kind == PyUnicode_4BYTE_KIND);
285+
assert(ascii->state.compact == 1);
286+
assert(ascii->state.ascii == 0);
287+
assert(ascii->state.ready == 1);
288+
} else {
289+
PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
290+
PyUnicodeObject *unicode = (PyUnicodeObject *)op;
291+
292+
if (kind == PyUnicode_WCHAR_KIND) {
293+
assert(!ascii->state.compact == 1);
294+
assert(ascii->state.ascii == 0);
295+
assert(!ascii->state.ready == 1);
296+
assert(ascii->wstr != NULL);
297+
assert(unicode->data.any == NULL);
298+
assert(compact->utf8 == NULL);
299+
assert(ascii->state.interned == SSTATE_NOT_INTERNED);
300+
}
301+
else {
302+
assert(kind == PyUnicode_1BYTE_KIND
303+
|| kind == PyUnicode_2BYTE_KIND
304+
|| kind == PyUnicode_4BYTE_KIND);
305+
assert(!ascii->state.compact == 1);
306+
assert(ascii->state.ready == 1);
307+
assert(unicode->data.any != NULL);
308+
assert(ascii->state.ascii == 0);
309+
}
310+
}
311+
return 1;
312+
}
313+
#endif
314+
253315
/* --- Bloom Filters ----------------------------------------------------- */
254316

255317
/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -542,7 +604,7 @@ _PyUnicode_New(Py_ssize_t length)
542604
static const char*
543605
unicode_kind_name(PyObject *unicode)
544606
{
545-
assert(PyUnicode_Check(unicode));
607+
assert(_PyUnicode_CHECK(unicode));
546608
if (!PyUnicode_IS_COMPACT(unicode))
547609
{
548610
if (!PyUnicode_IS_READY(unicode))
@@ -744,7 +806,8 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
744806
const wchar_t *iter;
745807
Py_UCS4 *ucs4_out;
746808

747-
assert(unicode && PyUnicode_Check(unicode));
809+
assert(unicode != NULL);
810+
assert(_PyUnicode_CHECK(unicode));
748811
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
749812
ucs4_out = PyUnicode_4BYTE_DATA(unicode);
750813

@@ -771,7 +834,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
771834
static int
772835
_PyUnicode_Dirty(PyObject *unicode)
773836
{
774-
assert(PyUnicode_Check(unicode));
837+
assert(_PyUnicode_CHECK(unicode));
775838
if (Py_REFCNT(unicode) != 1) {
776839
PyErr_SetString(PyExc_ValueError,
777840
"Cannot modify a string having more than 1 reference");
@@ -966,10 +1029,8 @@ _PyUnicode_Ready(PyObject *obj)
9661029
strings were created using _PyObject_New() and where no canonical
9671030
representation (the str field) has been set yet aka strings
9681031
which are not yet ready. */
969-
assert(PyUnicode_Check(obj));
970-
assert(!PyUnicode_IS_READY(obj));
971-
assert(!PyUnicode_IS_COMPACT(obj));
972-
assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
1032+
assert(_PyUnicode_CHECK(unicode));
1033+
assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
9731034
assert(_PyUnicode_WSTR(unicode) != NULL);
9741035
assert(_PyUnicode_DATA_ANY(unicode) == NULL);
9751036
assert(_PyUnicode_UTF8(unicode) == NULL);
@@ -1154,7 +1215,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
11541215
assert(PyUnicode_Check(unicode));
11551216
assert(0 <= length);
11561217

1157-
if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode))
1218+
if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
11581219
old_length = PyUnicode_WSTR_LENGTH(unicode);
11591220
else
11601221
old_length = PyUnicode_GET_LENGTH(unicode);
@@ -1907,7 +1968,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
19071968
case 'U':
19081969
{
19091970
PyObject *obj = va_arg(count, PyObject *);
1910-
assert(obj && PyUnicode_Check(obj));
1971+
assert(obj && _PyUnicode_CHECK(obj));
19111972
if (PyUnicode_READY(obj) == -1)
19121973
goto fail;
19131974
argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
@@ -1921,7 +1982,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
19211982
const char *str = va_arg(count, const char *);
19221983
PyObject *str_obj;
19231984
assert(obj || str);
1924-
assert(!obj || PyUnicode_Check(obj));
1985+
assert(!obj || _PyUnicode_CHECK(obj));
19251986
if (obj) {
19261987
if (PyUnicode_READY(obj) == -1)
19271988
goto fail;
@@ -9570,7 +9631,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
95709631
void *data;
95719632
Py_UCS4 chr;
95729633

9573-
assert(PyUnicode_Check(uni));
9634+
assert(_PyUnicode_CHECK(uni));
95749635
if (PyUnicode_READY(uni) == -1)
95759636
return -1;
95769637
kind = PyUnicode_KIND(uni);
@@ -12698,7 +12759,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
1269812759
unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
1269912760
if (unicode == NULL)
1270012761
return NULL;
12701-
assert(PyUnicode_Check(unicode));
12762+
assert(_PyUnicode_CHECK(unicode));
1270212763
if (PyUnicode_READY(unicode))
1270312764
return NULL;
1270412765

@@ -13054,7 +13115,7 @@ unicodeiter_next(unicodeiterobject *it)
1305413115
seq = it->it_seq;
1305513116
if (seq == NULL)
1305613117
return NULL;
13057-
assert(PyUnicode_Check(seq));
13118+
assert(_PyUnicode_CHECK(seq));
1305813119

1305913120
if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
1306013121
int kind = PyUnicode_KIND(seq);

0 commit comments

Comments
 (0)