Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit a3b334d

Browse files
author
Victor Stinner
committed
PyUnicode_Ready() now sets ascii=1 if maxchar < 128
ascii=1 is no longer reserved for PyASCIIObject. Use PyUnicode_IS_COMPACT_ASCII(obj) to check whether obj is a PyASCIIObject (as before).
1 parent 1b4f9ce commit a3b334d

3 files changed

Lines changed: 42 additions & 33 deletions

File tree

Include/unicodeobject.h

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ typedef struct {
224224
PyUnicode_4BYTE_KIND
225225
* compact = 1
226226
* ready = 1
227-
* (ascii = 0)
227+
* ascii = 0
228228
229229
- string created by the legacy API (not ready):
230230
@@ -236,7 +236,7 @@ typedef struct {
236236
* data.any is NULL
237237
* utf8 is NULL
238238
* interned = SSTATE_NOT_INTERNED
239-
* (ascii = 0)
239+
* ascii = 0
240240
241241
- string created by the legacy API, ready:
242242
@@ -246,7 +246,6 @@ typedef struct {
246246
* compact = 0
247247
* ready = 1
248248
* data.any is not NULL
249-
* (ascii = 0)
250249
251250
String created by the legacy API becomes ready when calling
252251
PyUnicode_READY().
@@ -278,8 +277,9 @@ typedef struct {
278277
one block for the PyUnicodeObject struct and another for its data
279278
buffer. */
280279
unsigned int compact:1;
281-
/* Compact objects which are ASCII-only also have the state.compact
282-
flag set, and use the PyASCIIObject struct. */
280+
/* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
281+
characters. If ascii is 1 and compact is 1, use the PyASCIIObject
282+
structure. */
283283
unsigned int ascii:1;
284284
/* The ready flag indicates whether the object layout is initialized
285285
completely. This means that this is either a compact object, or
@@ -304,7 +304,7 @@ typedef struct {
304304

305305
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
306306
PyUnicodeObject structure. The actual string data is initially in the wstr
307-
block, and copied into the data block using PyUnicode_Ready. */
307+
block, and copied into the data block using _PyUnicode_Ready. */
308308
typedef struct {
309309
PyCompactUnicodeObject _base;
310310
union {
@@ -327,7 +327,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
327327
#ifndef Py_LIMITED_API
328328

329329
#define PyUnicode_WSTR_LENGTH(op) \
330-
(((PyASCIIObject*)op)->state.ascii ? \
330+
(PyUnicode_IS_COMPACT_ASCII(op) ? \
331331
((PyASCIIObject*)op)->length : \
332332
((PyCompactUnicodeObject*)op)->wstr_length)
333333

@@ -369,10 +369,24 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
369369
#define SSTATE_INTERNED_MORTAL 1
370370
#define SSTATE_INTERNED_IMMORTAL 2
371371

372-
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)op)->state.ascii)
372+
/* Return true if the string contains only ASCII characters, or 0 if not. The
373+
string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
374+
or Ready calls are performed. */
375+
#define PyUnicode_IS_ASCII(op) \
376+
(((PyASCIIObject*)op)->state.ascii)
377+
378+
/* Return true if the string is compact or 0 if not.
379+
No type checks or Ready calls are performed. */
380+
#define PyUnicode_IS_COMPACT(op) \
381+
(((PyASCIIObject*)(op))->state.compact)
382+
383+
/* Return true if the string is a compact ASCII string (use PyASCIIObject
384+
structure), or 0 if not. No type checks or Ready calls are performed. */
385+
#define PyUnicode_IS_COMPACT_ASCII(op) \
386+
(PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
373387

374388
/* String contains only wstr byte characters. This is only possible
375-
when the string was created with a legacy API and PyUnicode_Ready()
389+
when the string was created with a legacy API and _PyUnicode_Ready()
376390
has not been called yet. */
377391
#define PyUnicode_WCHAR_KIND 0
378392

@@ -399,11 +413,6 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
399413
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
400414
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
401415

402-
/* Return true if the string is compact or 0 if not.
403-
No type checks or Ready calls are performed. */
404-
#define PyUnicode_IS_COMPACT(op) \
405-
(((PyASCIIObject*)(op))->state.compact)
406-
407416
/* Return one of the PyUnicode_*_KIND values defined above. */
408417
#define PyUnicode_KIND(op) \
409418
(assert(PyUnicode_Check(op)), \
@@ -500,9 +509,9 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
500509

501510
#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready)
502511

503-
/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
512+
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
504513
case. If the canonical representation is not yet set, it will still call
505-
PyUnicode_Ready().
514+
_PyUnicode_Ready().
506515
Returns 0 on success and -1 on errors. */
507516
#define PyUnicode_READY(op) \
508517
(assert(PyUnicode_Check(op)), \

Objects/unicodeobject.c

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -288,26 +288,24 @@ _PyUnicode_CheckConsistency(void *op)
288288
ascii = (PyASCIIObject *)op;
289289
kind = ascii->state.kind;
290290

291-
if (ascii->state.ascii == 1) {
291+
if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
292292
assert(kind == PyUnicode_1BYTE_KIND);
293-
assert(ascii->state.compact == 1);
294293
assert(ascii->state.ready == 1);
295294
}
296295
else if (ascii->state.compact == 1) {
297296
assert(kind == PyUnicode_1BYTE_KIND
298297
|| kind == PyUnicode_2BYTE_KIND
299298
|| kind == PyUnicode_4BYTE_KIND);
300-
assert(ascii->state.compact == 1);
301299
assert(ascii->state.ascii == 0);
302300
assert(ascii->state.ready == 1);
303301
} else {
304302
PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
305303
PyUnicodeObject *unicode = (PyUnicodeObject *)op;
306304

307305
if (kind == PyUnicode_WCHAR_KIND) {
308-
assert(!ascii->state.compact == 1);
306+
assert(ascii->state.compact == 0);
309307
assert(ascii->state.ascii == 0);
310-
assert(!ascii->state.ready == 1);
308+
assert(ascii->state.ready == 0);
311309
assert(ascii->wstr != NULL);
312310
assert(unicode->data.any == NULL);
313311
assert(compact->utf8 == NULL);
@@ -317,10 +315,9 @@ _PyUnicode_CheckConsistency(void *op)
317315
assert(kind == PyUnicode_1BYTE_KIND
318316
|| kind == PyUnicode_2BYTE_KIND
319317
|| kind == PyUnicode_4BYTE_KIND);
320-
assert(!ascii->state.compact == 1);
318+
assert(ascii->state.compact == 0);
321319
assert(ascii->state.ready == 1);
322320
assert(unicode->data.any != NULL);
323-
assert(ascii->state.ascii == 0);
324321
}
325322
}
326323
return 1;
@@ -638,7 +635,7 @@ unicode_kind_name(PyObject *unicode)
638635
switch(PyUnicode_KIND(unicode))
639636
{
640637
case PyUnicode_1BYTE_KIND:
641-
if (PyUnicode_IS_COMPACT_ASCII(unicode))
638+
if (PyUnicode_IS_ASCII(unicode))
642639
return "legacy ascii";
643640
else
644641
return "legacy latin1";
@@ -654,14 +651,14 @@ unicode_kind_name(PyObject *unicode)
654651
switch(PyUnicode_KIND(unicode))
655652
{
656653
case PyUnicode_1BYTE_KIND:
657-
if (PyUnicode_IS_COMPACT_ASCII(unicode))
654+
if (PyUnicode_IS_ASCII(unicode))
658655
return "ascii";
659656
else
660-
return "compact latin1";
657+
return "latin1";
661658
case PyUnicode_2BYTE_KIND:
662-
return "compact UCS2";
659+
return "UCS2";
663660
case PyUnicode_4BYTE_KIND:
664-
return "compact UCS4";
661+
return "UCS4";
665662
default:
666663
return "<invalid compact kind>";
667664
}
@@ -703,7 +700,7 @@ _PyUnicode_Dump(PyObject *op)
703700
if (ascii->wstr == data)
704701
printf("shared ");
705702
printf("wstr=%p", ascii->wstr);
706-
if (!ascii->state.ascii) {
703+
if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
707704
printf(" (%zu), ", compact->wstr_length);
708705
if (!ascii->state.compact && compact->utf8 == unicode->data.any)
709706
printf("shared ");
@@ -954,9 +951,9 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
954951
/* check if max_char(from substring) <= max_char(to) */
955952
if (from_kind > to_kind
956953
/* latin1 => ascii */
957-
|| (PyUnicode_IS_COMPACT_ASCII(to)
954+
|| (PyUnicode_IS_ASCII(to)
958955
&& to_kind == PyUnicode_1BYTE_KIND
959-
&& !PyUnicode_IS_COMPACT_ASCII(from)))
956+
&& !PyUnicode_IS_ASCII(from)))
960957
{
961958
/* slow path to check for character overflow */
962959
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
@@ -1115,10 +1112,12 @@ unicode_ready(PyObject **p_obj, int replace)
11151112
_PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
11161113
_PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
11171114
if (maxchar < 128) {
1115+
_PyUnicode_STATE(unicode).ascii = 1;
11181116
_PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
11191117
_PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
11201118
}
11211119
else {
1120+
_PyUnicode_STATE(unicode).ascii = 0;
11221121
_PyUnicode_UTF8(unicode) = NULL;
11231122
_PyUnicode_UTF8_LENGTH(unicode) = 0;
11241123
}

Tools/gdb/libpython.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,15 +1132,16 @@ def proxyval(self, visited):
11321132
compact = self.field('_base')
11331133
ascii = compact['_base']
11341134
state = ascii['state']
1135+
is_compact_ascii = (int(state['ascii']) and int(state['compact']))
11351136
field_length = long(ascii['length'])
11361137
if not int(state['ready']):
11371138
# string is not ready
11381139
may_have_surrogates = True
11391140
field_str = ascii['wstr']
1140-
if not int(state['ascii']):
1141+
if not is_compact_ascii:
11411142
field_length = compact('wstr_length')
11421143
else:
1143-
if int(state['ascii']):
1144+
if is_compact_ascii:
11441145
field_str = ascii.address + 1
11451146
elif int(state['compact']):
11461147
field_str = compact.address + 1

0 commit comments

Comments
 (0)