Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 1680713

Browse files
committed
Add interning of unicode strings by copying the functionality from
stringobject.c. Intern "True" and "False" in bool_repr() again as it was in the 8bit string era.
1 parent 34a042d commit 1680713

5 files changed

Lines changed: 158 additions & 7 deletions

File tree

Include/stringobject.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,6 @@ typedef struct {
4848
*/
4949
} PyStringObject;
5050

51-
#define SSTATE_NOT_INTERNED 0
52-
#define SSTATE_INTERNED_MORTAL 1
53-
#define SSTATE_INTERNED_IMMORTAL 2
54-
5551
PyAPI_DATA(PyTypeObject) PyBaseString_Type;
5652
PyAPI_DATA(PyTypeObject) PyString_Type;
5753

Include/unicodeobject.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,13 +390,20 @@ typedef struct {
390390
Py_ssize_t length; /* Length of raw Unicode data in buffer */
391391
Py_UNICODE *str; /* Raw Unicode buffer */
392392
long hash; /* Hash value; -1 if not set */
393+
int state; /* != 0 if interned. In this case the two
394+
* references from the dictionary to this object
395+
* are *not* counted in ob_refcnt. */
393396
PyObject *defenc; /* (Default) Encoded version as Python
394397
string, or NULL; this is used for
395398
implementing the buffer protocol */
396399
} PyUnicodeObject;
397400

398401
PyAPI_DATA(PyTypeObject) PyUnicode_Type;
399402

403+
#define SSTATE_NOT_INTERNED 0
404+
#define SSTATE_INTERNED_MORTAL 1
405+
#define SSTATE_INTERNED_IMMORTAL 2
406+
400407
#define PyUnicode_Check(op) \
401408
PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS)
402409
#define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
@@ -529,6 +536,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
529536
PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
530537
PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
531538

539+
PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
540+
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
541+
PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
542+
PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
543+
544+
/* Use only if you know it's a unicode string */
545+
#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
546+
532547
/* --- wchar_t support for platforms which support it --------------------- */
533548

534549
#ifdef HAVE_WCHAR_H

Modules/main.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,14 +521,15 @@ Py_Main(int argc, char **argv)
521521
#ifdef __INSURE__
522522
/* Insure++ is a memory analysis tool that aids in discovering
523523
* memory leaks and other memory problems. On Python exit, the
524-
* interned string dictionary is flagged as being in use at exit
524+
* interned string dictionaries are flagged as being in use at exit
525525
* (which they are). Under normal circumstances, this is fine because
526526
* the memory will be automatically reclaimed by the system. Under
527527
* memory debugging, it's a huge source of useless noise, so we
528528
* trade off slower shutdown for less distraction in the memory
529529
* reports. -baw
530530
*/
531531
_Py_ReleaseInternedStrings();
532+
_Py_ReleaseInternedUnicodeStrings();
532533
#endif /* __INSURE__ */
533534

534535
return sts;

Objects/boolobject.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ bool_repr(PyObject *self)
2424

2525
if (self == Py_True)
2626
s = true_str ? true_str :
27-
(true_str = PyUnicode_FromString("True"));
27+
(true_str = PyUnicode_InternFromString("True"));
2828
else
2929
s = false_str ? false_str :
30-
(false_str = PyUnicode_FromString("False"));
30+
(false_str = PyUnicode_InternFromString("False"));
3131
Py_XINCREF(s);
3232
return s;
3333
}

Objects/unicodeobject.c

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,16 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
9292
extern "C" {
9393
#endif
9494

95+
/* This dictionary holds all interned unicode strings. Note that references
96+
to strings in this dictionary are *not* counted in the string's ob_refcnt.
97+
When the interned string reaches a refcnt of 0 the string deallocation
98+
function will delete the reference from this dictionary.
99+
100+
Another way to look at this is to say that the actual reference
101+
count of a string is: s->ob_refcnt + (s->state?2:0)
102+
*/
103+
static PyObject *interned;
104+
95105
/* Free list for Unicode objects */
96106
static PyUnicodeObject *unicode_freelist;
97107
static int unicode_freelist_size;
@@ -276,6 +286,7 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
276286
unicode->str[length] = 0;
277287
unicode->length = length;
278288
unicode->hash = -1;
289+
unicode->state = 0;
279290
unicode->defenc = NULL;
280291
return unicode;
281292

@@ -288,6 +299,25 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
288299
static
289300
void unicode_dealloc(register PyUnicodeObject *unicode)
290301
{
302+
switch (PyUnicode_CHECK_INTERNED(unicode)) {
303+
case SSTATE_NOT_INTERNED:
304+
break;
305+
306+
case SSTATE_INTERNED_MORTAL:
307+
/* revive dead object temporarily for DelItem */
308+
unicode->ob_refcnt = 3;
309+
if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310+
Py_FatalError(
311+
"deletion of interned unicode string failed");
312+
break;
313+
314+
case SSTATE_INTERNED_IMMORTAL:
315+
Py_FatalError("Immortal interned unicode string died.");
316+
317+
default:
318+
Py_FatalError("Inconsistent interned unicode string state.");
319+
}
320+
291321
if (PyUnicode_CheckExact(unicode) &&
292322
unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
293323
/* Keep-Alive optimization */
@@ -8564,6 +8594,115 @@ _PyUnicode_Fini(void)
85648594
unicode_freelist_size = 0;
85658595
}
85668596

8597+
void
8598+
PyUnicode_InternInPlace(PyObject **p)
8599+
{
8600+
register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8601+
PyObject *t;
8602+
if (s == NULL || !PyUnicode_Check(s))
8603+
Py_FatalError(
8604+
"PyUnicode_InternInPlace: unicode strings only please!");
8605+
/* If it's a subclass, we don't really know what putting
8606+
it in the interned dict might do. */
8607+
if (!PyUnicode_CheckExact(s))
8608+
return;
8609+
if (PyUnicode_CHECK_INTERNED(s))
8610+
return;
8611+
if (interned == NULL) {
8612+
interned = PyDict_New();
8613+
if (interned == NULL) {
8614+
PyErr_Clear(); /* Don't leave an exception */
8615+
return;
8616+
}
8617+
}
8618+
t = PyDict_GetItem(interned, (PyObject *)s);
8619+
if (t) {
8620+
Py_INCREF(t);
8621+
Py_DECREF(*p);
8622+
*p = t;
8623+
return;
8624+
}
8625+
8626+
if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8627+
PyErr_Clear();
8628+
return;
8629+
}
8630+
/* The two references in interned are not counted by refcnt.
8631+
The deallocator will take care of this */
8632+
s->ob_refcnt -= 2;
8633+
PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8634+
}
8635+
8636+
void
8637+
PyUnicode_InternImmortal(PyObject **p)
8638+
{
8639+
PyUnicode_InternInPlace(p);
8640+
if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8641+
PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8642+
Py_INCREF(*p);
8643+
}
8644+
}
8645+
8646+
PyObject *
8647+
PyUnicode_InternFromString(const char *cp)
8648+
{
8649+
PyObject *s = PyUnicode_FromString(cp);
8650+
if (s == NULL)
8651+
return NULL;
8652+
PyUnicode_InternInPlace(&s);
8653+
return s;
8654+
}
8655+
8656+
void _Py_ReleaseInternedUnicodeStrings(void)
8657+
{
8658+
PyObject *keys;
8659+
PyUnicodeObject *s;
8660+
Py_ssize_t i, n;
8661+
Py_ssize_t immortal_size = 0, mortal_size = 0;
8662+
8663+
if (interned == NULL || !PyDict_Check(interned))
8664+
return;
8665+
keys = PyDict_Keys(interned);
8666+
if (keys == NULL || !PyList_Check(keys)) {
8667+
PyErr_Clear();
8668+
return;
8669+
}
8670+
8671+
/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8672+
detector, interned unicode strings are not forcibly deallocated;
8673+
rather, we give them their stolen references back, and then clear
8674+
and DECREF the interned dict. */
8675+
8676+
n = PyList_GET_SIZE(keys);
8677+
fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8678+
n);
8679+
for (i = 0; i < n; i++) {
8680+
s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8681+
switch (s->state) {
8682+
case SSTATE_NOT_INTERNED:
8683+
/* XXX Shouldn't happen */
8684+
break;
8685+
case SSTATE_INTERNED_IMMORTAL:
8686+
s->ob_refcnt += 1;
8687+
immortal_size += s->length;
8688+
break;
8689+
case SSTATE_INTERNED_MORTAL:
8690+
s->ob_refcnt += 2;
8691+
mortal_size += s->length;
8692+
break;
8693+
default:
8694+
Py_FatalError("Inconsistent interned string state.");
8695+
}
8696+
s->state = SSTATE_NOT_INTERNED;
8697+
}
8698+
fprintf(stderr, "total size of all interned strings: "
8699+
"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8700+
"mortal/immortal\n", mortal_size, immortal_size);
8701+
Py_DECREF(keys);
8702+
PyDict_Clear(interned);
8703+
Py_DECREF(interned);
8704+
interned = NULL;
8705+
}
85678706

85688707

85698708
/********************* Unicode Iterator **************************/

0 commit comments

Comments
 (0)