[Python-checkins] bpo-40521: Per-interpreter interned strings (GH-20085)

vstinner webhook-mailer at python.org
Fri Dec 25 20:58:41 EST 2020


https://github.com/python/cpython/commit/ea251806b8dffff11b30d2182af1e589caf88acf
commit: ea251806b8dffff11b30d2182af1e589caf88acf
branch: master
author: Victor Stinner <vstinner at python.org>
committer: vstinner <vstinner at python.org>
date: 2020-12-26T02:58:33+01:00
summary:

bpo-40521: Per-interpreter interned strings (GH-20085)

Make the Unicode dictionary of interned strings compatible with
subinterpreters.

Remove the INTERN_NAME_STRINGS macro in typeobject.c: names are
always now interned (even if EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
macro is defined).

_PyUnicode_ClearInterned() now uses PyDict_Next() to no longer
allocate memory, to ensure that the interned dictionary is cleared.

files:
A Misc/NEWS.d/next/Core and Builtins/2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst
M Include/internal/pycore_interp.h
M Objects/typeobject.c
M Objects/unicodeobject.c
M Python/pylifecycle.c

diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 339c2c4c61f78..58b1126712646 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -76,6 +76,17 @@ struct _Py_unicode_state {
        shared as well. */
     PyObject *latin1[256];
     struct _Py_unicode_fs_codec fs_codec;
+
+    /* This dictionary holds all interned unicode strings.  Note that references
+       to strings in this dictionary are *not* counted in the string's ob_refcnt.
+       When the interned string reaches a refcnt of 0 the string deallocation
+       function will delete the reference from this dictionary.
+
+       Another way to look at this is that to say that the actual reference
+       count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
+    */
+    PyObject *interned;
+
     // Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
     struct _Py_unicode_ids ids;
 };
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst
new file mode 100644
index 0000000000000..654757a5f90cb
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst	
@@ -0,0 +1,2 @@
+Make the Unicode dictionary of interned strings compatible with
+subinterpreters. Patch by Victor Stinner.
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 661ccb71ab0c7..43c499a04524f 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -48,11 +48,6 @@ typedef struct PySlot_Offset {
 } PySlot_Offset;
 
 
-/* bpo-40521: Interned strings are shared by all subinterpreters */
-#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
-#  define INTERN_NAME_STRINGS
-#endif
-
 /* alphabetical order */
 _Py_IDENTIFIER(__abstractmethods__);
 _Py_IDENTIFIER(__class__);
@@ -3527,7 +3522,6 @@ type_setattro(PyTypeObject *type, PyObject *name, PyObject *value)
             if (name == NULL)
                 return -1;
         }
-#ifdef INTERN_NAME_STRINGS
         if (!PyUnicode_CHECK_INTERNED(name)) {
             PyUnicode_InternInPlace(&name);
             if (!PyUnicode_CHECK_INTERNED(name)) {
@@ -3537,7 +3531,6 @@ type_setattro(PyTypeObject *type, PyObject *name, PyObject *value)
                 return -1;
             }
         }
-#endif
     }
     else {
         /* Will fail in _PyObject_GenericSetAttrWithDict. */
@@ -7683,17 +7676,10 @@ _PyTypes_InitSlotDefs(void)
     for (slotdef *p = slotdefs; p->name; p++) {
         /* Slots must be ordered by their offset in the PyHeapTypeObject. */
         assert(!p[1].name || p->offset <= p[1].offset);
-#ifdef INTERN_NAME_STRINGS
         p->name_strobj = PyUnicode_InternFromString(p->name);
         if (!p->name_strobj || !PyUnicode_CHECK_INTERNED(p->name_strobj)) {
             return _PyStatus_NO_MEMORY();
         }
-#else
-        p->name_strobj = PyUnicode_FromString(p->name);
-        if (!p->name_strobj) {
-            return _PyStatus_NO_MEMORY();
-        }
-#endif
     }
     slotdefs_initialized = 1;
     return _PyStatus_OK();
@@ -7718,24 +7704,16 @@ update_slot(PyTypeObject *type, PyObject *name)
     int offset;
 
     assert(PyUnicode_CheckExact(name));
-#ifdef INTERN_NAME_STRINGS
     assert(PyUnicode_CHECK_INTERNED(name));
-#endif
 
     assert(slotdefs_initialized);
     pp = ptrs;
     for (p = slotdefs; p->name; p++) {
         assert(PyUnicode_CheckExact(p->name_strobj));
         assert(PyUnicode_CheckExact(name));
-#ifdef INTERN_NAME_STRINGS
         if (p->name_strobj == name) {
             *pp++ = p;
         }
-#else
-        if (p->name_strobj == name || _PyUnicode_EQ(p->name_strobj, name)) {
-            *pp++ = p;
-        }
-#endif
     }
     *pp = NULL;
     for (pp = ptrs; *pp; pp++) {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3238d1e692a51..a03ca9a10d152 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -206,22 +206,6 @@ extern "C" {
 #  define OVERALLOCATE_FACTOR 4
 #endif
 
-/* bpo-40521: Interned strings are shared by all interpreters. */
-#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
-#  define INTERNED_STRINGS
-#endif
-
-/* This dictionary holds all interned unicode strings.  Note that references
-   to strings in this dictionary are *not* counted in the string's ob_refcnt.
-   When the interned string reaches a refcnt of 0 the string deallocation
-   function will delete the reference from this dictionary.
-
-   Another way to look at this is that to say that the actual reference
-   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
-*/
-#ifdef INTERNED_STRINGS
-static PyObject *interned = NULL;
-#endif
 
 static struct _Py_unicode_state*
 get_unicode_state(void)
@@ -1946,7 +1930,8 @@ unicode_dealloc(PyObject *unicode)
         break;
 
     case SSTATE_INTERNED_MORTAL:
-#ifdef INTERNED_STRINGS
+    {
+        struct _Py_unicode_state *state = get_unicode_state();
         /* Revive the dead object temporarily. PyDict_DelItem() removes two
            references (key and value) which were ignored by
            PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
@@ -1954,14 +1939,14 @@ unicode_dealloc(PyObject *unicode)
            PyDict_DelItem(). */
         assert(Py_REFCNT(unicode) == 0);
         Py_SET_REFCNT(unicode, 3);
-        if (PyDict_DelItem(interned, unicode) != 0) {
+        if (PyDict_DelItem(state->interned, unicode) != 0) {
             _PyErr_WriteUnraisableMsg("deletion of interned string failed",
                                       NULL);
         }
         assert(Py_REFCNT(unicode) == 1);
         Py_SET_REFCNT(unicode, 0);
-#endif
         break;
+    }
 
     case SSTATE_INTERNED_IMMORTAL:
         _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
@@ -11536,12 +11521,11 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
     if (PyUnicode_CHECK_INTERNED(left))
         return 0;
 
-#ifdef INTERNED_STRINGS
     assert(_PyUnicode_HASH(right_uni) != -1);
     Py_hash_t hash = _PyUnicode_HASH(left);
-    if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
+    if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
         return 0;
-#endif
+    }
 
     return unicode_compare_eq(left, right_uni);
 }
@@ -15765,23 +15749,21 @@ PyUnicode_InternInPlace(PyObject **p)
         return;
     }
 
-#ifdef INTERNED_STRINGS
     if (PyUnicode_READY(s) == -1) {
         PyErr_Clear();
         return;
     }
 
-    if (interned == NULL) {
-        interned = PyDict_New();
-        if (interned == NULL) {
+    struct _Py_unicode_state *state = get_unicode_state();
+    if (state->interned == NULL) {
+        state->interned = PyDict_New();
+        if (state->interned == NULL) {
             PyErr_Clear(); /* Don't leave an exception */
             return;
         }
     }
 
-    PyObject *t;
-    t = PyDict_SetDefault(interned, s, s);
-
+    PyObject *t = PyDict_SetDefault(state->interned, s, s);
     if (t == NULL) {
         PyErr_Clear();
         return;
@@ -15798,13 +15780,9 @@ PyUnicode_InternInPlace(PyObject **p)
        this. */
     Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
-#else
-    // PyDict expects that interned strings have their hash
-    // (PyASCIIObject.hash) already computed.
-    (void)unicode_hash(s);
-#endif
 }
 
+
 void
 PyUnicode_InternImmortal(PyObject **p)
 {
@@ -15838,35 +15816,25 @@ PyUnicode_InternFromString(const char *cp)
 void
 _PyUnicode_ClearInterned(PyThreadState *tstate)
 {
-    if (!_Py_IsMainInterpreter(tstate)) {
-        // interned dict is shared by all interpreters
-        return;
-    }
-
-    if (interned == NULL) {
-        return;
-    }
-    assert(PyDict_CheckExact(interned));
-
-    PyObject *keys = PyDict_Keys(interned);
-    if (keys == NULL) {
-        PyErr_Clear();
+    struct _Py_unicode_state *state = &tstate->interp->unicode;
+    if (state->interned == NULL) {
         return;
     }
-    assert(PyList_CheckExact(keys));
+    assert(PyDict_CheckExact(state->interned));
 
     /* Interned unicode strings are not forcibly deallocated; rather, we give
        them their stolen references back, and then clear and DECREF the
        interned dict. */
 
-    Py_ssize_t n = PyList_GET_SIZE(keys);
 #ifdef INTERNED_STATS
-    fprintf(stderr, "releasing %zd interned strings\n", n);
+    fprintf(stderr, "releasing %zd interned strings\n",
+            PyDict_GET_SIZE(state->interned));
 
     Py_ssize_t immortal_size = 0, mortal_size = 0;
 #endif
-    for (Py_ssize_t i = 0; i < n; i++) {
-        PyObject *s = PyList_GET_ITEM(keys, i);
+    Py_ssize_t pos = 0;
+    PyObject *s, *ignored_value;
+    while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) {
         assert(PyUnicode_IS_READY(s));
 
         switch (PyUnicode_CHECK_INTERNED(s)) {
@@ -15896,10 +15864,9 @@ _PyUnicode_ClearInterned(PyThreadState *tstate)
             "total size of all interned strings: %zd/%zd mortal/immortal\n",
             mortal_size, immortal_size);
 #endif
-    Py_DECREF(keys);
 
-    PyDict_Clear(interned);
-    Py_CLEAR(interned);
+    PyDict_Clear(state->interned);
+    Py_CLEAR(state->interned);
 }
 
 
@@ -16269,19 +16236,19 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
 void
 _PyUnicode_Fini(PyThreadState *tstate)
 {
+    struct _Py_unicode_state *state = &tstate->interp->unicode;
+
     // _PyUnicode_ClearInterned() must be called before
+    assert(state->interned == NULL);
 
-    struct _Py_unicode_state *state = &tstate->interp->unicode;
+    _PyUnicode_FiniEncodings(&state->fs_codec);
 
-    Py_CLEAR(state->empty_string);
+    unicode_clear_identifiers(tstate);
 
     for (Py_ssize_t i = 0; i < 256; i++) {
         Py_CLEAR(state->latin1[i]);
     }
-
-    unicode_clear_identifiers(tstate);
-
-    _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
+    Py_CLEAR(state->empty_string);
 }
 
 
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index c3c1aa22e94a3..ccbacb481947c 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1573,6 +1573,8 @@ finalize_interp_types(PyThreadState *tstate)
     _PyFrame_Fini(tstate);
     _PyAsyncGen_Fini(tstate);
     _PyContext_Fini(tstate);
+    // Call _PyUnicode_ClearInterned() before _PyDict_Fini() since it uses
+    // a dict internally.
     _PyUnicode_ClearInterned(tstate);
 
     _PyDict_Fini(tstate);



More information about the Python-checkins mailing list