[3.13] gh-128013: fix data race in PyUnicode_AsUTF8AndSize on free-threading (#128021) (#128417)
https://github.com/python/cpython/commit/fa6c48e4b3d5b6f80a9aed07d4a5620bf62... commit: fa6c48e4b3d5b6f80a9aed07d4a5620bf62f195f branch: 3.13 author: Kumar Aditya <kumaraditya@python.org> committer: kumaraditya303 <kumaraditya@python.org> date: 2025-01-02T22:10:17+05:30 summary: [3.13] gh-128013: fix data race in PyUnicode_AsUTF8AndSize on free-threading (#128021) (#128417) files: M Lib/test/test_capi/test_unicode.py M Objects/unicodeobject.c diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a69f817c515ba7..f750ec1a56fef9 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1,7 +1,7 @@ import unittest import sys from test import support -from test.support import import_helper +from test.support import threading_helper, import_helper try: import _testcapi @@ -959,6 +959,24 @@ def test_asutf8(self): self.assertRaises(TypeError, unicode_asutf8, [], 0) # CRASHES unicode_asutf8(NULL, 0) + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + @threading_helper.requires_working_threading() + def test_asutf8_race(self): + """Test that there's no race condition in PyUnicode_AsUTF8()""" + unicode_asutf8 = _testcapi.unicode_asutf8 + from threading import Thread + + data = "😊" + + def worker(): + for _ in range(1000): + self.assertEqual(unicode_asutf8(data, 5), b'\xf0\x9f\x98\x8a\0') + + threads = [Thread(target=worker) for _ in range(10)] + with threading_helper.start_threads(threads): + pass + + @support.cpython_only @unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module') def test_asutf8andsize(self): diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d4ce7fd7c97970..7e61d615418722 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -111,20 +111,42 @@ NOTE: In the interpreter's initialization phase, some globals are currently # define _PyUnicode_CHECK(op) PyUnicode_Check(op) #endif -#define _PyUnicode_UTF8(op) \ - (_PyCompactUnicodeObject_CAST(op)->utf8) -#define PyUnicode_UTF8(op) \ - (assert(_PyUnicode_CHECK(op)), \ - PyUnicode_IS_COMPACT_ASCII(op) ? \ - ((char*)(_PyASCIIObject_CAST(op) + 1)) : \ - _PyUnicode_UTF8(op)) -#define _PyUnicode_UTF8_LENGTH(op) \ - (_PyCompactUnicodeObject_CAST(op)->utf8_length) -#define PyUnicode_UTF8_LENGTH(op) \ - (assert(_PyUnicode_CHECK(op)), \ - PyUnicode_IS_COMPACT_ASCII(op) ? \ - _PyASCIIObject_CAST(op)->length : \ - _PyUnicode_UTF8_LENGTH(op)) +static inline char* _PyUnicode_UTF8(PyObject *op) +{ + return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8); +} + +static inline char* PyUnicode_UTF8(PyObject *op) +{ + assert(_PyUnicode_CHECK(op)); + if (PyUnicode_IS_COMPACT_ASCII(op)) { + return ((char*)(_PyASCIIObject_CAST(op) + 1)); + } + else { + return _PyUnicode_UTF8(op); + } +} + +static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8) +{ + FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8); +} + +static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op) +{ + assert(_PyUnicode_CHECK(op)); + if (PyUnicode_IS_COMPACT_ASCII(op)) { + return _PyASCIIObject_CAST(op)->length; + } + else { + return _PyCompactUnicodeObject_CAST(op)->utf8_length; + } +} + +static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length) +{ + _PyCompactUnicodeObject_CAST(op)->utf8_length = length; +} #define _PyUnicode_LENGTH(op) \ (_PyASCIIObject_CAST(op)->length) @@ -132,26 +154,37 @@ NOTE: In the interpreter's initialization phase, some globals are currently (_PyASCIIObject_CAST(op)->state) #define _PyUnicode_HASH(op) \ (_PyASCIIObject_CAST(op)->hash) -#define _PyUnicode_KIND(op) \ - (assert(_PyUnicode_CHECK(op)), \ - _PyASCIIObject_CAST(op)->state.kind) -#define _PyUnicode_GET_LENGTH(op) \ - (assert(_PyUnicode_CHECK(op)), \ - _PyASCIIObject_CAST(op)->length) + +static inline Py_hash_t PyUnicode_HASH(PyObject *op) +{ + assert(_PyUnicode_CHECK(op)); + return FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash); +} + +static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash) +{ + FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash); +} + #define _PyUnicode_DATA_ANY(op) \ (_PyUnicodeObject_CAST(op)->data.any) -#define _PyUnicode_SHARE_UTF8(op) \ - (assert(_PyUnicode_CHECK(op)), \ - assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ - (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) +static inline int _PyUnicode_SHARE_UTF8(PyObject *op) +{ + assert(_PyUnicode_CHECK(op)); + assert(!PyUnicode_IS_COMPACT_ASCII(op)); + return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)); +} /* true if the Unicode object has an allocated UTF-8 memory block (not shared with other data) */ -#define _PyUnicode_HAS_UTF8_MEMORY(op) \ - ((!PyUnicode_IS_COMPACT_ASCII(op) \ - && _PyUnicode_UTF8(op) \ - && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) +static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op) +{ + return (!PyUnicode_IS_COMPACT_ASCII(op) + && _PyUnicode_UTF8(op) != NULL + && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)); +} + /* Generic helper macro to convert characters of different types. from_type and to_type have to be valid type names, begin and end @@ -650,7 +683,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) || kind == PyUnicode_2BYTE_KIND || kind == PyUnicode_4BYTE_KIND); CHECK(ascii->state.ascii == 0); - CHECK(compact->utf8 != data); + CHECK(_PyUnicode_UTF8(op) != data); } else { PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op); @@ -662,16 +695,17 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) CHECK(ascii->state.compact == 0); CHECK(data != NULL); if (ascii->state.ascii) { - CHECK(compact->utf8 == data); + CHECK(_PyUnicode_UTF8(op) == data); CHECK(compact->utf8_length == ascii->length); } else { - CHECK(compact->utf8 != data); + CHECK(_PyUnicode_UTF8(op) != data); } } - - if (compact->utf8 == NULL) +#ifndef Py_GIL_DISABLED + if (_PyUnicode_UTF8(op) == NULL) CHECK(compact->utf8_length == 0); +#endif } /* check that the best kind is used: O(n) operation */ @@ -1115,8 +1149,8 @@ resize_compact(PyObject *unicode, Py_ssize_t length) if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { PyMem_Free(_PyUnicode_UTF8(unicode)); - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; + PyUnicode_SET_UTF8_LENGTH(unicode, 0); + PyUnicode_SET_UTF8(unicode, NULL); } #ifdef Py_TRACE_REFS _Py_ForgetReference(unicode); @@ -1169,8 +1203,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) { PyMem_Free(_PyUnicode_UTF8(unicode)); - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; + PyUnicode_SET_UTF8_LENGTH(unicode, 0); + PyUnicode_SET_UTF8(unicode, NULL); } data = (PyObject *)PyObject_Realloc(data, new_size); @@ -1180,8 +1214,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) } _PyUnicode_DATA_ANY(unicode) = data; if (share_utf8) { - _PyUnicode_UTF8(unicode) = data; - _PyUnicode_UTF8_LENGTH(unicode) = length; + PyUnicode_SET_UTF8_LENGTH(unicode, length); + PyUnicode_SET_UTF8(unicode, data); } _PyUnicode_LENGTH(unicode) = length; PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); @@ -1411,12 +1445,12 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, assert(unicode != NULL); assert(_PyUnicode_CHECK(unicode)); - assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); + assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); ucs4_out = PyUnicode_4BYTE_DATA(unicode); for (iter = begin; iter < end; ) { assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + - _PyUnicode_GET_LENGTH(unicode))); + PyUnicode_GET_LENGTH(unicode))); if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) && (iter+1) < end && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) @@ -1430,7 +1464,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, } } assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + - _PyUnicode_GET_LENGTH(unicode))); + PyUnicode_GET_LENGTH(unicode))); } #endif @@ -1801,7 +1835,7 @@ unicode_modifiable(PyObject *unicode) assert(_PyUnicode_CHECK(unicode)); if (Py_REFCNT(unicode) != 1) return 0; - if (FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(unicode)) != -1) + if (PyUnicode_HASH(unicode) != -1) return 0; if (PyUnicode_CHECK_INTERNED(unicode)) return 0; @@ -4052,6 +4086,21 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr) static int unicode_fill_utf8(PyObject *unicode); + +static int +unicode_ensure_utf8(PyObject *unicode) +{ + int err = 0; + if (PyUnicode_UTF8(unicode) == NULL) { + Py_BEGIN_CRITICAL_SECTION(unicode); + if (PyUnicode_UTF8(unicode) == NULL) { + err = unicode_fill_utf8(unicode); + } + Py_END_CRITICAL_SECTION(); + } + return err; +} + const char * PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) { @@ -4063,13 +4112,11 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) return NULL; } - if (PyUnicode_UTF8(unicode) == NULL) { - if (unicode_fill_utf8(unicode) == -1) { - if (psize) { - *psize = -1; - } - return NULL; + if (unicode_ensure_utf8(unicode) == -1) { + if (psize) { + *psize = -1; } + return NULL; } if (psize) { @@ -5401,6 +5448,7 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, static int unicode_fill_utf8(PyObject *unicode) { + _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode); /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ assert(!PyUnicode_IS_ASCII(unicode)); @@ -5442,10 +5490,10 @@ unicode_fill_utf8(PyObject *unicode) PyErr_NoMemory(); return -1; } - _PyUnicode_UTF8(unicode) = cache; - _PyUnicode_UTF8_LENGTH(unicode) = len; memcpy(cache, start, len); cache[len] = '\0'; + PyUnicode_SET_UTF8_LENGTH(unicode, len); + PyUnicode_SET_UTF8(unicode, cache); _PyBytesWriter_Dealloc(&writer); return 0; } @@ -10996,9 +11044,9 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) return 0; } - Py_hash_t right_hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(right_uni)); + Py_hash_t right_hash = PyUnicode_HASH(right_uni); assert(right_hash != -1); - Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(left)); + Py_hash_t hash = PyUnicode_HASH(left); if (hash != -1 && hash != right_hash) { return 0; } @@ -11484,14 +11532,14 @@ unicode_hash(PyObject *self) #ifdef Py_DEBUG assert(_Py_HashSecret_Initialized); #endif - Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(self)); + Py_hash_t hash = PyUnicode_HASH(self); if (hash != -1) { return hash; } x = _Py_HashBytes(PyUnicode_DATA(self), PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); - FT_ATOMIC_STORE_SSIZE_RELAXED(_PyUnicode_HASH(self), x); + PyUnicode_SET_HASH(self, x); return x; } @@ -14888,8 +14936,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) _PyUnicode_STATE(self).compact = 0; _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; _PyUnicode_STATE(self).statically_allocated = 0; - _PyUnicode_UTF8_LENGTH(self) = 0; - _PyUnicode_UTF8(self) = NULL; + PyUnicode_SET_UTF8_LENGTH(self, 0); + PyUnicode_SET_UTF8(self, NULL); _PyUnicode_DATA_ANY(self) = NULL; share_utf8 = 0; @@ -14919,8 +14967,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) _PyUnicode_DATA_ANY(self) = data; if (share_utf8) { - _PyUnicode_UTF8_LENGTH(self) = length; - _PyUnicode_UTF8(self) = data; + PyUnicode_SET_UTF8_LENGTH(self, length); + PyUnicode_SET_UTF8(self, data); } memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
participants (1)
-
kumaraditya303