Mailman 3 August 2018 - Python-checkins

bpo-34523: Support surrogatepass in locale codecs (GH-8995)
by Victor Stinner Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/3d4226a832cabc630402589cc671cc4035d504e5 commit: 3d4226a832cabc630402589cc671cc4035d504e5 branch: master author: Victor Stinner <vstinner(a)redhat.com> committer: GitHub <noreply(a)github.com> date: 2018-08-29T22:21:32+02:00 summary: bpo-34523: Support surrogatepass in locale codecs (GH-8995) Add support for the "surrogatepass" error handler in PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() for the UTF-8 encoding. Changes: * _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the surrogatepass error handler (_Py_ERROR_SURROGATEPASS). * _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use the _Py_error_handler enum instead of "int surrogateescape" to pass the error handler. These functions now return -3 if the error handler is unknown. * Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() in test_codecs. * Rename get_error_handler() to _Py_GetErrorHandler() and expose it as a private function. * _freeze_importlib doesn't need config.filesystem_errors="strict" workaround anymore. 
files: M Include/fileutils.h M Lib/test/test_codecs.py M Modules/_testcapimodule.c M Objects/stringlib/codecs.h M Objects/unicodeobject.c M Programs/_freeze_importlib.c M Python/fileutils.c diff --git a/Include/fileutils.h b/Include/fileutils.h index 370878469df7..f0a8e2c61a4f 100644 --- a/Include/fileutils.h +++ b/Include/fileutils.h @@ -5,6 +5,24 @@ extern "C" { #endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000 +typedef enum { + _Py_ERROR_UNKNOWN=0, + _Py_ERROR_STRICT, + _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_REPLACE, + _Py_ERROR_IGNORE, + _Py_ERROR_BACKSLASHREPLACE, + _Py_ERROR_SURROGATEPASS, + _Py_ERROR_XMLCHARREFREPLACE, + _Py_ERROR_OTHER +} _Py_error_handler; + +PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors); +#endif + + #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 PyAPI_FUNC(wchar_t *) Py_DecodeLocale( const char *arg, @@ -26,7 +44,7 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex( wchar_t **wstr, size_t *wlen, const char **reason, - int surrogateescape); + _Py_error_handler errors); PyAPI_FUNC(int) _Py_EncodeUTF8Ex( const wchar_t *text, @@ -34,19 +52,22 @@ PyAPI_FUNC(int) _Py_EncodeUTF8Ex( size_t *error_pos, const char **reason, int raw_malloc, - int surrogateescape); + _Py_error_handler errors); PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape( const char *arg, Py_ssize_t arglen); +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000 PyAPI_FUNC(int) _Py_DecodeLocaleEx( const char *arg, wchar_t **wstr, size_t *wlen, const char **reason, int current_locale, - int surrogateescape); + _Py_error_handler errors); PyAPI_FUNC(int) _Py_EncodeLocaleEx( const wchar_t *text, @@ -54,7 +75,7 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx( size_t *error_pos, const char **reason, int current_locale, - int surrogateescape); + _Py_error_handler errors); #endif #ifndef Py_LIMITED_API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 86d0dde17057..00b5d317c401 100644 --- 
a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -9,6 +9,11 @@ from test import support +try: + import _testcapi +except ImportError as exc: + _testcapi = None + try: import ctypes except ImportError: @@ -2051,13 +2056,12 @@ def test_basics(self): @support.cpython_only def test_basics_capi(self): - from _testcapi import codec_incrementalencoder, codec_incrementaldecoder s = "abc123" # all codecs should be able to encode these for encoding in all_unicode_encodings: if encoding not in broken_unicode_with_stateful: # check incremental decoder/encoder (fetched via the C API) try: - cencoder = codec_incrementalencoder(encoding) + cencoder = _testcapi.codec_incrementalencoder(encoding) except LookupError: # no IncrementalEncoder pass else: @@ -2066,7 +2070,7 @@ def test_basics_capi(self): for c in s: encodedresult += cencoder.encode(c) encodedresult += cencoder.encode("", True) - cdecoder = codec_incrementaldecoder(encoding) + cdecoder = _testcapi.codec_incrementaldecoder(encoding) decodedresult = "" for c in encodedresult: decodedresult += cdecoder.decode(bytes([c])) @@ -2077,12 +2081,12 @@ def test_basics_capi(self): if encoding not in ("idna", "mbcs"): # check incremental decoder/encoder with errors argument try: - cencoder = codec_incrementalencoder(encoding, "ignore") + cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore") except LookupError: # no IncrementalEncoder pass else: encodedresult = b"".join(cencoder.encode(c) for c in s) - cdecoder = codec_incrementaldecoder(encoding, "ignore") + cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore") decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult) self.assertEqual(decodedresult, s, @@ -3263,5 +3267,109 @@ def test_decode(self): self.assertEqual(data.decode('latin1'), expected) +@unittest.skipIf(_testcapi is None, 'need _testcapi module') +class LocaleCodecTest(unittest.TestCase): + """ + Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex(). 
+ """ + ENCODING = sys.getfilesystemencoding() + STRINGS = ("ascii", "ulatin1:\xa7\xe9", + "u255:\xff", + "UCS:\xe9\u20ac\U0010ffff", + "surrogates:\uDC80\uDCFF") + BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff") + SURROGATES = "\uDC80\uDCFF" + + def encode(self, text, errors="strict"): + return _testcapi.EncodeLocaleEx(text, 0, errors) + + def check_encode_strings(self, errors): + for text in self.STRINGS: + with self.subTest(text=text): + try: + expected = text.encode(self.ENCODING, errors) + except UnicodeEncodeError: + with self.assertRaises(RuntimeError) as cm: + self.encode(self.SURROGATES) + errmsg = str(cm.exception) + self.assertTrue(errmsg.startswith("encode error: pos=0, reason="), errmsg) + else: + encoded = self.encode(text, errors) + self.assertEqual(encoded, expected) + + def test_encode_strict(self): + self.check_encode_strings("strict") + + def test_encode_surrogateescape(self): + self.check_encode_strings("surrogateescape") + + def test_encode_surrogatepass(self): + try: + self.encode('', 'surrogatepass') + except ValueError as exc: + if str(exc) == 'unsupported error handler': + self.skipTest(f"{self.ENCODING!r} encoder doesn't support " + f"surrogatepass error handler") + else: + raise + + self.check_encode_strings("surrogatepass") + + def decode(self, encoded, errors="strict"): + return _testcapi.DecodeLocaleEx(encoded, 0, errors) + + def check_decode_strings(self, errors): + is_utf8 = (self.ENCODING == "utf-8") + if is_utf8: + encode_errors = 'surrogateescape' + else: + encode_errors = 'strict' + + strings = list(self.BYTES_STRINGS) + for text in self.STRINGS: + try: + encoded = text.encode(self.ENCODING, encode_errors) + if encoded not in strings: + strings.append(encoded) + except UnicodeEncodeError: + encoded = None + + if is_utf8: + encoded2 = text.encode(self.ENCODING, 'surrogatepass') + if encoded2 != encoded: + strings.append(encoded2) + + for encoded in strings: + with self.subTest(encoded=encoded): + try: + expected = 
encoded.decode(self.ENCODING, errors) + except UnicodeDecodeError: + with self.assertRaises(RuntimeError) as cm: + self.decode(encoded, errors) + errmsg = str(cm.exception) + self.assertTrue(errmsg.startswith("decode error: "), errmsg) + else: + decoded = self.decode(encoded, errors) + self.assertEqual(decoded, expected) + + def test_decode_strict(self): + self.check_decode_strings("strict") + + def test_decode_surrogateescape(self): + self.check_decode_strings("surrogateescape") + + def test_decode_surrogatepass(self): + try: + self.decode(b'', 'surrogatepass') + except ValueError as exc: + if str(exc) == 'unsupported error handler': + self.skipTest(f"{self.ENCODING!r} decoder doesn't support " + f"surrogatepass error handler") + else: + raise + + self.check_decode_strings("surrogatepass") + + if __name__ == "__main__": unittest.main() diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c index 014c2f325af3..7c2c57b98001 100644 --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args) } +static PyObject * +encode_locale_ex(PyObject *self, PyObject *args) +{ + PyObject *unicode; + int current_locale = 0; + wchar_t *wstr; + PyObject *res = NULL; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "U|is", &unicode, &current_locale, &errors)) { + return NULL; + } + wstr = PyUnicode_AsWideCharString(unicode, NULL); + if (wstr == NULL) { + return NULL; + } + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); + + char *str = NULL; + size_t error_pos; + const char *reason = NULL; + int ret = _Py_EncodeLocaleEx(wstr, + &str, &error_pos, &reason, + current_locale, error_handler); + PyMem_Free(wstr); + + switch(ret) { + case 0: + res = PyBytes_FromString(str); + PyMem_RawFree(str); + break; + case -1: + PyErr_NoMemory(); + break; + case -2: + PyErr_Format(PyExc_RuntimeError, "encode error: pos=%zu, reason=%s", + error_pos, reason); + break; + case -3: + 
PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + break; + default: + PyErr_SetString(PyExc_ValueError, "unknow error code"); + break; + } + return res; +} + + +static PyObject * +decode_locale_ex(PyObject *self, PyObject *args) +{ + char *str; + int current_locale = 0; + PyObject *res = NULL; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "y|is", &str, &current_locale, &errors)) { + return NULL; + } + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); + + wchar_t *wstr = NULL; + size_t wlen = 0; + const char *reason = NULL; + int ret = _Py_DecodeLocaleEx(str, + &wstr, &wlen, &reason, + current_locale, error_handler); + + switch(ret) { + case 0: + res = PyUnicode_FromWideChar(wstr, wlen); + PyMem_RawFree(wstr); + break; + case -1: + PyErr_NoMemory(); + break; + case -2: + PyErr_Format(PyExc_RuntimeError, "decode error: pos=%zu, reason=%s", + wlen, reason); + break; + case -3: + PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + break; + default: + PyErr_SetString(PyExc_ValueError, "unknow error code"); + break; + } + return res; +} + + static PyMethodDef TestMethods[] = { {"raise_exception", raise_exception, METH_VARARGS}, {"raise_memoryerror", raise_memoryerror, METH_NOARGS}, @@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = { {"get_mapping_items", get_mapping_items, METH_O}, {"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS}, {"hamt", new_hamt, METH_NOARGS}, + {"EncodeLocaleEx", encode_locale_ex, METH_VARARGS}, + {"DecodeLocaleEx", decode_locale_ex, METH_VARARGS}, {NULL, NULL} /* sentinel */ }; diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index f019d9a96bfb..0abb4c8abb92 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, Py_ssize_t startpos, endpos, newpos; Py_ssize_t k; if (error_handler == _Py_ERROR_UNKNOWN) { - error_handler = get_error_handler(errors); + 
error_handler = _Py_GetErrorHandler(errors); } startpos = i-1; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 60adcd9c88df..a797f838eb41 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -318,20 +318,8 @@ static int convert_uc(PyObject *obj, void *addr); #include "clinic/unicodeobject.c.h" -typedef enum { - _Py_ERROR_UNKNOWN=0, - _Py_ERROR_STRICT, - _Py_ERROR_SURROGATEESCAPE, - _Py_ERROR_REPLACE, - _Py_ERROR_IGNORE, - _Py_ERROR_BACKSLASHREPLACE, - _Py_ERROR_SURROGATEPASS, - _Py_ERROR_XMLCHARREFREPLACE, - _Py_ERROR_OTHER -} _Py_error_handler; - -static _Py_error_handler -get_error_handler(const char *errors) +_Py_error_handler +_Py_GetErrorHandler(const char *errors) { if (errors == NULL || strcmp(errors, "strict") == 0) { return _Py_ERROR_STRICT; @@ -3327,34 +3315,12 @@ PyUnicode_AsEncodedObject(PyObject *unicode, return NULL; } -static int -locale_error_handler(const char *errors, int *surrogateescape) -{ - _Py_error_handler error_handler = get_error_handler(errors); - switch (error_handler) - { - case _Py_ERROR_STRICT: - *surrogateescape = 0; - return 0; - case _Py_ERROR_SURROGATEESCAPE: - *surrogateescape = 1; - return 0; - default: - PyErr_Format(PyExc_ValueError, - "only 'strict' and 'surrogateescape' error handlers " - "are supported, not '%s'", - errors); - return -1; - } -} static PyObject * unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale) { - int surrogateescape; - if (locale_error_handler(errors, &surrogateescape) < 0) - return NULL; + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); Py_ssize_t wlen; wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen); @@ -3373,7 +3339,7 @@ unicode_encode_locale(PyObject *unicode, const char *errors, size_t error_pos; const char *reason; int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason, - current_locale, surrogateescape); + current_locale, error_handler); if (res != 0) { if (res == -2) { PyObject *exc; @@ -3388,6 
+3354,9 @@ unicode_encode_locale(PyObject *unicode, const char *errors, } return NULL; } + else if (res == -3) { + PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + } else { PyErr_NoMemory(); PyMem_Free(wstr); @@ -3571,9 +3540,7 @@ static PyObject* unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, int current_locale) { - int surrogateescape; - if (locale_error_handler(errors, &surrogateescape) < 0) - return NULL; + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); if (str[len] != '\0' || (size_t)len != strlen(str)) { PyErr_SetString(PyExc_ValueError, "embedded null byte"); @@ -3584,7 +3551,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, size_t wlen; const char *reason; int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, - current_locale, surrogateescape); + current_locale, error_handler); if (res != 0) { if (res == -2) { PyObject *exc; @@ -3598,6 +3565,9 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, Py_DECREF(exc); } } + else if (res == -3) { + PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + } else { PyErr_NoMemory(); } @@ -4863,7 +4833,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, } if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { case _Py_ERROR_IGNORE: @@ -4932,13 +4902,29 @@ PyUnicode_DecodeUTF8Stateful(const char *s, is not NULL, write the decoding error message into *reason. 
*/ int _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { const char *orig_s = s; const char *e; wchar_t *unicode; Py_ssize_t outpos; + int surrogateescape = 0; + int surrogatepass = 0; + switch (errors) + { + case _Py_ERROR_STRICT: + break; + case _Py_ERROR_SURROGATEESCAPE: + surrogateescape = 1; + break; + case _Py_ERROR_SURROGATEPASS: + surrogatepass = 1; + break; + default: + return -3; + } + /* Note: size will always be longer than the resulting Unicode character count */ if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { @@ -4971,31 +4957,47 @@ _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, #endif } else { - if (!ch && s == e) + if (!ch && s == e) { break; - if (!surrogateescape) { - PyMem_RawFree(unicode ); - if (reason != NULL) { - switch (ch) { - case 0: - *reason = "unexpected end of data"; - break; - case 1: - *reason = "invalid start byte"; - break; - /* 2, 3, 4 */ - default: - *reason = "invalid continuation byte"; - break; - } + } + + if (surrogateescape) { + unicode[outpos++] = 0xDC00 + (unsigned char)*s++; + } + else { + /* Is it a valid three-byte code? 
*/ + if (surrogatepass + && (e - s) >= 3 + && (s[0] & 0xf0) == 0xe0 + && (s[1] & 0xc0) == 0x80 + && (s[2] & 0xc0) == 0x80) + { + ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); + s += 3; + unicode[outpos++] = ch; } - if (wlen != NULL) { - *wlen = s - orig_s; + else { + PyMem_RawFree(unicode ); + if (reason != NULL) { + switch (ch) { + case 0: + *reason = "unexpected end of data"; + break; + case 1: + *reason = "invalid start byte"; + break; + /* 2, 3, 4 */ + default: + *reason = "invalid continuation byte"; + break; + } + } + if (wlen != NULL) { + *wlen = s - orig_s; + } + return -2; } - return -2; } - /* surrogateescape */ - unicode[outpos++] = 0xDC00 + (unsigned char)*s++; } } unicode[outpos] = L'\0'; @@ -5030,13 +5032,29 @@ _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen) On memory allocation failure, return -1. */ int _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, - const char **reason, int raw_malloc, int surrogateescape) + const char **reason, int raw_malloc, _Py_error_handler errors) { const Py_ssize_t max_char_size = 4; Py_ssize_t len = wcslen(text); assert(len >= 0); + int surrogateescape = 0; + int surrogatepass = 0; + switch (errors) + { + case _Py_ERROR_STRICT: + break; + case _Py_ERROR_SURROGATEESCAPE: + surrogateescape = 1; + break; + case _Py_ERROR_SURROGATEPASS: + surrogatepass = 1; + break; + default: + return -3; + } + if (len > PY_SSIZE_T_MAX / max_char_size - 1) { return -1; } @@ -5053,8 +5071,19 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, char *p = bytes; Py_ssize_t i; - for (i = 0; i < len; i++) { + for (i = 0; i < len; ) { + Py_ssize_t ch_pos = i; Py_UCS4 ch = text[i]; + i++; +#if Py_UNICODE_SIZE == 2 + if (Py_UNICODE_IS_HIGH_SURROGATE(ch) + && i < len + && Py_UNICODE_IS_LOW_SURROGATE(text[i])) + { + ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]); + i++; + } +#endif if (ch < 0x80) { /* Encode ASCII */ @@ -5066,11 +5095,11 @@ _Py_EncodeUTF8Ex(const wchar_t 
*text, char **str, size_t *error_pos, *p++ = (char)(0xc0 | (ch >> 6)); *p++ = (char)(0x80 | (ch & 0x3f)); } - else if (Py_UNICODE_IS_SURROGATE(ch)) { + else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) { /* surrogateescape error handler */ if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { if (error_pos != NULL) { - *error_pos = (size_t)i; + *error_pos = (size_t)ch_pos; } if (reason != NULL) { *reason = "encoding error"; @@ -6741,7 +6770,7 @@ unicode_encode_ucs1(PyObject *unicode, /* cache callback name lookup (if not done yet, i.e. it's the first error) */ if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { case _Py_ERROR_STRICT: @@ -6945,7 +6974,7 @@ PyUnicode_DecodeASCII(const char *s, /* byte outsize range 0x00..0x7f: call the error handler */ if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { @@ -8404,7 +8433,7 @@ charmap_encoding_error( /* cache callback name lookup * (if not done yet, i.e. it's the first error) */ if (*error_handler == _Py_ERROR_UNKNOWN) - *error_handler = get_error_handler(errors); + *error_handler = _Py_GetErrorHandler(errors); switch (*error_handler) { case _Py_ERROR_STRICT: diff --git a/Programs/_freeze_importlib.c b/Programs/_freeze_importlib.c index 2621a7687edd..8830d131d6f4 100644 --- a/Programs/_freeze_importlib.c +++ b/Programs/_freeze_importlib.c @@ -82,14 +82,6 @@ main(int argc, char *argv[]) /* Don't install importlib, since it could execute outdated bytecode. */ config._install_importlib = 0; config._frozen = 1; -#ifdef MS_WINDOWS - /* bpo-34523: initfsencoding() is not called if _install_importlib=0, - so interp->fscodec_initialized value remains 0. - PyUnicode_EncodeFSDefault() doesn't support the "surrogatepass" error - handler in such case, whereas it's the default error handler on Windows. 
- Force the "strict" error handler to work around this bootstrap issue. */ - config.filesystem_errors = "strict"; -#endif _PyInitError err = _Py_InitializeFromConfig(&config); /* No need to call _PyCoreConfig_Clear() since we didn't allocate any diff --git a/Python/fileutils.c b/Python/fileutils.c index 9a3c334d43bf..0486f865924a 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -32,6 +32,24 @@ extern int winerror_to_errno(int); int _Py_open_cloexec_works = -1; #endif + +static int +get_surrogateescape(_Py_error_handler errors, int *surrogateescape) +{ + switch (errors) + { + case _Py_ERROR_STRICT: + *surrogateescape = 0; + return 0; + case _Py_ERROR_SURROGATEESCAPE: + *surrogateescape = 1; + return 0; + default: + return -1; + } +} + + PyObject * _Py_device_encoding(int fd) { @@ -215,12 +233,17 @@ _Py_GetForceASCII(void) static int encode_ascii(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int surrogateescape) + int raw_malloc, _Py_error_handler errors) { char *result = NULL, *out; size_t len, i; wchar_t ch; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + len = wcslen(text); /* +1 for NULL byte */ @@ -278,13 +301,18 @@ _Py_GetForceASCII(void) #if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII) static int decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { wchar_t *res; unsigned char *in; wchar_t *out; size_t argsize = strlen(arg) + 1; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) { return -1; } @@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, static int decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { wchar_t 
*res; size_t argsize; @@ -336,6 +364,11 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, mbstate_t mbs; #endif + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of * mbstowcs which does not count the characters that @@ -456,7 +489,7 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, /* Cannot use C locale for escaping; manually escape as if charset is ASCII (i.e. escape all bytes > 128. This will still roundtrip correctly in the locale's charset, which must be an ASCII superset. */ - return decode_ascii(arg, wstr, wlen, reason, surrogateescape); + return decode_ascii(arg, wstr, wlen, reason, errors); #endif /* HAVE_MBRTOWC */ } @@ -479,33 +512,35 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, invalid byte sequence in the input string into *wlen. If reason is not NULL, write the decoding error message into *reason. + Return -3 if the error handler 'errors' is not supported. + Use the Py_EncodeLocaleEx() function to encode the character string back to a byte string. 
*/ int _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, const char **reason, - int current_locale, int surrogateescape) + int current_locale, _Py_error_handler errors) { if (current_locale) { #ifdef __ANDROID__ return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, - surrogateescape); + errors); #else - return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); + return decode_current_locale(arg, wstr, wlen, reason, errors); #endif } #if defined(__APPLE__) || defined(__ANDROID__) return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, - surrogateescape); + errors); #else int use_utf8 = (Py_UTF8Mode == 1); #ifdef MS_WINDOWS use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; #endif if (use_utf8) { - return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, - reason, surrogateescape); + return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, + errors); } #ifdef USE_FORCE_ASCII @@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, if (force_ascii) { /* force ASCII encoding to workaround mbstowcs() issue */ - return decode_ascii(arg, wstr, wlen, reason, surrogateescape); + return decode_ascii(arg, wstr, wlen, reason, errors); } #endif - return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); + return decode_current_locale(arg, wstr, wlen, reason, errors); #endif /* __APPLE__ or __ANDROID__ */ } @@ -547,8 +582,11 @@ wchar_t* Py_DecodeLocale(const char* arg, size_t *wlen) { wchar_t *wstr; - int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1); + int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, + NULL, 0, + _Py_ERROR_SURROGATEESCAPE); if (res != 0) { + assert(res != -3); if (wlen != NULL) { *wlen = (size_t)res; } @@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen) static int encode_current_locale(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int surrogateescape) + int raw_malloc, _Py_error_handler errors) { const size_t 
len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. outputs the bytes */ @@ -646,32 +689,50 @@ encode_current_locale(const wchar_t *text, char **str, return -2; } + +/* Encode a string to the locale encoding. + + Parameters: + + * raw_malloc: if non-zero, allocate memory using PyMem_RawMalloc() instead + of PyMem_Malloc(). + * current_locale: if non-zero, use the current LC_CTYPE, otherwise use + Python filesystem encoding. + * errors: error handler like "strict" or "surrogateescape". + + Return value: + + 0: success, *str is set to a newly allocated decoded string. + -1: memory allocation failure + -2: encoding error, set *error_pos and *reason (if set). + -3: the error handler 'errors' is not supported. + */ static int encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int current_locale, int surrogateescape) + int raw_malloc, int current_locale, _Py_error_handler errors) { if (current_locale) { #ifdef __ANDROID__ return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #else return encode_current_locale(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #endif } #if defined(__APPLE__) || defined(__ANDROID__) return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); -#else /* __APPLE__ */ + raw_malloc, errors); +#else int use_utf8 = (Py_UTF8Mode == 1); #ifdef MS_WINDOWS use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; #endif if (use_utf8) { return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); } #ifdef USE_FORCE_ASCII @@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t 
*error_pos, if (force_ascii) { return encode_ascii(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); } #endif return encode_current_locale(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #endif /* __APPLE__ or __ANDROID__ */ } @@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos, { char *str; int res = encode_locale_ex(text, &str, error_pos, NULL, - raw_malloc, current_locale, 1); + raw_malloc, current_locale, + _Py_ERROR_SURROGATEESCAPE); if (res != -2 && error_pos) { *error_pos = (size_t)-1; } @@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos) int _Py_EncodeLocaleEx(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int current_locale, int surrogateescape) + int current_locale, _Py_error_handler errors) { return encode_locale_ex(text, str, error_pos, reason, 1, - current_locale, surrogateescape); + current_locale, errors); }

1 0

bpo-34523: Py_DecodeLocale() use UTF-8 on Windows (GH-8998)
by Victor Stinner Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/c5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2 commit: c5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2 branch: master author: Victor Stinner <vstinner(a)redhat.com> committer: GitHub <noreply(a)github.com> date: 2018-08-29T19:32:47+02:00 summary: bpo-34523: Py_DecodeLocale() use UTF-8 on Windows (GH-8998) Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding on Windows if Py_LegacyWindowsFSEncodingFlag is zero. pymain_read_conf() now sets Py_LegacyWindowsFSEncodingFlag in its loop, but restore its value at exit. files: A Misc/NEWS.d/next/C API/2018-08-29-18-48-47.bpo-34523.lLQ8rh.rst M Doc/c-api/sys.rst M Lib/test/test_embed.py M Modules/main.c M Python/fileutils.c diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst index 994509aa50f2..0eee35a1285c 100644 --- a/Doc/c-api/sys.rst +++ b/Doc/c-api/sys.rst @@ -109,6 +109,7 @@ Operating System Utilities Encoding, highest priority to lowest priority: * ``UTF-8`` on macOS and Android; + * ``UTF-8`` on Windows if :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero; * ``UTF-8`` if the Python UTF-8 mode is enabled; * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``, ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias), @@ -140,6 +141,10 @@ Operating System Utilities .. versionchanged:: 3.7 The function now uses the UTF-8 encoding in the UTF-8 mode. + .. versionchanged:: 3.8 + The function now uses the UTF-8 encoding on Windows if + :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero; + .. 
c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos) @@ -150,6 +155,7 @@ Operating System Utilities Encoding, highest priority to lowest priority: * ``UTF-8`` on macOS and Android; + * ``UTF-8`` on Windows if :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero; * ``UTF-8`` if the Python UTF-8 mode is enabled; * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``, ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias), @@ -169,9 +175,6 @@ Operating System Utilities Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back to a wide character string. - .. versionchanged:: 3.7 - The function now uses the UTF-8 encoding in the UTF-8 mode. - .. seealso:: The :c:func:`PyUnicode_EncodeFSDefault` and @@ -180,7 +183,11 @@ Operating System Utilities .. versionadded:: 3.5 .. versionchanged:: 3.7 - The function now supports the UTF-8 mode. + The function now uses the UTF-8 encoding in the UTF-8 mode. + + .. versionchanged:: 3.8 + The function now uses the UTF-8 encoding on Windows if + :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero; .. _systemfunctions: diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index b6311e4b334b..9155c40f405e 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -268,10 +268,10 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'dump_refs': 0, 'malloc_stats': 0, - # None means that the default encoding is read at runtime: - # see get_locale_encoding(). 
+ # None means that the value is get by get_locale_encoding() 'filesystem_encoding': None, - 'filesystem_errors': sys.getfilesystemencodeerrors(), + 'filesystem_errors': None, + 'utf8_mode': 0, 'coerce_c_locale': 0, 'coerce_c_locale_warn': 0, @@ -294,7 +294,8 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'quiet': 0, 'user_site_directory': 1, 'buffered_stdio': 1, - # None means that check_config() gets the expected encoding at runtime + + # None means that the value is get by get_stdio_encoding() 'stdio_encoding': None, 'stdio_errors': None, @@ -303,7 +304,6 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): '_frozen': 0, } - def get_stdio_encoding(self, env): code = 'import sys; print(sys.stdout.encoding, sys.stdout.errors)' args = (sys.executable, '-c', code) @@ -315,18 +315,12 @@ def get_stdio_encoding(self, env): out = proc.stdout.rstrip() return out.split() - def get_locale_encoding(self, isolated): - if sys.platform in ('win32', 'darwin') or support.is_android: - # Windows, macOS and Android use UTF-8 - return "utf-8" - - code = ('import codecs, locale, sys', - 'locale.setlocale(locale.LC_CTYPE, "")', - 'enc = locale.nl_langinfo(locale.CODESET)', - 'enc = codecs.lookup(enc).name', - 'print(enc)') - args = (sys.executable, '-c', '; '.join(code)) - env = dict(os.environ) + def get_filesystem_encoding(self, isolated, env): + code = ('import codecs, locale, sys; ' + 'print(sys.getfilesystemencoding(), ' + 'sys.getfilesystemencodeerrors())') + args = (sys.executable, '-c', code) + env = dict(env) if not isolated: env['PYTHONCOERCECLOCALE'] = '0' env['PYTHONUTF8'] = '0' @@ -336,7 +330,8 @@ def get_locale_encoding(self, isolated): if proc.returncode: raise Exception(f"failed to get the locale encoding: " f"stdout={proc.stdout!r} stderr={proc.stderr!r}") - return proc.stdout.rstrip() + out = proc.stdout.rstrip() + return out.split() def check_config(self, testname, expected): expected = dict(self.DEFAULT_CONFIG, **expected) @@ -356,8 
+351,12 @@ def check_config(self, testname, expected): expected['stdio_encoding'] = res[0] if expected['stdio_errors'] is None: expected['stdio_errors'] = res[1] - if expected['filesystem_encoding'] is None: - expected['filesystem_encoding'] = self.get_locale_encoding(expected['isolated']) + if expected['filesystem_encoding'] is None or expected['filesystem_errors'] is None: + res = self.get_filesystem_encoding(expected['isolated'], env) + if expected['filesystem_encoding'] is None: + expected['filesystem_encoding'] = res[0] + if expected['filesystem_errors'] is None: + expected['filesystem_errors'] = res[1] for key, value in expected.items(): expected[key] = str(value) diff --git a/Misc/NEWS.d/next/C API/2018-08-29-18-48-47.bpo-34523.lLQ8rh.rst b/Misc/NEWS.d/next/C API/2018-08-29-18-48-47.bpo-34523.lLQ8rh.rst new file mode 100644 index 000000000000..95368f1c6847 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2018-08-29-18-48-47.bpo-34523.lLQ8rh.rst @@ -0,0 +1,2 @@ +Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding on +Windows if Py_LegacyWindowsFSEncodingFlag is zero. diff --git a/Modules/main.c b/Modules/main.c index 2e9e23b652f3..bf7290a54a45 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -1287,6 +1287,9 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, _PyCmdline *cmdline) { int init_utf8_mode = Py_UTF8Mode; +#ifdef MS_WINDOWS + int init_legacy_encoding = Py_LegacyWindowsFSEncodingFlag; +#endif _PyCoreConfig save_config = _PyCoreConfig_INIT; int res = -1; @@ -1313,9 +1316,12 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, goto done; } - /* bpo-34207: Py_DecodeLocale(), Py_EncodeLocale() and similar - functions depend on Py_UTF8Mode. */ + /* bpo-34207: Py_DecodeLocale() and Py_EncodeLocale() depend + on Py_UTF8Mode and Py_LegacyWindowsFSEncodingFlag. 
*/ Py_UTF8Mode = config->utf8_mode; +#ifdef MS_WINDOWS + Py_LegacyWindowsFSEncodingFlag = config->legacy_windows_fs_encoding; +#endif if (pymain_init_cmdline_argv(pymain, config, cmdline) < 0) { goto done; @@ -1380,6 +1386,9 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, done: _PyCoreConfig_Clear(&save_config); Py_UTF8Mode = init_utf8_mode ; +#ifdef MS_WINDOWS + Py_LegacyWindowsFSEncodingFlag = init_legacy_encoding; +#endif return res; } diff --git a/Python/fileutils.c b/Python/fileutils.c index e756c260cdcc..9a3c334d43bf 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -499,9 +499,13 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, surrogateescape); #else - if (Py_UTF8Mode == 1) { - return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, - surrogateescape); + int use_utf8 = (Py_UTF8Mode == 1); +#ifdef MS_WINDOWS + use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; +#endif + if (use_utf8) { + return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, + reason, surrogateescape); } #ifdef USE_FORCE_ASCII @@ -661,7 +665,11 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, return _Py_EncodeUTF8Ex(text, str, error_pos, reason, raw_malloc, surrogateescape); #else /* __APPLE__ */ - if (Py_UTF8Mode == 1) { + int use_utf8 = (Py_UTF8Mode == 1); +#ifdef MS_WINDOWS + use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; +#endif + if (use_utf8) { return _Py_EncodeUTF8Ex(text, str, error_pos, reason, raw_malloc, surrogateescape); }

1 0

bpo-34523: Fix config_init_fs_encoding() (GH-8991)
by Victor Stinner Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/70fead25e503a742ad4c919b151b9b2b5facee36 commit: 70fead25e503a742ad4c919b151b9b2b5facee36 branch: master author: Victor Stinner <vstinner(a)redhat.com> committer: GitHub <noreply(a)github.com> date: 2018-08-29T13:45:34+02:00 summary: bpo-34523: Fix config_init_fs_encoding() (GH-8991) Call config_init_fs_encoding() if filesystem_errors is not NULL but filesystem_encoding is NULL. files: M Python/coreconfig.c diff --git a/Python/coreconfig.c b/Python/coreconfig.c index 0ec4640336c7..625c743a419a 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -1344,7 +1344,7 @@ _PyCoreConfig_Read(_PyCoreConfig *config) config->argc = 0; } - if (config->filesystem_encoding == NULL && config->filesystem_errors == NULL) { + if (config->filesystem_encoding == NULL || config->filesystem_errors == NULL) { err = config_init_fs_encoding(config); if (_Py_INIT_FAILED(err)) { return err;

1 0

bpo-34523: Add _PyCoreConfig.filesystem_encoding (GH-8963)
by Victor Stinner Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/b2457efc78b74a1d6d1b77d11a939e886b8a4e2c commit: b2457efc78b74a1d6d1b77d11a939e886b8a4e2c branch: master author: Victor Stinner <vstinner(a)redhat.com> committer: GitHub <noreply(a)github.com> date: 2018-08-29T13:25:36+02:00 summary: bpo-34523: Add _PyCoreConfig.filesystem_encoding (GH-8963) _PyCoreConfig_Read() is now responsible for choosing the filesystem encoding and error handler. Using Py_Main(), the encoding is now chosen even before calling Py_Initialize(). _PyCoreConfig.filesystem_encoding is now the reference, instead of Py_FileSystemDefaultEncoding, for the Python filesystem encoding. Changes: * Add filesystem_encoding and filesystem_errors to _PyCoreConfig * _PyCoreConfig_Read() now reads the locale encoding for the file system encoding. * PyUnicode_EncodeFSDefault() and PyUnicode_DecodeFSDefaultAndSize() now use the interpreter configuration rather than Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors global configuration variables. * Add _Py_SetFileSystemEncoding() and _Py_ClearFileSystemEncoding() private functions to only modify Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors in coreconfig.c. * _Py_CoerceLegacyLocale() now takes an int rather than _PyCoreConfig for the warning. 
files: A Misc/NEWS.d/next/Core and Builtins/2018-08-28-01-45-01.bpo-34523.aUUkc3.rst M Include/coreconfig.h M Include/pylifecycle.h M Lib/test/test_embed.py M Lib/test/test_sys.py M Modules/main.c M Objects/unicodeobject.c M Programs/_freeze_importlib.c M Programs/_testembed.c M Python/coreconfig.c M Python/pylifecycle.c M Python/sysmodule.c diff --git a/Include/coreconfig.h b/Include/coreconfig.h index ffba306a9f8a..f46bc9d7e9f3 100644 --- a/Include/coreconfig.h +++ b/Include/coreconfig.h @@ -66,6 +66,17 @@ typedef struct { int coerce_c_locale; /* PYTHONCOERCECLOCALE, -1 means unknown */ int coerce_c_locale_warn; /* PYTHONCOERCECLOCALE=warn */ + /* Python filesystem encoding and error handler: see + sys.getfilesystemencoding() and sys.getfilesystemencodeerrors(). + + Updated later by initfsencoding(). On Windows, can be updated by + sys._enablelegacywindowsfsencoding() at runtime. + + See Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors. + */ + char *filesystem_encoding; + char *filesystem_errors; + /* Enable UTF-8 mode? Set by -X utf8 command line option and PYTHONUTF8 environment variable. If set to -1 (default), inherit Py_UTF8Mode value. 
*/ @@ -325,6 +336,14 @@ PyAPI_FUNC(int) _PyCoreConfig_GetEnvDup( #endif +#ifdef Py_BUILD_CORE +PyAPI_FUNC(int) _Py_SetFileSystemEncoding( + const char *encoding, + const char *errors); +PyAPI_FUNC(void) _Py_ClearFileSystemEncoding(void); +#endif + + #ifdef __cplusplus } #endif diff --git a/Include/pylifecycle.h b/Include/pylifecycle.h index b96db1e38b9f..b84568e54c1a 100644 --- a/Include/pylifecycle.h +++ b/Include/pylifecycle.h @@ -175,7 +175,7 @@ PyAPI_FUNC(int) _PyOS_URandomNonblock(void *buffer, Py_ssize_t size); /* Legacy locale support */ #ifndef Py_LIMITED_API -PyAPI_FUNC(void) _Py_CoerceLegacyLocale(const _PyCoreConfig *config); +PyAPI_FUNC(void) _Py_CoerceLegacyLocale(int warn); PyAPI_FUNC(int) _Py_LegacyLocaleDetected(void); PyAPI_FUNC(char *) _Py_SetLocaleFromEnv(int category); #endif diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 2ec9cf3686e4..b6311e4b334b 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -251,6 +251,8 @@ def test_initialize_pymain(self): class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): maxDiff = 4096 + UTF8_MODE_ERRORS = ('surrogatepass' if sys.platform == 'win32' + else 'surrogateescape') DEFAULT_CONFIG = { 'install_signal_handlers': 1, 'use_environment': 1, @@ -265,8 +267,12 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'show_alloc_count': 0, 'dump_refs': 0, 'malloc_stats': 0, - 'utf8_mode': 0, + # None means that the default encoding is read at runtime: + # see get_locale_encoding(). 
+ 'filesystem_encoding': None, + 'filesystem_errors': sys.getfilesystemencodeerrors(), + 'utf8_mode': 0, 'coerce_c_locale': 0, 'coerce_c_locale_warn': 0, @@ -297,6 +303,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): '_frozen': 0, } + def get_stdio_encoding(self, env): code = 'import sys; print(sys.stdout.encoding, sys.stdout.errors)' args = (sys.executable, '-c', code) @@ -308,6 +315,29 @@ def get_stdio_encoding(self, env): out = proc.stdout.rstrip() return out.split() + def get_locale_encoding(self, isolated): + if sys.platform in ('win32', 'darwin') or support.is_android: + # Windows, macOS and Android use UTF-8 + return "utf-8" + + code = ('import codecs, locale, sys', + 'locale.setlocale(locale.LC_CTYPE, "")', + 'enc = locale.nl_langinfo(locale.CODESET)', + 'enc = codecs.lookup(enc).name', + 'print(enc)') + args = (sys.executable, '-c', '; '.join(code)) + env = dict(os.environ) + if not isolated: + env['PYTHONCOERCECLOCALE'] = '0' + env['PYTHONUTF8'] = '0' + proc = subprocess.run(args, text=True, env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if proc.returncode: + raise Exception(f"failed to get the locale encoding: " + f"stdout={proc.stdout!r} stderr={proc.stderr!r}") + return proc.stdout.rstrip() + def check_config(self, testname, expected): expected = dict(self.DEFAULT_CONFIG, **expected) @@ -326,6 +356,8 @@ def check_config(self, testname, expected): expected['stdio_encoding'] = res[0] if expected['stdio_errors'] is None: expected['stdio_errors'] = res[1] + if expected['filesystem_encoding'] is None: + expected['filesystem_encoding'] = self.get_locale_encoding(expected['isolated']) for key, value in expected.items(): expected[key] = str(value) @@ -357,7 +389,8 @@ def test_init_global_config(self): 'utf8_mode': 1, 'stdio_encoding': 'utf-8', 'stdio_errors': 'surrogateescape', - + 'filesystem_encoding': 'utf-8', + 'filesystem_errors': self.UTF8_MODE_ERRORS, 'user_site_directory': 0, '_frozen': 1, } @@ -378,6 +411,8 @@ def 
test_init_from_config(self): 'utf8_mode': 1, 'stdio_encoding': 'iso8859-1', 'stdio_errors': 'replace', + 'filesystem_encoding': 'utf-8', + 'filesystem_errors': self.UTF8_MODE_ERRORS, 'pycache_prefix': 'conf_pycache_prefix', 'program_name': './conf_program_name', @@ -409,6 +444,8 @@ def test_init_env(self): 'import_time': 1, 'malloc_stats': 1, 'utf8_mode': 1, + 'filesystem_encoding': 'utf-8', + 'filesystem_errors': self.UTF8_MODE_ERRORS, 'inspect': 1, 'optimization_level': 2, 'pycache_prefix': 'env_pycache_prefix', diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index f3dd3bb67b38..b90366d81445 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -861,6 +861,16 @@ def check(tracebacklimit, expected): def test_no_duplicates_in_meta_path(self): self.assertEqual(len(sys.meta_path), len(set(sys.meta_path))) + @unittest.skipUnless(hasattr(sys, "_enablelegacywindowsfsencoding"), + 'needs sys._enablelegacywindowsfsencoding()') + def test__enablelegacywindowsfsencoding(self): + code = ('import sys', + 'sys._enablelegacywindowsfsencoding()', + 'print(sys.getfilesystemencoding(), sys.getfilesystemencodeerrors())') + rc, out, err = assert_python_ok('-c', '; '.join(code)) + out = out.decode('ascii', 'replace').rstrip() + self.assertEqual(out, 'mbcs replace') + @test.support.cpython_only class SizeofTest(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-28-01-45-01.bpo-34523.aUUkc3.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-01-45-01.bpo-34523.aUUkc3.rst new file mode 100644 index 000000000000..333939d821d0 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-01-45-01.bpo-34523.aUUkc3.rst @@ -0,0 +1,2 @@ +The Python filesystem encoding is now read earlier during the Python +initialization. 
diff --git a/Modules/main.c b/Modules/main.c index 3a15b2bf60be..2e9e23b652f3 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -1339,7 +1339,7 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, */ if (config->coerce_c_locale && !locale_coerced) { locale_coerced = 1; - _Py_CoerceLegacyLocale(config); + _Py_CoerceLegacyLocale(config->coerce_c_locale_warn); encoding_changed = 1; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 087cfca58d36..60adcd9c88df 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3410,27 +3410,24 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) PyObject * PyUnicode_EncodeFSDefault(PyObject *unicode) { + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); + const _PyCoreConfig *config = &interp->core_config; #if defined(__APPLE__) - return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors); + return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors); #else - PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); /* Bootstrap check: if the filesystem codec is implemented in Python, we cannot use it to encode and decode filenames before it is loaded. Load the Python codec requires to encode at least its own filename. Use the C - version of the locale codec until the codec registry is initialized and - the Python codec is loaded. - - Py_FileSystemDefaultEncoding is shared between all interpreters, we - cannot only rely on it: check also interp->fscodec_initialized for - subinterpreters. */ - if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { + implementation of the locale codec until the codec registry is + initialized and the Python codec is loaded. See initfsencoding(). 
*/ + if (interp->fscodec_initialized) { return PyUnicode_AsEncodedString(unicode, - Py_FileSystemDefaultEncoding, - Py_FileSystemDefaultEncodeErrors); + config->filesystem_encoding, + config->filesystem_errors); } else { return unicode_encode_locale(unicode, - Py_FileSystemDefaultEncodeErrors, 0); + config->filesystem_errors, 0); } #endif } @@ -3636,27 +3633,24 @@ PyUnicode_DecodeFSDefault(const char *s) { PyObject* PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) { + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); + const _PyCoreConfig *config = &interp->core_config; #if defined(__APPLE__) - return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL); + return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL); #else - PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); /* Bootstrap check: if the filesystem codec is implemented in Python, we cannot use it to encode and decode filenames before it is loaded. Load the Python codec requires to encode at least its own filename. Use the C - version of the locale codec until the codec registry is initialized and - the Python codec is loaded. - - Py_FileSystemDefaultEncoding is shared between all interpreters, we - cannot only rely on it: check also interp->fscodec_initialized for - subinterpreters. */ - if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { + implementation of the locale codec until the codec registry is + initialized and the Python codec is loaded. See initfsencoding(). 
*/ + if (interp->fscodec_initialized) { return PyUnicode_Decode(s, size, - Py_FileSystemDefaultEncoding, - Py_FileSystemDefaultEncodeErrors); + config->filesystem_encoding, + config->filesystem_errors); } else { return unicode_decode_locale(s, size, - Py_FileSystemDefaultEncodeErrors, 0); + config->filesystem_errors, 0); } #endif } diff --git a/Programs/_freeze_importlib.c b/Programs/_freeze_importlib.c index fdf5013e5473..2621a7687edd 100644 --- a/Programs/_freeze_importlib.c +++ b/Programs/_freeze_importlib.c @@ -81,8 +81,15 @@ main(int argc, char *argv[]) config.program_name = L"./_freeze_importlib"; /* Don't install importlib, since it could execute outdated bytecode. */ config._install_importlib = 0; - config.install_signal_handlers = 1; config._frozen = 1; +#ifdef MS_WINDOWS + /* bpo-34523: initfsencoding() is not called if _install_importlib=0, + so interp->fscodec_initialized value remains 0. + PyUnicode_EncodeFSDefault() doesn't support the "surrogatepass" error + handler in such case, whereas it's the default error handler on Windows. + Force the "strict" error handler to work around this bootstrap issue. 
*/ + config.filesystem_errors = "strict"; +#endif _PyInitError err = _Py_InitializeFromConfig(&config); /* No need to call _PyCoreConfig_Clear() since we didn't allocate any diff --git a/Programs/_testembed.c b/Programs/_testembed.c index d5694178b11b..99772eacbdc4 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -328,6 +328,8 @@ dump_config(void) printf("dump_refs = %i\n", config->dump_refs); printf("malloc_stats = %i\n", config->malloc_stats); + printf("filesystem_encoding = %s\n", config->filesystem_encoding); + printf("filesystem_errors = %s\n", config->filesystem_errors); printf("coerce_c_locale = %i\n", config->coerce_c_locale); printf("coerce_c_locale_warn = %i\n", config->coerce_c_locale_warn); printf("utf8_mode = %i\n", config->utf8_mode); diff --git a/Python/coreconfig.c b/Python/coreconfig.c index 00037d973d5d..0ec4640336c7 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -5,6 +5,11 @@ # include <langinfo.h> #endif +#include <locale.h> /* setlocale() */ +#ifdef HAVE_LANGINFO_H +#include <langinfo.h> /* nl_langinfo(CODESET) */ +#endif + #define DECODE_LOCALE_ERR(NAME, LEN) \ (((LEN) == -2) \ @@ -32,6 +37,8 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */ int Py_HasFileSystemDefaultEncoding = 0; #endif const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape"; +static int _Py_HasFileSystemDefaultEncodeErrors = 1; + /* UTF-8 mode (PEP 540): if equals to 1, use the UTF-8 encoding, and change stdin and stdout error handler to "surrogateescape". 
It is equal to -1 by default: unknown, will be set by Py_Main() */ @@ -88,6 +95,47 @@ _Py_wstrlist_copy(int len, wchar_t **list) } +void +_Py_ClearFileSystemEncoding(void) +{ + if (!Py_HasFileSystemDefaultEncoding && Py_FileSystemDefaultEncoding) { + PyMem_RawFree((char*)Py_FileSystemDefaultEncoding); + Py_FileSystemDefaultEncoding = NULL; + } + if (!_Py_HasFileSystemDefaultEncodeErrors && Py_FileSystemDefaultEncodeErrors) { + PyMem_RawFree((char*)Py_FileSystemDefaultEncodeErrors); + Py_FileSystemDefaultEncodeErrors = NULL; + } +} + + +/* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors + global configuration variables. */ +int +_Py_SetFileSystemEncoding(const char *encoding, const char *errors) +{ + char *encoding2 = _PyMem_RawStrdup(encoding); + if (encoding2 == NULL) { + return -1; + } + + char *errors2 = _PyMem_RawStrdup(errors); + if (errors2 == NULL) { + PyMem_RawFree(encoding2); + return -1; + } + + _Py_ClearFileSystemEncoding(); + + Py_FileSystemDefaultEncoding = encoding2; + Py_HasFileSystemDefaultEncoding = 0; + + Py_FileSystemDefaultEncodeErrors = errors2; + _Py_HasFileSystemDefaultEncodeErrors = 0; + return 0; +} + + /* Helper to allow an embedding application to override the normal * mechanism that attempts to figure out an appropriate IO encoding */ @@ -209,6 +257,8 @@ _PyCoreConfig_Clear(_PyCoreConfig *config) #endif CLEAR(config->base_exec_prefix); + CLEAR(config->filesystem_encoding); + CLEAR(config->filesystem_errors); CLEAR(config->stdio_encoding); CLEAR(config->stdio_errors); #undef CLEAR @@ -302,6 +352,8 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) COPY_ATTR(quiet); COPY_ATTR(user_site_directory); COPY_ATTR(buffered_stdio); + COPY_STR_ATTR(filesystem_encoding); + COPY_STR_ATTR(filesystem_errors); COPY_STR_ATTR(stdio_encoding); COPY_STR_ATTR(stdio_errors); #ifdef MS_WINDOWS @@ -312,6 +364,7 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) COPY_ATTR(_frozen); #undef 
COPY_ATTR +#undef COPY_STR_ATTR #undef COPY_WSTR_ATTR #undef COPY_WSTRLIST return 0; @@ -976,8 +1029,8 @@ get_stdio_errors(const _PyCoreConfig *config) } -_PyInitError -_Py_get_locale_encoding(char **locale_encoding) +static _PyInitError +get_locale_encoding(char **locale_encoding) { #ifdef MS_WINDOWS char encoding[20]; @@ -1087,7 +1140,7 @@ config_init_stdio_encoding(_PyCoreConfig *config) /* Choose the default error handler based on the current locale. */ if (config->stdio_encoding == NULL) { - _PyInitError err = _Py_get_locale_encoding(&config->stdio_encoding); + _PyInitError err = get_locale_encoding(&config->stdio_encoding); if (_Py_INIT_FAILED(err)) { return err; } @@ -1104,6 +1157,81 @@ config_init_stdio_encoding(_PyCoreConfig *config) } +static _PyInitError +config_init_fs_encoding(_PyCoreConfig *config) +{ +#ifdef MS_WINDOWS + if (config->legacy_windows_fs_encoding) { + /* Legacy Windows filesystem encoding: mbcs/replace */ + if (config->filesystem_encoding == NULL) { + config->filesystem_encoding = _PyMem_RawStrdup("mbcs"); + if (config->filesystem_encoding == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + if (config->filesystem_errors == NULL) { + config->filesystem_errors = _PyMem_RawStrdup("replace"); + if (config->filesystem_errors == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + } + + /* Windows defaults to utf-8/surrogatepass (PEP 529) */ + if (config->filesystem_encoding == NULL) { + config->filesystem_encoding = _PyMem_RawStrdup("utf-8"); + if (config->filesystem_encoding == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + if (config->filesystem_errors == NULL) { + config->filesystem_errors = _PyMem_RawStrdup("surrogatepass"); + if (config->filesystem_errors == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } +#else + if (config->utf8_mode) { + /* UTF-8 Mode use: utf-8/surrogateescape */ + if (config->filesystem_encoding == NULL) { + config->filesystem_encoding = _PyMem_RawStrdup("utf-8"); + if (config->filesystem_encoding == NULL) { + return 
_Py_INIT_NO_MEMORY(); + } + } + /* errors defaults to surrogateescape above */ + } + + if (config->filesystem_encoding == NULL) { + /* macOS and Android use UTF-8, other platforms use + the locale encoding. */ + char *locale_encoding; +#if defined(__APPLE__) || defined(__ANDROID__) + locale_encoding = "UTF-8"; +#else + _PyInitError err = get_locale_encoding(&locale_encoding); + if (_Py_INIT_FAILED(err)) { + return err; + } +#endif + config->filesystem_encoding = _PyMem_RawStrdup(locale_encoding); + if (config->filesystem_encoding == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + + if (config->filesystem_errors == NULL) { + /* by default, use the "surrogateescape" error handler */ + config->filesystem_errors = _PyMem_RawStrdup("surrogateescape"); + if (config->filesystem_errors == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } +#endif + return _Py_INIT_OK(); +} + + /* Read configuration settings from standard locations * * This function doesn't make any changes to the interpreter state - it @@ -1216,6 +1344,13 @@ _PyCoreConfig_Read(_PyCoreConfig *config) config->argc = 0; } + if (config->filesystem_encoding == NULL && config->filesystem_errors == NULL) { + err = config_init_fs_encoding(config); + if (_Py_INIT_FAILED(err)) { + return err; + } + } + err = config_init_stdio_encoding(config); if (_Py_INIT_FAILED(err)) { return err; @@ -1223,6 +1358,10 @@ _PyCoreConfig_Read(_PyCoreConfig *config) assert(config->coerce_c_locale >= 0); assert(config->use_environment >= 0); + assert(config->filesystem_encoding != NULL); + assert(config->filesystem_errors != NULL); + assert(config->stdio_encoding != NULL); + assert(config->stdio_errors != NULL); return _Py_INIT_OK(); } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 9f6757fe808d..6d97f2f24d5d 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -339,7 +339,7 @@ static const char C_LOCALE_COERCION_WARNING[] = "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n"; static void 
-_coerce_default_locale_settings(const _PyCoreConfig *config, const _LocaleCoercionTarget *target) +_coerce_default_locale_settings(int warn, const _LocaleCoercionTarget *target) { const char *newloc = target->locale_name; @@ -352,7 +352,7 @@ _coerce_default_locale_settings(const _PyCoreConfig *config, const _LocaleCoerci "Error setting LC_CTYPE, skipping C locale coercion\n"); return; } - if (config->coerce_c_locale_warn) { + if (warn) { fprintf(stderr, C_LOCALE_COERCION_WARNING, newloc); } @@ -362,7 +362,7 @@ _coerce_default_locale_settings(const _PyCoreConfig *config, const _LocaleCoerci #endif void -_Py_CoerceLegacyLocale(const _PyCoreConfig *config) +_Py_CoerceLegacyLocale(int warn) { #ifdef PY_COERCE_C_LOCALE const char *locale_override = getenv("LC_ALL"); @@ -385,7 +385,7 @@ defined(HAVE_LANGINFO_H) && defined(CODESET) } #endif /* Successfully configured locale, so make it the default */ - _coerce_default_locale_settings(config, target); + _coerce_default_locale_settings(warn, target); return; } } @@ -1162,11 +1162,7 @@ Py_FinalizeEx(void) /* Cleanup Unicode implementation */ _PyUnicode_Fini(); - /* reset file system default encoding */ - if (!Py_HasFileSystemDefaultEncoding && Py_FileSystemDefaultEncoding) { - PyMem_RawFree((char*)Py_FileSystemDefaultEncoding); - Py_FileSystemDefaultEncoding = NULL; - } + _Py_ClearFileSystemEncoding(); /* XXX Still allocated: - various static ad-hoc pointers to interned strings @@ -1475,59 +1471,31 @@ add_main_module(PyInterpreterState *interp) static _PyInitError initfsencoding(PyInterpreterState *interp) { - PyObject *codec; + _PyCoreConfig *config = &interp->core_config; -#ifdef MS_WINDOWS - if (Py_LegacyWindowsFSEncodingFlag) { - Py_FileSystemDefaultEncoding = "mbcs"; - Py_FileSystemDefaultEncodeErrors = "replace"; - } - else { - Py_FileSystemDefaultEncoding = "utf-8"; - Py_FileSystemDefaultEncodeErrors = "surrogatepass"; + char *encoding = get_codec_name(config->filesystem_encoding); + if (encoding == NULL) { + /* Such 
error can only occurs in critical situations: no more + memory, import a module of the standard library failed, etc. */ + return _Py_INIT_ERR("failed to get the Python codec " + "of the filesystem encoding"); } -#else - if (Py_FileSystemDefaultEncoding == NULL) { - if (interp->core_config.utf8_mode) { - Py_FileSystemDefaultEncoding = "utf-8"; - Py_HasFileSystemDefaultEncoding = 1; - } - else if (_Py_GetForceASCII()) { - Py_FileSystemDefaultEncoding = "ascii"; - Py_HasFileSystemDefaultEncoding = 1; - } - else { - extern _PyInitError _Py_get_locale_encoding(char **locale_encoding); - char *locale_encoding; - _PyInitError err = _Py_get_locale_encoding(&locale_encoding); - if (_Py_INIT_FAILED(err)) { - return err; - } - - Py_FileSystemDefaultEncoding = get_codec_name(locale_encoding); - PyMem_RawFree(locale_encoding); - if (Py_FileSystemDefaultEncoding == NULL) { - return _Py_INIT_ERR("failed to get the Python codec " - "of the locale encoding"); - } + /* Update the filesystem encoding to the normalized Python codec name. + For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii" + (Python codec name). */ + PyMem_RawFree(config->filesystem_encoding); + config->filesystem_encoding = encoding; - Py_HasFileSystemDefaultEncoding = 0; - interp->fscodec_initialized = 1; - return _Py_INIT_OK(); - } + /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors + global configuration variables. */ + if (_Py_SetFileSystemEncoding(config->filesystem_encoding, + config->filesystem_errors) < 0) { + return _Py_INIT_NO_MEMORY(); } -#endif - /* the encoding is mbcs, utf-8 or ascii */ - codec = _PyCodec_Lookup(Py_FileSystemDefaultEncoding); - if (!codec) { - /* Such error can only occurs in critical situations: no more - * memory, import a module of the standard library failed, - * etc. 
*/ - return _Py_INIT_ERR("unable to load the file system codec"); - } - Py_DECREF(codec); + /* PyUnicode can now use the Python codec rather than C implementation + for the filesystem encoding */ interp->fscodec_initialized = 1; return _Py_INIT_OK(); } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 177b8307626d..91df4b031e32 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -389,11 +389,9 @@ implementation." static PyObject * sys_getfilesystemencoding(PyObject *self, PyObject *Py_UNUSED(ignored)) { - if (Py_FileSystemDefaultEncoding) - return PyUnicode_FromString(Py_FileSystemDefaultEncoding); - PyErr_SetString(PyExc_RuntimeError, - "filesystem encoding is not initialized"); - return NULL; + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); + const _PyCoreConfig *config = &interp->core_config; + return PyUnicode_FromString(config->filesystem_encoding); } PyDoc_STRVAR(getfilesystemencoding_doc, @@ -406,11 +404,9 @@ operating system filenames." static PyObject * sys_getfilesystemencodeerrors(PyObject *self, PyObject *Py_UNUSED(ignored)) { - if (Py_FileSystemDefaultEncodeErrors) - return PyUnicode_FromString(Py_FileSystemDefaultEncodeErrors); - PyErr_SetString(PyExc_RuntimeError, - "filesystem encoding is not initialized"); - return NULL; + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); + const _PyCoreConfig *config = &interp->core_config; + return PyUnicode_FromString(config->filesystem_errors); } PyDoc_STRVAR(getfilesystemencodeerrors_doc, @@ -1150,8 +1146,30 @@ environment variable before launching Python." 
static PyObject * sys_enablelegacywindowsfsencoding(PyObject *self) { - Py_FileSystemDefaultEncoding = "mbcs"; - Py_FileSystemDefaultEncodeErrors = "replace"; + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); + _PyCoreConfig *config = &interp->core_config; + + /* Set the filesystem encoding to mbcs/replace (PEP 529) */ + char *encoding = _PyMem_RawStrdup("mbcs"); + char *errors = _PyMem_RawStrdup("replace"); + if (encoding == NULL || errors == NULL) { + PyMem_Free(encoding); + PyMem_Free(errors); + PyErr_NoMemory(); + return NULL; + } + + PyMem_RawFree(config->filesystem_encoding); + config->filesystem_encoding = encoding; + PyMem_RawFree(config->filesystem_errors); + config->filesystem_errors = errors; + + if (_Py_SetFileSystemEncoding(config->filesystem_encoding, + config->filesystem_errors) < 0) { + PyErr_NoMemory(); + return NULL; + } + Py_RETURN_NONE; }

1 0

bpo-34485: Add _PyCoreConfig.stdio_encoding (GH-8881)
by Victor Stinner Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/dfe0dc74536dfb6f331131d9b2b49557675bb6b7 commit: dfe0dc74536dfb6f331131d9b2b49557675bb6b7 branch: master author: Victor Stinner <vstinner(a)redhat.com> committer: GitHub <noreply(a)github.com> date: 2018-08-29T11:47:29+02:00 summary: bpo-34485: Add _PyCoreConfig.stdio_encoding (GH-8881) * Add stdio_encoding and stdio_errors fields to _PyCoreConfig. * Add unit tests on stdio_encoding and stdio_errors. files: M Include/coreconfig.h M Include/pylifecycle.h M Lib/test/test_embed.py M Programs/_testembed.c M Python/coreconfig.c M Python/pylifecycle.c diff --git a/Include/coreconfig.h b/Include/coreconfig.h index b2799075f930..ffba306a9f8a 100644 --- a/Include/coreconfig.h +++ b/Include/coreconfig.h @@ -203,6 +203,18 @@ typedef struct { If set to -1 (default), it is set to !Py_UnbufferedStdioFlag. */ int buffered_stdio; + /* Encoding of sys.stdin, sys.stdout and sys.stderr. + Value set from PYTHONIOENCODING environment variable and + Py_SetStandardStreamEncoding() function. + See also 'stdio_errors' attribute. */ + char *stdio_encoding; + + /* Error handler of sys.stdin and sys.stdout. + Value set from PYTHONIOENCODING environment variable and + Py_SetStandardStreamEncoding() function. + See also 'stdio_encoding' attribute. */ + char *stdio_errors; + #ifdef MS_WINDOWS /* If greater than 1, use the "mbcs" encoding instead of the UTF-8 encoding for the filesystem encoding. 
diff --git a/Include/pylifecycle.h b/Include/pylifecycle.h index 20298277023f..b96db1e38b9f 100644 --- a/Include/pylifecycle.h +++ b/Include/pylifecycle.h @@ -179,6 +179,9 @@ PyAPI_FUNC(void) _Py_CoerceLegacyLocale(const _PyCoreConfig *config); PyAPI_FUNC(int) _Py_LegacyLocaleDetected(void); PyAPI_FUNC(char *) _Py_SetLocaleFromEnv(int category); #endif +#ifdef Py_BUILD_CORE +PyAPI_FUNC(int) _Py_IsLocaleCoercionTarget(const char *ctype_loc); +#endif #ifdef __cplusplus } diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 3922447c645e..2ec9cf3686e4 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -288,13 +288,29 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'quiet': 0, 'user_site_directory': 1, 'buffered_stdio': 1, + # None means that check_config() gets the expected encoding at runtime + 'stdio_encoding': None, + 'stdio_errors': None, '_install_importlib': 1, '_check_hash_pycs_mode': 'default', '_frozen': 0, } + def get_stdio_encoding(self, env): + code = 'import sys; print(sys.stdout.encoding, sys.stdout.errors)' + args = (sys.executable, '-c', code) + proc = subprocess.run(args, env=env, text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + if proc.returncode: + raise Exception(f"failed to get the stdio encoding: stdout={proc.stdout!r}") + out = proc.stdout.rstrip() + return out.split() + def check_config(self, testname, expected): + expected = dict(self.DEFAULT_CONFIG, **expected) + env = dict(os.environ) for key in list(env): if key.startswith('PYTHON'): @@ -303,13 +319,19 @@ def check_config(self, testname, expected): # on the current locale env['PYTHONCOERCECLOCALE'] = '0' env['PYTHONUTF8'] = '0' - out, err = self.run_embedded_interpreter(testname, env=env) - # Ignore err - expected = dict(self.DEFAULT_CONFIG, **expected) + if expected['stdio_encoding'] is None or expected['stdio_errors'] is None: + res = self.get_stdio_encoding(env) + if expected['stdio_encoding'] is None: + 
expected['stdio_encoding'] = res[0] + if expected['stdio_errors'] is None: + expected['stdio_errors'] = res[1] for key, value in expected.items(): expected[key] = str(value) + out, err = self.run_embedded_interpreter(testname, env=env) + # Ignore err + config = {} for line in out.splitlines(): key, value = line.split(' = ', 1) @@ -331,7 +353,11 @@ def test_init_global_config(self): 'verbose': 1, 'quiet': 1, 'buffered_stdio': 0, + 'utf8_mode': 1, + 'stdio_encoding': 'utf-8', + 'stdio_errors': 'surrogateescape', + 'user_site_directory': 0, '_frozen': 1, } @@ -350,6 +376,8 @@ def test_init_from_config(self): 'malloc_stats': 1, 'utf8_mode': 1, + 'stdio_encoding': 'iso8859-1', + 'stdio_errors': 'replace', 'pycache_prefix': 'conf_pycache_prefix', 'program_name': './conf_program_name', @@ -387,6 +415,8 @@ def test_init_env(self): 'write_bytecode': 0, 'verbose': 1, 'buffered_stdio': 0, + 'stdio_encoding': 'iso8859-1', + 'stdio_errors': 'replace', 'user_site_directory': 0, 'faulthandler': 1, 'dev_mode': 1, diff --git a/Programs/_testembed.c b/Programs/_testembed.c index d0c00cfc6cd4..d5694178b11b 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -374,6 +374,8 @@ dump_config(void) printf("user_site_directory = %i\n", config->user_site_directory); printf("buffered_stdio = %i\n", config->buffered_stdio); ASSERT_EQUAL(config->buffered_stdio, !Py_UnbufferedStdioFlag); + printf("stdio_encoding = %s\n", config->stdio_encoding); + printf("stdio_errors = %s\n", config->stdio_errors); /* FIXME: test legacy_windows_fs_encoding */ /* FIXME: test legacy_windows_stdio */ @@ -532,6 +534,11 @@ static int test_init_from_config(void) Py_UnbufferedStdioFlag = 0; config.buffered_stdio = 0; + putenv("PYTHONIOENCODING=cp424"); + Py_SetStandardStreamEncoding("ascii", "ignore"); + config.stdio_encoding = "iso8859-1"; + config.stdio_errors = "replace"; + putenv("PYTHONNOUSERSITE="); Py_NoUserSiteDirectory = 0; config.user_site_directory = 0; @@ -569,6 +576,7 @@ static void 
test_init_env_putenvs(void) putenv("PYTHONNOUSERSITE=1"); putenv("PYTHONFAULTHANDLER=1"); putenv("PYTHONDEVMODE=1"); + putenv("PYTHONIOENCODING=iso8859-1:replace"); /* FIXME: test PYTHONWARNINGS */ /* FIXME: test PYTHONEXECUTABLE */ /* FIXME: test PYTHONHOME */ diff --git a/Python/coreconfig.c b/Python/coreconfig.c index 99d703cab92d..00037d973d5d 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -1,6 +1,9 @@ #include "Python.h" #include "internal/pystate.h" #include <locale.h> +#ifdef HAVE_LANGINFO_H +# include <langinfo.h> +#endif #define DECODE_LOCALE_ERR(NAME, LEN) \ @@ -89,8 +92,8 @@ _Py_wstrlist_copy(int len, wchar_t **list) * mechanism that attempts to figure out an appropriate IO encoding */ -char *_Py_StandardStreamEncoding = NULL; -char *_Py_StandardStreamErrors = NULL; +static char *_Py_StandardStreamEncoding = NULL; +static char *_Py_StandardStreamErrors = NULL; int Py_SetStandardStreamEncoding(const char *encoding, const char *errors) @@ -205,6 +208,9 @@ _PyCoreConfig_Clear(_PyCoreConfig *config) CLEAR(config->dll_path); #endif CLEAR(config->base_exec_prefix); + + CLEAR(config->stdio_encoding); + CLEAR(config->stdio_errors); #undef CLEAR #undef CLEAR_WSTRLIST } @@ -216,6 +222,15 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) _PyCoreConfig_Clear(config); #define COPY_ATTR(ATTR) config->ATTR = config2->ATTR +#define COPY_STR_ATTR(ATTR) \ + do { \ + if (config2->ATTR != NULL) { \ + config->ATTR = _PyMem_RawStrdup(config2->ATTR); \ + if (config->ATTR == NULL) { \ + return -1; \ + } \ + } \ + } while (0) #define COPY_WSTR_ATTR(ATTR) \ do { \ if (config2->ATTR != NULL) { \ @@ -287,6 +302,8 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) COPY_ATTR(quiet); COPY_ATTR(user_site_directory); COPY_ATTR(buffered_stdio); + COPY_STR_ATTR(stdio_encoding); + COPY_STR_ATTR(stdio_errors); #ifdef MS_WINDOWS COPY_ATTR(legacy_windows_fs_encoding); COPY_ATTR(legacy_windows_stdio); @@ -932,6 +949,161 @@ 
config_init_locale(_PyCoreConfig *config) } +static const char * +get_stdio_errors(const _PyCoreConfig *config) +{ +#ifndef MS_WINDOWS + const char *loc = setlocale(LC_CTYPE, NULL); + if (loc != NULL) { + /* surrogateescape is the default in the legacy C and POSIX locales */ + if (strcmp(loc, "C") == 0 || strcmp(loc, "POSIX") == 0) { + return "surrogateescape"; + } + +#ifdef PY_COERCE_C_LOCALE + /* surrogateescape is the default in locale coercion target locales */ + if (_Py_IsLocaleCoercionTarget(loc)) { + return "surrogateescape"; + } +#endif + } + + return "strict"; +#else + /* On Windows, always use surrogateescape by default */ + return "surrogateescape"; +#endif +} + + +_PyInitError +_Py_get_locale_encoding(char **locale_encoding) +{ +#ifdef MS_WINDOWS + char encoding[20]; + PyOS_snprintf(encoding, sizeof(encoding), "cp%d", GetACP()); +#elif defined(__ANDROID__) + const char *encoding = "UTF-8"; +#else + const char *encoding = nl_langinfo(CODESET); + if (!encoding || encoding[0] == '\0') { + return _Py_INIT_USER_ERR("failed to get the locale encoding: " + "nl_langinfo(CODESET) failed"); + } +#endif + *locale_encoding = _PyMem_RawStrdup(encoding); + if (*locale_encoding == NULL) { + return _Py_INIT_NO_MEMORY(); + } + return _Py_INIT_OK(); +} + + +static _PyInitError +config_init_stdio_encoding(_PyCoreConfig *config) +{ + /* If Py_SetStandardStreamEncoding() have been called, use these + parameters. 
*/ + if (config->stdio_encoding == NULL && _Py_StandardStreamEncoding != NULL) { + config->stdio_encoding = _PyMem_RawStrdup(_Py_StandardStreamEncoding); + if (config->stdio_encoding == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + + if (config->stdio_errors == NULL && _Py_StandardStreamErrors != NULL) { + config->stdio_errors = _PyMem_RawStrdup(_Py_StandardStreamErrors); + if (config->stdio_errors == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + + if (config->stdio_encoding != NULL && config->stdio_errors != NULL) { + return _Py_INIT_OK(); + } + + /* PYTHONIOENCODING environment variable */ + const char *opt = _PyCoreConfig_GetEnv(config, "PYTHONIOENCODING"); + if (opt) { + char *pythonioencoding = _PyMem_RawStrdup(opt); + if (pythonioencoding == NULL) { + return _Py_INIT_NO_MEMORY(); + } + + char *err = strchr(pythonioencoding, ':'); + if (err) { + *err = '\0'; + err++; + if (!err[0]) { + err = NULL; + } + } + + /* Does PYTHONIOENCODING contain an encoding? */ + if (pythonioencoding[0]) { + if (config->stdio_encoding == NULL) { + config->stdio_encoding = _PyMem_RawStrdup(pythonioencoding); + if (config->stdio_encoding == NULL) { + PyMem_RawFree(pythonioencoding); + return _Py_INIT_NO_MEMORY(); + } + } + + /* If the encoding is set but not the error handler, + use "strict" error handler by default. + PYTHONIOENCODING=latin1 behaves as + PYTHONIOENCODING=latin1:strict. 
*/ + if (!err) { + err = "strict"; + } + } + + if (config->stdio_errors == NULL && err != NULL) { + config->stdio_errors = _PyMem_RawStrdup(err); + if (config->stdio_errors == NULL) { + PyMem_RawFree(pythonioencoding); + return _Py_INIT_NO_MEMORY(); + } + } + + PyMem_RawFree(pythonioencoding); + } + + /* UTF-8 Mode uses UTF-8/surrogateescape */ + if (config->utf8_mode) { + if (config->stdio_encoding == NULL) { + config->stdio_encoding = _PyMem_RawStrdup("utf-8"); + if (config->stdio_encoding == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + if (config->stdio_errors == NULL) { + config->stdio_errors = _PyMem_RawStrdup("surrogateescape"); + if (config->stdio_errors == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + } + + /* Choose the default error handler based on the current locale. */ + if (config->stdio_encoding == NULL) { + _PyInitError err = _Py_get_locale_encoding(&config->stdio_encoding); + if (_Py_INIT_FAILED(err)) { + return err; + } + } + if (config->stdio_errors == NULL) { + const char *errors = get_stdio_errors(config); + config->stdio_errors = _PyMem_RawStrdup(errors); + if (config->stdio_errors == NULL) { + return _Py_INIT_NO_MEMORY(); + } + } + + return _Py_INIT_OK(); +} + + /* Read configuration settings from standard locations * * This function doesn't make any changes to the interpreter state - it @@ -1044,6 +1216,11 @@ _PyCoreConfig_Read(_PyCoreConfig *config) config->argc = 0; } + err = config_init_stdio_encoding(config); + if (_Py_INIT_FAILED(err)) { + return err; + } + assert(config->coerce_c_locale >= 0); assert(config->use_environment >= 0); diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 88403f4cbe5d..9f6757fe808d 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -184,27 +184,6 @@ get_codec_name(const char *encoding) return NULL; } -static _PyInitError -get_locale_encoding(char **locale_encoding) -{ -#ifdef MS_WINDOWS - char encoding[20]; - PyOS_snprintf(encoding, sizeof(encoding), "cp%d", GetACP()); -#elif 
defined(__ANDROID__) - const char *encoding = "UTF-8"; -#else - const char *encoding = nl_langinfo(CODESET); - if (!encoding || encoding[0] == '\0') { - return _Py_INIT_USER_ERR("failed to get the locale encoding: " - "nl_langinfo(CODESET) failed"); - } -#endif - *locale_encoding = _PyMem_RawStrdup(encoding); - if (*locale_encoding == NULL) { - return _Py_INIT_NO_MEMORY(); - } - return _Py_INIT_OK(); -} static _PyInitError initimport(PyInterpreterState *interp, PyObject *sysmod) @@ -340,35 +319,20 @@ static _LocaleCoercionTarget _TARGET_LOCALES[] = { {NULL} }; -static const char * -get_stdio_errors(void) -{ -#ifndef MS_WINDOWS - const char *ctype_loc = setlocale(LC_CTYPE, NULL); - if (ctype_loc != NULL) { - /* surrogateescape is the default in the legacy C and POSIX locales */ - if (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0) { - return "surrogateescape"; - } -#ifdef PY_COERCE_C_LOCALE - /* surrogateescape is the default in locale coercion target locales */ - const _LocaleCoercionTarget *target = NULL; - for (target = _TARGET_LOCALES; target->locale_name; target++) { - if (strcmp(ctype_loc, target->locale_name) == 0) { - return "surrogateescape"; - } +int +_Py_IsLocaleCoercionTarget(const char *ctype_loc) +{ + const _LocaleCoercionTarget *target = NULL; + for (target = _TARGET_LOCALES; target->locale_name; target++) { + if (strcmp(ctype_loc, target->locale_name) == 0) { + return 1; } -#endif } - - return "strict"; -#else - /* On Windows, always use surrogateescape by default */ - return "surrogateescape"; -#endif + return 0; } + #ifdef PY_COERCE_C_LOCALE static const char C_LOCALE_COERCION_WARNING[] = "Python detected LC_CTYPE=C: LC_CTYPE coerced to %.20s (set another locale " @@ -1533,8 +1497,10 @@ initfsencoding(PyInterpreterState *interp) Py_HasFileSystemDefaultEncoding = 1; } else { + extern _PyInitError _Py_get_locale_encoding(char **locale_encoding); + char *locale_encoding; - _PyInitError err = get_locale_encoding(&locale_encoding); + 
_PyInitError err = _Py_get_locale_encoding(&locale_encoding); if (_Py_INIT_FAILED(err)) { return err; } @@ -1740,13 +1706,16 @@ init_sys_streams(PyInterpreterState *interp) PyObject *std = NULL; int fd; PyObject * encoding_attr; - char *pythonioencoding = NULL; - const char *encoding, *errors; - char *locale_encoding = NULL; - char *codec_name = NULL; _PyInitError res = _Py_INIT_OK(); - extern char *_Py_StandardStreamEncoding; - extern char *_Py_StandardStreamErrors; + _PyCoreConfig *config = &interp->core_config; + + char *codec_name = get_codec_name(config->stdio_encoding); + if (codec_name == NULL) { + return _Py_INIT_ERR("failed to get the Python codec name " + "of the stdio encoding"); + } + PyMem_RawFree(config->stdio_encoding); + config->stdio_encoding = codec_name; /* Hack to avoid a nasty recursion issue when Python is invoked in verbose mode: pre-import the Latin-1 and UTF-8 codecs */ @@ -1778,85 +1747,15 @@ init_sys_streams(PyInterpreterState *interp) } Py_DECREF(wrapper); - encoding = _Py_StandardStreamEncoding; - errors = _Py_StandardStreamErrors; - if (!encoding || !errors) { - char *opt = Py_GETENV("PYTHONIOENCODING"); - if (opt && opt[0] != '\0') { - char *err; - pythonioencoding = _PyMem_Strdup(opt); - if (pythonioencoding == NULL) { - PyErr_NoMemory(); - goto error; - } - err = strchr(pythonioencoding, ':'); - if (err) { - *err = '\0'; - err++; - if (!err[0]) { - err = NULL; - } - } - - /* Does PYTHONIOENCODING contain an encoding? */ - if (pythonioencoding[0]) { - if (!encoding) { - encoding = pythonioencoding; - } - - /* If the encoding is set but not the error handler, - use "strict" error handler by default. - PYTHONIOENCODING=latin1 behaves as - PYTHONIOENCODING=latin1:strict. 
*/ - if (!err) { - err = "strict"; - } - } - - if (!errors && err != NULL) { - errors = err; - } - } - - if (interp->core_config.utf8_mode) { - if (!encoding) { - encoding = "utf-8"; - } - if (!errors) { - errors = "surrogateescape"; - } - } - - if (!errors) { - /* Choose the default error handler based on the current locale */ - errors = get_stdio_errors(); - } - } - - if (encoding == NULL) { - _PyInitError err = get_locale_encoding(&locale_encoding); - if (_Py_INIT_FAILED(err)) { - return err; - } - encoding = locale_encoding; - } - - codec_name = get_codec_name(encoding); - if (codec_name == NULL) { - PyErr_SetString(PyExc_RuntimeError, - "failed to get the Python codec name " - "of stdio encoding"); - goto error; - } - encoding = codec_name; - /* Set sys.stdin */ fd = fileno(stdin); /* Under some conditions stdin, stdout and stderr may not be connected * and fileno() may point to an invalid file descriptor. For example * GUI apps don't have valid standard streams by default. */ - std = create_stdio(iomod, fd, 0, "<stdin>", encoding, errors); + std = create_stdio(iomod, fd, 0, "<stdin>", + config->stdio_encoding, + config->stdio_errors); if (std == NULL) goto error; PySys_SetObject("__stdin__", std); @@ -1865,7 +1764,9 @@ init_sys_streams(PyInterpreterState *interp) /* Set sys.stdout */ fd = fileno(stdout); - std = create_stdio(iomod, fd, 1, "<stdout>", encoding, errors); + std = create_stdio(iomod, fd, 1, "<stdout>", + config->stdio_encoding, + config->stdio_errors); if (std == NULL) goto error; PySys_SetObject("__stdout__", std); @@ -1875,7 +1776,9 @@ init_sys_streams(PyInterpreterState *interp) #if 1 /* Disable this if you have trouble debugging bootstrap stuff */ /* Set sys.stderr, replaces the preliminary stderr */ fd = fileno(stderr); - std = create_stdio(iomod, fd, 1, "<stderr>", encoding, "backslashreplace"); + std = create_stdio(iomod, fd, 1, "<stderr>", + config->stdio_encoding, + "backslashreplace"); if (std == NULL) goto error; @@ -1911,9 +1814,6 @@ 
init_sys_streams(PyInterpreterState *interp) done: _Py_ClearStandardStreamEncoding(); - PyMem_RawFree(locale_encoding); - PyMem_RawFree(codec_name); - PyMem_Free(pythonioencoding); Py_XDECREF(bimod); Py_XDECREF(iomod); return res;

1 0

bpo-34485, Windows: LC_CTYPE set to user preference (GH-8988)
by Victor Stinner Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/177d921c8c03d30daa32994362023f7776… commit: 177d921c8c03d30daa32994362023f777624b10d branch: master author: Victor Stinner <vstinner(a)redhat.com> committer: GitHub <noreply(a)github.com> date: 2018-08-29T11:25:15+02:00 summary: bpo-34485, Windows: LC_CTYPE set to user preference (GH-8988) On Windows, the LC_CTYPE is now set to the user preferred locale at startup: _Py_SetLocaleFromEnv(LC_CTYPE) is now called during the Python initialization. Previously, the LC_CTYPE locale was "C" at startup, but changed when calling setlocale(LC_CTYPE, "") or setlocale(LC_ALL, ""). pymain_read_conf() now also calls _Py_SetLocaleFromEnv(LC_CTYPE) to behave as _Py_InitializeCore(). Moreover, it doesn't save/restore the LC_ALL anymore. On Windows, standard streams like sys.stdout now always use surrogateescape error handler by default (ignore the locale). files: A Misc/NEWS.d/next/Core and Builtins/2018-08-29-11-04-19.bpo-34485.c2AFdp.rst M Modules/main.c M Python/pylifecycle.c diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-29-11-04-19.bpo-34485.c2AFdp.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-29-11-04-19.bpo-34485.c2AFdp.rst new file mode 100644 index 000000000000..f66a4f26f593 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-08-29-11-04-19.bpo-34485.c2AFdp.rst @@ -0,0 +1,3 @@ +On Windows, the LC_CTYPE is now set to the user preferred locale at startup. +Previously, the LC_CTYPE locale was "C" at startup, but changed when calling +setlocale(LC_CTYPE, "") or setlocale(LC_ALL, ""). 
diff --git a/Modules/main.c b/Modules/main.c index f93ca4d41662..3a15b2bf60be 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -1280,25 +1280,18 @@ pymain_read_conf_impl(_PyMain *pymain, _PyCoreConfig *config, } -/* Read the configuration, but initialize also the LC_CTYPE locale: - enable UTF-8 mode (PEP 540) and/or coerce the C locale (PEP 538) */ +/* Read the configuration and initialize the LC_CTYPE locale: + enable UTF-8 mode (PEP 540) and/or coerce the C locale (PEP 538). */ static int pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, _PyCmdline *cmdline) { int init_utf8_mode = Py_UTF8Mode; _PyCoreConfig save_config = _PyCoreConfig_INIT; - char *oldloc = NULL; int res = -1; - oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL)); - if (oldloc == NULL) { - pymain->err = _Py_INIT_NO_MEMORY(); - goto done; - } - - /* Reconfigure the locale to the default for this process */ - _Py_SetLocaleFromEnv(LC_ALL); + /* Set LC_CTYPE to the user preferred locale */ + _Py_SetLocaleFromEnv(LC_CTYPE); int locale_coerced = 0; int loops = 0; @@ -1386,10 +1379,6 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, done: _PyCoreConfig_Clear(&save_config); - if (oldloc != NULL) { - setlocale(LC_ALL, oldloc); - PyMem_RawFree(oldloc); - } Py_UTF8Mode = init_utf8_mode ; return res; } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 33af06ec18b9..88403f4cbe5d 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -343,6 +343,7 @@ static _LocaleCoercionTarget _TARGET_LOCALES[] = { static const char * get_stdio_errors(void) { +#ifndef MS_WINDOWS const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL) { /* surrogateescape is the default in the legacy C and POSIX locales */ @@ -362,6 +363,10 @@ get_stdio_errors(void) } return "strict"; +#else + /* On Windows, always use surrogateescape by default */ + return "surrogateescape"; +#endif } #ifdef PY_COERCE_C_LOCALE @@ -751,11 +756,8 @@ _Py_InitializeCore(PyInterpreterState **interp_p, 
(and the input configuration is read only). */ _PyCoreConfig config = _PyCoreConfig_INIT; -#ifndef MS_WINDOWS - /* Set up the LC_CTYPE locale, so we can obtain the locale's charset - without having to switch locales. */ + /* Set LC_CTYPE to the user preferred locale */ _Py_SetLocaleFromEnv(LC_CTYPE); -#endif _PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, &old_alloc); if (_PyCoreConfig_Copy(&config, src_config) >= 0) {

1 0

Daily reference leaks (4243df51fe43): sum=9
by solipsis＠pitrou.net Aug. 29, 2018

Aug. 29, 2018

results for 4243df51fe43 on branch "default"
--------------------------------------------

test_asyncio leaked [0, 0, 3] memory blocks, sum=3
test_collections leaked [0, 7, -7] memory blocks, sum=0
test_functools leaked [0, 3, 1] memory blocks, sum=4
test_multiprocessing_fork leaked [0, 2, 0] memory blocks, sum=2
test_multiprocessing_forkserver leaked [2, -1, -1] memory blocks, sum=0

Command line was: ['./python', '-m', 'test.regrtest', '-uall', '-R', '3:3:/home/psf-users/antoine/refleaks/reflog4gu0md', '--timeout', '7200']

1 0

[3.7] bpo-34485: stdout uses surrogateescape on POSIX locale (GH-8986) (GH-8987)
by Victor Stinner Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/0b9ea4b211b24464c7d38f63e45e51c275… commit: 0b9ea4b211b24464c7d38f63e45e51c275c52dcd branch: 3.7 author: Victor Stinner <vstinner(a)redhat.com> committer: GitHub <noreply(a)github.com> date: 2018-08-29T11:01:33+02:00 summary: [3.7] bpo-34485: stdout uses surrogateescape on POSIX locale (GH-8986) (GH-8987) * bpo-34485: stdout uses surrogateescape on POSIX locale (GH-8986) Standard streams like sys.stdout now use the "surrogateescape" error handler, instead of "strict", on the POSIX locale (when the C locale is not coerced and the UTF-8 Mode is disabled). Add tests on sys.stdout.errors with LC_ALL=POSIX. Fix the error handler of standard streams like sys.stdout: PYTHONIOENCODING=":" is now ignored instead of setting the error handler to "strict". (cherry picked from commit 315877dc361d554bec34b4b62c270479ad36a1be) files: A Misc/NEWS.d/next/Core and Builtins/2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst A Misc/NEWS.d/next/Core and Builtins/2018-08-29-09-27-47.bpo-34485.5aJCmw.rst M Lib/test/test_sys.py M Lib/test/test_utf8_mode.py M Python/pylifecycle.c diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 336ae447a8de..27f75901c63f 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -654,10 +654,10 @@ def test_getfilesystemencoding(self): expected = None self.check_fsencoding(fs_encoding, expected) - def c_locale_get_error_handler(self, isolated=False, encoding=None): + def c_locale_get_error_handler(self, locale, isolated=False, encoding=None): # Force the POSIX locale env = os.environ.copy() - env["LC_ALL"] = "C" + env["LC_ALL"] = locale env["PYTHONCOERCECLOCALE"] = "0" code = '\n'.join(( 'import sys', @@ -683,44 +683,50 @@ def c_locale_get_error_handler(self, isolated=False, encoding=None): stdout, stderr = p.communicate() return stdout - def test_c_locale_surrogateescape(self): - out = self.c_locale_get_error_handler(isolated=True) + def check_locale_surrogateescape(self, locale): + out = 
self.c_locale_get_error_handler(locale, isolated=True) self.assertEqual(out, 'stdin: surrogateescape\n' 'stdout: surrogateescape\n' 'stderr: backslashreplace\n') # replace the default error handler - out = self.c_locale_get_error_handler(encoding=':ignore') + out = self.c_locale_get_error_handler(locale, encoding=':ignore') self.assertEqual(out, 'stdin: ignore\n' 'stdout: ignore\n' 'stderr: backslashreplace\n') # force the encoding - out = self.c_locale_get_error_handler(encoding='iso8859-1') + out = self.c_locale_get_error_handler(locale, encoding='iso8859-1') self.assertEqual(out, 'stdin: strict\n' 'stdout: strict\n' 'stderr: backslashreplace\n') - out = self.c_locale_get_error_handler(encoding='iso8859-1:') + out = self.c_locale_get_error_handler(locale, encoding='iso8859-1:') self.assertEqual(out, 'stdin: strict\n' 'stdout: strict\n' 'stderr: backslashreplace\n') # have no any effect - out = self.c_locale_get_error_handler(encoding=':') + out = self.c_locale_get_error_handler(locale, encoding=':') self.assertEqual(out, - 'stdin: strict\n' - 'stdout: strict\n' + 'stdin: surrogateescape\n' + 'stdout: surrogateescape\n' 'stderr: backslashreplace\n') - out = self.c_locale_get_error_handler(encoding='') + out = self.c_locale_get_error_handler(locale, encoding='') self.assertEqual(out, 'stdin: surrogateescape\n' 'stdout: surrogateescape\n' 'stderr: backslashreplace\n') + def test_c_locale_surrogateescape(self): + self.check_locale_surrogateescape('C') + + def test_posix_locale_surrogateescape(self): + self.check_locale_surrogateescape('POSIX') + def test_implementation(self): # This test applies to all implementations equally. 
diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py index 4a16b7304689..554abfab3163 100644 --- a/Lib/test/test_utf8_mode.py +++ b/Lib/test/test_utf8_mode.py @@ -146,9 +146,9 @@ def test_stdio(self): out = self.get_output('-X', 'utf8', '-c', code, PYTHONIOENCODING=":namereplace") self.assertEqual(out.splitlines(), - ['stdin: UTF-8/namereplace', - 'stdout: UTF-8/namereplace', - 'stderr: UTF-8/backslashreplace']) + ['stdin: utf-8/namereplace', + 'stdout: utf-8/namereplace', + 'stderr: utf-8/backslashreplace']) def test_io(self): code = textwrap.dedent(''' diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst new file mode 100644 index 000000000000..5ca373aeab6d --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst @@ -0,0 +1,3 @@ +Fix the error handler of standard streams like sys.stdout: +PYTHONIOENCODING=":" is now ignored instead of setting the error handler to +"strict". diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-29-09-27-47.bpo-34485.5aJCmw.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-29-09-27-47.bpo-34485.5aJCmw.rst new file mode 100644 index 000000000000..893e4f573f16 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-08-29-09-27-47.bpo-34485.5aJCmw.rst @@ -0,0 +1,3 @@ +Standard streams like sys.stdout now use the "surrogateescape" error +handler, instead of "strict", on the POSIX locale (when the C locale is not +coerced and the UTF-8 Mode is disabled). 
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index fc4ee06f144f..539d62a2f0f4 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -423,13 +423,13 @@ get_default_standard_stream_error_handler(void) { const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL) { - /* "surrogateescape" is the default in the legacy C locale */ - if (strcmp(ctype_loc, "C") == 0) { + /* surrogateescape is the default in the legacy C and POSIX locales */ + if (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0) { return "surrogateescape"; } #ifdef PY_COERCE_C_LOCALE - /* "surrogateescape" is the default in locale coercion target locales */ + /* surrogateescape is the default in locale coercion target locales */ const _LocaleCoercionTarget *target = NULL; for (target = _TARGET_LOCALES; target->locale_name; target++) { if (strcmp(ctype_loc, target->locale_name) == 0) { @@ -440,7 +440,7 @@ get_default_standard_stream_error_handler(void) } /* Otherwise return NULL to request the typical default error handler */ - return NULL; + return "strict"; } #ifdef PY_COERCE_C_LOCALE @@ -1851,20 +1851,42 @@ init_sys_streams(PyInterpreterState *interp) if (err) { *err = '\0'; err++; - if (*err && !errors) { - errors = err; + if (!err[0]) { + err = NULL; } } - if (*pythonioencoding && !encoding) { - encoding = pythonioencoding; + + /* Does PYTHONIOENCODING contain an encoding? */ + if (pythonioencoding[0]) { + if (!encoding) { + encoding = pythonioencoding; + } + + /* If the encoding is set but not the error handler, + use "strict" error handler by default. + PYTHONIOENCODING=latin1 behaves as + PYTHONIOENCODING=latin1:strict. 
*/ + if (!err) { + err = "strict"; + } + } + + if (!errors && err != NULL) { + errors = err; } } - else if (interp->core_config.utf8_mode) { - encoding = "utf-8"; - errors = "surrogateescape"; + + if (interp->core_config.utf8_mode) { + if (!encoding) { + encoding = "utf-8"; + } + if (!errors) { + errors = "surrogateescape"; + } } - if (!errors && !pythonioencoding) { + + if (!errors) { /* Choose the default error handler based on the current locale */ errors = get_default_standard_stream_error_handler(); }

1 0

bpo-34485: stdout uses surrogateescape on POSIX locale (GH-8986)
by Victor Stinner Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/315877dc361d554bec34b4b62c270479ad… commit: 315877dc361d554bec34b4b62c270479ad36a1be branch: master author: Victor Stinner <vstinner(a)redhat.com> committer: GitHub <noreply(a)github.com> date: 2018-08-29T09:58:12+02:00 summary: bpo-34485: stdout uses surrogateescape on POSIX locale (GH-8986) Standard streams like sys.stdout now use the "surrogateescape" error handler, instead of "strict", on the POSIX locale (when the C locale is not coerced and the UTF-8 Mode is disabled). Add tests on sys.stdout.errors with LC_ALL=POSIX. files: A Misc/NEWS.d/next/Core and Builtins/2018-08-29-09-27-47.bpo-34485.5aJCmw.rst M Lib/test/test_sys.py M Python/pylifecycle.c diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 005c82d13dc7..f3dd3bb67b38 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -654,10 +654,10 @@ def test_getfilesystemencoding(self): expected = None self.check_fsencoding(fs_encoding, expected) - def c_locale_get_error_handler(self, isolated=False, encoding=None): + def c_locale_get_error_handler(self, locale, isolated=False, encoding=None): # Force the POSIX locale env = os.environ.copy() - env["LC_ALL"] = "C" + env["LC_ALL"] = locale env["PYTHONCOERCECLOCALE"] = "0" code = '\n'.join(( 'import sys', @@ -683,44 +683,50 @@ def c_locale_get_error_handler(self, isolated=False, encoding=None): stdout, stderr = p.communicate() return stdout - def test_c_locale_surrogateescape(self): - out = self.c_locale_get_error_handler(isolated=True) + def check_locale_surrogateescape(self, locale): + out = self.c_locale_get_error_handler(locale, isolated=True) self.assertEqual(out, 'stdin: surrogateescape\n' 'stdout: surrogateescape\n' 'stderr: backslashreplace\n') # replace the default error handler - out = self.c_locale_get_error_handler(encoding=':ignore') + out = self.c_locale_get_error_handler(locale, encoding=':ignore') self.assertEqual(out, 'stdin: ignore\n' 'stdout: ignore\n' 'stderr: backslashreplace\n') # 
force the encoding - out = self.c_locale_get_error_handler(encoding='iso8859-1') + out = self.c_locale_get_error_handler(locale, encoding='iso8859-1') self.assertEqual(out, 'stdin: strict\n' 'stdout: strict\n' 'stderr: backslashreplace\n') - out = self.c_locale_get_error_handler(encoding='iso8859-1:') + out = self.c_locale_get_error_handler(locale, encoding='iso8859-1:') self.assertEqual(out, 'stdin: strict\n' 'stdout: strict\n' 'stderr: backslashreplace\n') # have no any effect - out = self.c_locale_get_error_handler(encoding=':') + out = self.c_locale_get_error_handler(locale, encoding=':') self.assertEqual(out, 'stdin: surrogateescape\n' 'stdout: surrogateescape\n' 'stderr: backslashreplace\n') - out = self.c_locale_get_error_handler(encoding='') + out = self.c_locale_get_error_handler(locale, encoding='') self.assertEqual(out, 'stdin: surrogateescape\n' 'stdout: surrogateescape\n' 'stderr: backslashreplace\n') + def test_c_locale_surrogateescape(self): + self.check_locale_surrogateescape('C') + + def test_posix_locale_surrogateescape(self): + self.check_locale_surrogateescape('POSIX') + def test_implementation(self): # This test applies to all implementations equally. diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-29-09-27-47.bpo-34485.5aJCmw.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-29-09-27-47.bpo-34485.5aJCmw.rst new file mode 100644 index 000000000000..893e4f573f16 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-08-29-09-27-47.bpo-34485.5aJCmw.rst @@ -0,0 +1,3 @@ +Standard streams like sys.stdout now use the "surrogateescape" error +handler, instead of "strict", on the POSIX locale (when the C locale is not +coerced and the UTF-8 Mode is disabled). 
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 8c77859209ab..33af06ec18b9 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -345,13 +345,13 @@ get_stdio_errors(void) { const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL) { - /* "surrogateescape" is the default in the legacy C locale */ - if (strcmp(ctype_loc, "C") == 0) { + /* surrogateescape is the default in the legacy C and POSIX locales */ + if (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0) { return "surrogateescape"; } #ifdef PY_COERCE_C_LOCALE - /* "surrogateescape" is the default in locale coercion target locales */ + /* surrogateescape is the default in locale coercion target locales */ const _LocaleCoercionTarget *target = NULL; for (target = _TARGET_LOCALES; target->locale_name; target++) { if (strcmp(ctype_loc, target->locale_name) == 0) { @@ -1791,16 +1791,29 @@ init_sys_streams(PyInterpreterState *interp) if (err) { *err = '\0'; err++; - if (*err && !errors) { - errors = err; + if (!err[0]) { + err = NULL; } } - if (!encoding && *pythonioencoding) { - encoding = pythonioencoding; - if (!errors) { - errors = "strict"; + + /* Does PYTHONIOENCODING contain an encoding? */ + if (pythonioencoding[0]) { + if (!encoding) { + encoding = pythonioencoding; + } + + /* If the encoding is set but not the error handler, + use "strict" error handler by default. + PYTHONIOENCODING=latin1 behaves as + PYTHONIOENCODING=latin1:strict. */ + if (!err) { + err = "strict"; } } + + if (!errors && err != NULL) { + errors = err; + } } if (interp->core_config.utf8_mode) {

1 0

Improve commutativity of math.hypot() and math.dist() (GH-8984)
by Raymond Hettinger Aug. 29, 2018

Aug. 29, 2018

https://github.com/python/cpython/commit/21786f5186383e8912e761eccd0f4ac1cca83217 commit: 21786f5186383e8912e761eccd0f4ac1cca83217 branch: master author: Raymond Hettinger <rhettinger(a)users.noreply.github.com> committer: GitHub <noreply(a)github.com> date: 2018-08-28T22:47:24-07:00 summary: Improve commutativity of math.hypot() and math.dist() (GH-8984) files: M Modules/mathmodule.c diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 62d327998fd1..37934f60e9c4 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2037,26 +2037,32 @@ where *max* is the largest value in the vector, compute: max * sqrt(sum((x / max) ** 2 for x in vec)) -When a maximum value is found, it is swapped to the end. This -lets us skip one loop iteration and just add 1.0 at the end. -Saving the largest value for last also helps improve accuracy. - -Kahan summation is used to improve accuracy. The *csum* -variable tracks the cumulative sum and *frac* tracks -fractional round-off error for the most recent addition. - The value of the *max* variable must be present in *vec* or should equal to 0.0 when n==0. Likewise, *max* will be INF if an infinity is present in the vec. The *found_nan* variable indicates whether some member of the *vec* is a NaN. + +To improve accuracy and to increase the number of cases where +vector_norm() is commutative, we use a variant of Neumaier +summation specialized to exploit that we always know that +|csum| >= |x|. + +The *csum* variable tracks the cumulative sum and *frac* tracks +the cumulative fractional errors at each step. Since this +variant assumes that |csum| >= |x| at each step, we establish +the precondition by starting the accumulation from 1.0 which +represents an entry equal to *max*. This also provides a nice +side benefit in that it lets us skip over a *max* entry (which +is swapped into *last*) saving us one iteration through the loop. 
+ */ static inline double vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) { - double x, csum = 0.0, oldcsum, frac = 0.0, last; + double x, csum = 1.0, oldcsum, frac = 0.0, last; Py_ssize_t i; if (Py_IS_INFINITY(max)) { @@ -2078,14 +2084,14 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) last = max; } x /= max; - x = x*x - frac; + x = x*x; + assert(csum >= x); oldcsum = csum; csum += x; - frac = (csum - oldcsum) - x; + frac += (oldcsum - csum) + x; } assert(last == max); - csum += 1.0 - frac; - return max * sqrt(csum); + return max * sqrt(csum + frac); } #define NUM_STACK_ELEMS 16

1 0