[Python-checkins] bpo-29240: Fix locale encodings in UTF-8 Mode (#5170)

Victor Stinner webhook-mailer at python.org
Mon Jan 15 04:45:57 EST 2018


https://github.com/python/cpython/commit/7ed7aead9503102d2ed316175f198104e0cd674c
commit: 7ed7aead9503102d2ed316175f198104e0cd674c
branch: master
author: Victor Stinner <victor.stinner at gmail.com>
committer: GitHub <noreply at github.com>
date: 2018-01-15T10:45:49+01:00
summary:

bpo-29240: Fix locale encodings in UTF-8 Mode (#5170)

Modify locale.localeconv(), time.tzname, os.strerror() and other
functions to ignore the UTF-8 Mode: always use the current locale
encoding.

Changes:

* Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(). On decoding or
  encoding error, they return the position of the error and an error
  message which are used to raise Unicode errors in
  PyUnicode_DecodeLocale() and PyUnicode_EncodeLocale().
* Replace _Py_DecodeCurrentLocale() with _Py_DecodeLocaleEx().
* PyUnicode_DecodeLocale() now uses _Py_DecodeLocaleEx() for all
  cases, especially for the strict error handler.
* Add _Py_DecodeUTF8Ex(): it returns more information on decoding
  errors and supports the strict error handler.
* Rename _Py_EncodeUTF8_surrogateescape() to _Py_EncodeUTF8Ex().
* Replace _Py_EncodeCurrentLocale() with _Py_EncodeLocaleEx().
* Ignore the UTF-8 mode to encode/decode localeconv(), strerror()
  and time zone name.
* PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and
  PyUnicode_EncodeLocale() now ignore the UTF-8 mode: they always use
  the "current" locale.
* Remove _PyUnicode_DecodeCurrentLocale(),
  _PyUnicode_DecodeCurrentLocaleAndSize() and
  _PyUnicode_EncodeCurrentLocale().

files:
M Doc/c-api/sys.rst
M Doc/c-api/unicode.rst
M Include/fileutils.h
M Include/unicodeobject.h
M Modules/_datetimemodule.c
M Modules/_localemodule.c
M Modules/getpath.c
M Modules/readline.c
M Modules/timemodule.c
M Objects/unicodeobject.c
M Python/fileutils.c
M Python/pathconfig.c

diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst
index 20bc7bd3df3..e4da96c493c 100644
--- a/Doc/c-api/sys.rst
+++ b/Doc/c-api/sys.rst
@@ -106,6 +106,16 @@ Operating System Utilities
    surrogate character, escape the bytes using the surrogateescape error
    handler instead of decoding them.
 
+   Encoding, highest priority to lowest priority:
+
+   * ``UTF-8`` on macOS and Android;
+   * ``UTF-8`` if the Python UTF-8 mode is enabled;
+   * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
+     ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
+     and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
+     ``ISO-8859-1`` encoding.
+   * the current locale encoding.
+
    Return a pointer to a newly allocated wide character string, use
    :c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
    the number of wide characters excluding the null character into ``*size``
@@ -137,6 +147,18 @@ Operating System Utilities
    :ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
    in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
 
+   Encoding, highest priority to lowest priority:
+
+   * ``UTF-8`` on macOS and Android;
+   * ``UTF-8`` if the Python UTF-8 mode is enabled;
+   * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
+     ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
+     and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
+     ``ISO-8859-1`` encoding.
+   * the current locale encoding.
+
+   The function uses the UTF-8 encoding in the Python UTF-8 mode.
+
    Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
    to free the memory. Return ``NULL`` on encoding error or memory allocation
    error
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 45aff1b7e3c..3f6c0559907 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -770,12 +770,20 @@ system.
    :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
    Python startup).
 
+   This function ignores the Python UTF-8 mode.
+
    .. seealso::
 
       The :c:func:`Py_DecodeLocale` function.
 
    .. versionadded:: 3.3
 
+   .. versionchanged:: 3.7
+      The function now also uses the current locale encoding for the
+      ``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale`
+      was used for the ``surrogateescape``, and the current locale encoding was
+      used for ``strict``.
+
 
 .. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)
 
@@ -797,12 +805,20 @@ system.
    :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
    Python startup).
 
+   This function ignores the Python UTF-8 mode.
+
    .. seealso::
 
       The :c:func:`Py_EncodeLocale` function.
 
    .. versionadded:: 3.3
 
+   .. versionchanged:: 3.7
+      The function now also uses the current locale encoding for the
+      ``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale`
+      was used for the ``surrogateescape``, and the current locale encoding was
+      used for ``strict``.
+
 
 File System Encoding
 """"""""""""""""""""
diff --git a/Include/fileutils.h b/Include/fileutils.h
index 2527d84669d..b4f8b11a635 100644
--- a/Include/fileutils.h
+++ b/Include/fileutils.h
@@ -20,18 +20,41 @@ PyAPI_FUNC(char*) _Py_EncodeLocaleRaw(
 #endif
 
 #ifdef Py_BUILD_CORE
+PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
+    const char *arg,
+    Py_ssize_t arglen,
+    wchar_t **wstr,
+    size_t *wlen,
+    const char **reason,
+    int surrogateescape);
+
+PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
+    const wchar_t *text,
+    char **str,
+    size_t *error_pos,
+    const char **reason,
+    int raw_malloc,
+    int surrogateescape);
+
 PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
-    const char *s,
-    Py_ssize_t size,
-    size_t *p_wlen);
+    const char *arg,
+    Py_ssize_t arglen);
 
-PyAPI_FUNC(wchar_t *) _Py_DecodeCurrentLocale(
+PyAPI_FUNC(int) _Py_DecodeLocaleEx(
     const char *arg,
-    size_t *size);
+    wchar_t **wstr,
+    size_t *wlen,
+    const char **reason,
+    int current_locale,
+    int surrogateescape);
 
-PyAPI_FUNC(char*) _Py_EncodeCurrentLocale(
+PyAPI_FUNC(int) _Py_EncodeLocaleEx(
     const wchar_t *text,
-    size_t *error_pos);
+    char **str,
+    size_t *error_pos,
+    const char **reason,
+    int current_locale,
+    int surrogateescape);
 #endif
 
 #ifndef Py_LIMITED_API
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index d263026b562..0274de6733a 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1810,20 +1810,6 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
     PyObject *unicode,
     const char *errors
     );
-
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocale(
-    const char *str,
-    const char *errors);
-
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocaleAndSize(
-    const char *str,
-    Py_ssize_t len,
-    const char *errors);
-
-PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCurrentLocale(
-    PyObject *unicode,
-    const char *errors
-    );
 #endif
 
 /* --- File system encoding ---------------------------------------------- */
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 624196702b6..e68c7c0a1c5 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -696,7 +696,7 @@ static int parse_isoformat_date(const char *dtstr,
     if (NULL == p) {
         return -1;
     }
-    
+
     if (*(p++) != '-') {
         return -2;
     }
diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c
index e364668d3dd..324b694b830 100644
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@@ -572,8 +572,9 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
     if (!PyArg_ParseTuple(args, "sz", &domain, &codeset))
         return NULL;
     codeset = bind_textdomain_codeset(domain, codeset);
-    if (codeset)
+    if (codeset) {
         return PyUnicode_DecodeLocale(codeset, NULL);
+    }
     Py_RETURN_NONE;
 }
 #endif
diff --git a/Modules/getpath.c b/Modules/getpath.c
index 85e737b61d0..e6a3e8e78cd 100644
--- a/Modules/getpath.c
+++ b/Modules/getpath.c
@@ -449,8 +449,8 @@ search_for_exec_prefix(const _PyCoreConfig *core_config,
             n = fread(buf, 1, MAXPATHLEN, f);
             buf[n] = '\0';
             fclose(f);
-            rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n, NULL);
-            if (rel_builddir_path != NULL) {
+            rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n);
+            if (rel_builddir_path) {
                 wcsncpy(exec_prefix, calculate->argv0_path, MAXPATHLEN);
                 exec_prefix[MAXPATHLEN] = L'\0';
                 joinpath(exec_prefix, rel_builddir_path);
diff --git a/Modules/readline.c b/Modules/readline.c
index caf661c81d4..811fca8cd92 100644
--- a/Modules/readline.c
+++ b/Modules/readline.c
@@ -132,13 +132,13 @@ static PyModuleDef readlinemodule;
 static PyObject *
 encode(PyObject *b)
 {
-    return _PyUnicode_EncodeCurrentLocale(b, "surrogateescape");
+    return PyUnicode_EncodeLocale(b, "surrogateescape");
 }
 
 static PyObject *
 decode(const char *s)
 {
-    return _PyUnicode_DecodeCurrentLocale(s, "surrogateescape");
+    return PyUnicode_DecodeLocale(s, "surrogateescape");
 }
 
 
diff --git a/Modules/timemodule.c b/Modules/timemodule.c
index 4e7f9d9482e..b17ab5ae824 100644
--- a/Modules/timemodule.c
+++ b/Modules/timemodule.c
@@ -418,11 +418,11 @@ tmtotuple(struct tm *p
     SET(8, p->tm_isdst);
 #ifdef HAVE_STRUCT_TM_TM_ZONE
     PyStructSequence_SET_ITEM(v, 9,
-        _PyUnicode_DecodeCurrentLocale(p->tm_zone, "surrogateescape"));
+        PyUnicode_DecodeLocale(p->tm_zone, "surrogateescape"));
     SET(10, p->tm_gmtoff);
 #else
     PyStructSequence_SET_ITEM(v, 9,
-        _PyUnicode_DecodeCurrentLocale(zone, "surrogateescape"));
+        PyUnicode_DecodeLocale(zone, "surrogateescape"));
     PyStructSequence_SET_ITEM(v, 10, _PyLong_FromTime_t(gmtoff));
 #endif /* HAVE_STRUCT_TM_TM_ZONE */
 #undef SET
@@ -809,8 +809,7 @@ time_strftime(PyObject *self, PyObject *args)
 #ifdef HAVE_WCSFTIME
             ret = PyUnicode_FromWideChar(outbuf, buflen);
 #else
-            ret = _PyUnicode_DecodeCurrentLocaleAndSize(outbuf, buflen,
-                                                        "surrogateescape");
+            ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, "surrogateescape");
 #endif
             PyMem_Free(outbuf);
             break;
@@ -1541,8 +1540,8 @@ PyInit_timezone(PyObject *m) {
     PyModule_AddIntConstant(m, "altzone", timezone-3600);
 #endif
     PyModule_AddIntConstant(m, "daylight", daylight);
-    otz0 = _PyUnicode_DecodeCurrentLocale(tzname[0], "surrogateescape");
-    otz1 = _PyUnicode_DecodeCurrentLocale(tzname[1], "surrogateescape");
+    otz0 = PyUnicode_DecodeLocale(tzname[0], "surrogateescape");
+    otz1 = PyUnicode_DecodeLocale(tzname[1], "surrogateescape");
     PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
 #else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
     {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index a6e02f478b1..07330119dc3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3327,53 +3327,6 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
     return NULL;
 }
 
-static size_t
-wcstombs_errorpos(const wchar_t *wstr)
-{
-    size_t len;
-#if SIZEOF_WCHAR_T == 2
-    wchar_t buf[3];
-#else
-    wchar_t buf[2];
-#endif
-    char outbuf[MB_LEN_MAX];
-    const wchar_t *start, *previous;
-
-#if SIZEOF_WCHAR_T == 2
-    buf[2] = 0;
-#else
-    buf[1] = 0;
-#endif
-    start = wstr;
-    while (*wstr != L'\0')
-    {
-        previous = wstr;
-#if SIZEOF_WCHAR_T == 2
-        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
-            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
-        {
-            buf[0] = wstr[0];
-            buf[1] = wstr[1];
-            wstr += 2;
-        }
-        else {
-            buf[0] = *wstr;
-            buf[1] = 0;
-            wstr++;
-        }
-#else
-        buf[0] = *wstr;
-        wstr++;
-#endif
-        len = wcstombs(outbuf, buf, sizeof(outbuf));
-        if (len == (size_t)-1)
-            return previous - start;
-    }
-
-    /* failed to find the unencodable character */
-    return 0;
-}
-
 static int
 locale_error_handler(const char *errors, int *surrogateescape)
 {
@@ -3396,130 +3349,60 @@ locale_error_handler(const char *errors, int *surrogateescape)
 }
 
 static PyObject *
-unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale)
+unicode_encode_locale(PyObject *unicode, const char *errors,
+                      int current_locale)
 {
-    Py_ssize_t wlen, wlen2;
-    wchar_t *wstr;
-    char *errmsg;
-    PyObject *bytes, *reason, *exc;
-    size_t error_pos, errlen;
     int surrogateescape;
-
     if (locale_error_handler(errors, &surrogateescape) < 0)
         return NULL;
 
-    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
-    if (wstr == NULL)
+    Py_ssize_t wlen;
+    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
+    if (wstr == NULL) {
         return NULL;
+    }
 
-    wlen2 = wcslen(wstr);
+    Py_ssize_t wlen2 = wcslen(wstr);
     if (wlen2 != wlen) {
         PyMem_Free(wstr);
         PyErr_SetString(PyExc_ValueError, "embedded null character");
         return NULL;
     }
 
-    if (surrogateescape) {
-        /* "surrogateescape" error handler */
-        char *str;
-
-        if (current_locale) {
-            str = _Py_EncodeCurrentLocale(wstr, &error_pos);
-        }
-        else {
-            str = Py_EncodeLocale(wstr, &error_pos);
-        }
-        if (str == NULL) {
-            if (error_pos == (size_t)-1) {
-                PyErr_NoMemory();
-                PyMem_Free(wstr);
-                return NULL;
-            }
-            else {
-                goto encode_error;
+    char *str;
+    size_t error_pos;
+    const char *reason;
+    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
+                                 current_locale, surrogateescape);
+    if (res != 0) {
+        if (res == -2) {
+            PyObject *exc;
+            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
+                    "locale", unicode,
+                    (Py_ssize_t)error_pos,
+                    (Py_ssize_t)(error_pos+1),
+                    reason);
+            if (exc != NULL) {
+                PyCodec_StrictErrors(exc);
+                Py_DECREF(exc);
             }
-        }
-        PyMem_Free(wstr);
-
-        bytes = PyBytes_FromString(str);
-        if (current_locale) {
-            PyMem_RawFree(str);
+            return NULL;
         }
         else {
-            PyMem_Free(str);
-        }
-    }
-    else {
-        /* strict mode */
-        size_t len, len2;
-
-        len = wcstombs(NULL, wstr, 0);
-        if (len == (size_t)-1) {
-            error_pos = (size_t)-1;
-            goto encode_error;
-        }
-
-        bytes = PyBytes_FromStringAndSize(NULL, len);
-        if (bytes == NULL) {
+            PyErr_NoMemory();
             PyMem_Free(wstr);
             return NULL;
         }
-
-        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
-        if (len2 == (size_t)-1 || len2 > len) {
-            Py_DECREF(bytes);
-            error_pos = (size_t)-1;
-            goto encode_error;
-        }
-        PyMem_Free(wstr);
     }
-    return bytes;
-
-encode_error:
-    errmsg = strerror(errno);
-    assert(errmsg != NULL);
-
-    if (error_pos == (size_t)-1)
-        error_pos = wcstombs_errorpos(wstr);
-
     PyMem_Free(wstr);
 
-    wstr = Py_DecodeLocale(errmsg, &errlen);
-    if (wstr != NULL) {
-        reason = PyUnicode_FromWideChar(wstr, errlen);
-        PyMem_RawFree(wstr);
-    } else {
-        errmsg = NULL;
-    }
-
-    if (errmsg == NULL)
-        reason = PyUnicode_FromString(
-            "wcstombs() encountered an unencodable "
-            "wide character");
-    if (reason == NULL)
-        return NULL;
-
-    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
-                                "locale", unicode,
-                                (Py_ssize_t)error_pos,
-                                (Py_ssize_t)(error_pos+1),
-                                reason);
-    Py_DECREF(reason);
-    if (exc != NULL) {
-        PyCodec_StrictErrors(exc);
-        Py_DECREF(exc);
-    }
-    return NULL;
+    PyObject *bytes = PyBytes_FromString(str);
+    PyMem_RawFree(str);
+    return bytes;
 }
 
 PyObject *
 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
-{
-    return unicode_encode_locale(unicode, errors, 0);
-}
-
-PyObject *
-_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors)
 {
     return unicode_encode_locale(unicode, errors, 1);
 }
@@ -3687,51 +3570,11 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
     return NULL;
 }
 
-static size_t
-mbstowcs_errorpos(const char *str, size_t len)
-{
-#ifdef HAVE_MBRTOWC
-    const char *start = str;
-    mbstate_t mbs;
-    size_t converted;
-    wchar_t ch;
-
-    memset(&mbs, 0, sizeof mbs);
-    while (len)
-    {
-        converted = mbrtowc(&ch, str, len, &mbs);
-        if (converted == 0)
-            /* Reached end of string */
-            break;
-        if (converted == (size_t)-1 || converted == (size_t)-2) {
-            /* Conversion error or incomplete character */
-            return str - start;
-        }
-        else {
-            str += converted;
-            len -= converted;
-        }
-    }
-    /* failed to find the undecodable byte sequence */
-    return 0;
-#endif
-    return 0;
-}
-
 static PyObject*
 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
                       int current_locale)
 {
-    wchar_t smallbuf[256];
-    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
-    wchar_t *wstr;
-    size_t wlen, wlen2;
-    PyObject *unicode;
     int surrogateescape;
-    size_t error_pos, errlen;
-    char *errmsg;
-    PyObject *exc, *reason = NULL;   /* initialize to prevent gcc warning */
-
     if (locale_error_handler(errors, &surrogateescape) < 0)
         return NULL;
 
@@ -3740,113 +3583,47 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
         return NULL;
     }
 
-    if (surrogateescape) {
-        /* "surrogateescape" error handler */
-        if (current_locale) {
-            wstr = _Py_DecodeCurrentLocale(str, &wlen);
+    wchar_t *wstr;
+    size_t wlen;
+    const char *reason;
+    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
+                                 current_locale, surrogateescape);
+    if (res != 0) {
+        if (res == -2) {
+            PyObject *exc;
+            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
+                                        "locale", str, len,
+                                        (Py_ssize_t)wlen,
+                                        (Py_ssize_t)(wlen + 1),
+                                        reason);
+            if (exc != NULL) {
+                PyCodec_StrictErrors(exc);
+                Py_DECREF(exc);
+            }
         }
         else {
-            wstr = Py_DecodeLocale(str, &wlen);
-        }
-        if (wstr == NULL) {
-            if (wlen == (size_t)-1)
-                PyErr_NoMemory();
-            else
-                PyErr_SetFromErrno(PyExc_OSError);
-            return NULL;
+            PyErr_NoMemory();
         }
-
-        unicode = PyUnicode_FromWideChar(wstr, wlen);
-        PyMem_RawFree(wstr);
+        return NULL;
     }
-    else {
-        /* strict mode */
-#ifndef HAVE_BROKEN_MBSTOWCS
-        wlen = mbstowcs(NULL, str, 0);
-#else
-        wlen = len;
-#endif
-        if (wlen == (size_t)-1)
-            goto decode_error;
-        if (wlen+1 <= smallbuf_len) {
-            wstr = smallbuf;
-        }
-        else {
-            wstr = PyMem_New(wchar_t, wlen+1);
-            if (!wstr)
-                return PyErr_NoMemory();
-        }
 
-        wlen2 = mbstowcs(wstr, str, wlen+1);
-        if (wlen2 == (size_t)-1) {
-            if (wstr != smallbuf)
-                PyMem_Free(wstr);
-            goto decode_error;
-        }
-#ifdef HAVE_BROKEN_MBSTOWCS
-        assert(wlen2 == wlen);
-#endif
-        unicode = PyUnicode_FromWideChar(wstr, wlen2);
-        if (wstr != smallbuf)
-            PyMem_Free(wstr);
-    }
+    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
+    PyMem_RawFree(wstr);
     return unicode;
-
-decode_error:
-    errmsg = strerror(errno);
-    assert(errmsg != NULL);
-
-    error_pos = mbstowcs_errorpos(str, len);
-    wstr = Py_DecodeLocale(errmsg, &errlen);
-    if (wstr != NULL) {
-        reason = PyUnicode_FromWideChar(wstr, errlen);
-        PyMem_RawFree(wstr);
-    }
-
-    if (reason == NULL)
-        reason = PyUnicode_FromString(
-            "mbstowcs() encountered an invalid multibyte sequence");
-    if (reason == NULL)
-        return NULL;
-
-    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
-                                "locale", str, len,
-                                (Py_ssize_t)error_pos,
-                                (Py_ssize_t)(error_pos+1),
-                                reason);
-    Py_DECREF(reason);
-    if (exc != NULL) {
-        PyCodec_StrictErrors(exc);
-        Py_DECREF(exc);
-    }
-    return NULL;
 }
 
 PyObject*
 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                               const char *errors)
-{
-    return unicode_decode_locale(str, len, errors, 0);
-}
-
-PyObject*
-_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len,
-                                      const char *errors)
 {
     return unicode_decode_locale(str, len, errors, 1);
 }
 
-PyObject*
-_PyUnicode_DecodeCurrentLocale(const char *str, const char *errors)
-{
-    return unicode_decode_locale(str, (Py_ssize_t)strlen(str), errors, 1);
-}
-
 PyObject*
 PyUnicode_DecodeLocale(const char *str, const char *errors)
 {
     Py_ssize_t size = (Py_ssize_t)strlen(str);
-    return unicode_decode_locale(str, size, errors, 0);
+    return unicode_decode_locale(str, size, errors, 1);
 }
 
 
@@ -3878,7 +3655,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
                                 Py_FileSystemDefaultEncodeErrors);
     }
     else {
-        return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
+        return unicode_decode_locale(s, size,
+                                     Py_FileSystemDefaultEncodeErrors, 0);
     }
 #endif
 }
@@ -5128,17 +4906,23 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
 }
 
 
-/* UTF-8 decoder using the surrogateescape error handler .
+/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
+   non-zero, use strict error handler otherwise.
 
-   On success, return a pointer to a newly allocated wide character string (use
-   PyMem_RawFree() to free the memory) and write the output length (in number
-   of wchar_t units) into *p_wlen (if p_wlen is set).
+   On success, write a pointer to a newly allocated wide character string into
+   *wstr (use PyMem_RawFree() to free the memory) and write the output length
+   (in number of wchar_t units) into *wlen (if wlen is set).
 
-   On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen
-   (if p_wlen is set). */
-wchar_t*
-_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
+   On memory allocation failure, return -1.
+
+   On decoding error (if surrogateescape is zero), return -2. If wlen is
+   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
+   is not NULL, write the decoding error message into *reason. */
+int
+_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
+                 const char **reason, int surrogateescape)
 {
+    const char *orig_s = s;
     const char *e;
     wchar_t *unicode;
     Py_ssize_t outpos;
@@ -5146,18 +4930,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
     /* Note: size will always be longer than the resulting Unicode
        character count */
     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
-        if (p_wlen) {
-            *p_wlen = (size_t)-1;
-        }
-        return NULL;
+        return -1;
     }
 
     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
     if (!unicode) {
-        if (p_wlen) {
-            *p_wlen = (size_t)-1;
-        }
-        return NULL;
+        return -1;
     }
 
     /* Unpack UTF-8 encoded data */
@@ -5175,7 +4953,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
             Py_UNREACHABLE();
 #else
             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
-            /*  compute and append the two surrogates: */
+            /* write a surrogate pair */
             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
 #endif
@@ -5183,60 +4961,88 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
         else {
             if (!ch && s == e)
                 break;
+            if (!surrogateescape) {
+                PyMem_RawFree(unicode );
+                if (reason != NULL) {
+                    switch (ch) {
+                    case 0:
+                        *reason = "unexpected end of data";
+                        break;
+                    case 1:
+                        *reason = "invalid start byte";
+                        break;
+                    /* 2, 3, 4 */
+                    default:
+                        *reason = "invalid continuation byte";
+                        break;
+                    }
+                }
+                if (wlen != NULL) {
+                    *wlen = s - orig_s;
+                }
+                return -2;
+            }
             /* surrogateescape */
             unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
         }
     }
     unicode[outpos] = L'\0';
-    if (p_wlen) {
-        *p_wlen = outpos;
+    if (wlen) {
+        *wlen = outpos;
     }
-    return unicode;
+    *wstr = unicode;
+    return 0;
+}
+
+wchar_t*
+_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
+{
+    wchar_t *wstr;
+    int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
+    if (res != 0) {
+        return NULL;
+    }
+    return wstr;
 }
 
 
 /* UTF-8 encoder using the surrogateescape error handler .
 
-   On success, return a pointer to a newly allocated character string (use
-   PyMem_Free() to free the memory).
+   On success, return 0 and write the newly allocated character string (use
+   PyMem_Free() to free the memory) into *str.
 
-   On encoding failure, return NULL and write the position of the invalid
-   surrogate character into *error_pos (if error_pos is set).
+   On encoding failure, return -2 and write the position of the invalid
+   surrogate character into *error_pos (if error_pos is set) and the encoding
+   error message into *reason (if reason is set).
 
-   On memory allocation failure, return NULL and write (size_t)-1 into
-   *error_pos (if error_pos is set). */
-char*
-_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
-                               int raw_malloc)
+   On memory allocation failure, return -1. */
+int
+_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
+                 const char **reason, int raw_malloc, int surrogateescape)
 {
     const Py_ssize_t max_char_size = 4;
     Py_ssize_t len = wcslen(text);
 
     assert(len >= 0);
 
+    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
+        return -1;
+    }
     char *bytes;
-    if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
-        if (raw_malloc) {
-            bytes = PyMem_RawMalloc((len + 1) * max_char_size);
-        }
-        else {
-            bytes = PyMem_Malloc((len + 1) * max_char_size);
-        }
+    if (raw_malloc) {
+        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
     }
     else {
-        bytes = NULL;
+        bytes = PyMem_Malloc((len + 1) * max_char_size);
     }
     if (bytes == NULL) {
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
+        return -1;
     }
 
     char *p = bytes;
     Py_ssize_t i;
-    for (i = 0; i < len;) {
-        Py_UCS4 ch = text[i++];
+    for (i = 0; i < len; i++) {
+        Py_UCS4 ch = text[i];
 
         if (ch < 0x80) {
             /* Encode ASCII */
@@ -5250,11 +5056,20 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
         }
         else if (Py_UNICODE_IS_SURROGATE(ch)) {
             /* surrogateescape error handler */
-            if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
                 if (error_pos != NULL) {
-                    *error_pos = (size_t)i - 1;
+                    *error_pos = (size_t)i;
                 }
-                goto error;
+                if (reason != NULL) {
+                    *reason = "encoding error";
+                }
+                if (raw_malloc) {
+                    PyMem_RawFree(bytes);
+                }
+                else {
+                    PyMem_Free(bytes);
+                }
+                return -2;
             }
             *p++ = (char)(ch & 0xff);
         }
@@ -5286,18 +5101,16 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
         if (error_pos != NULL) {
             *error_pos = (size_t)-1;
         }
-        goto error;
-    }
-    return bytes2;
-
- error:
-    if (raw_malloc) {
-        PyMem_RawFree(bytes);
-    }
-    else {
-        PyMem_Free(bytes);
+        if (raw_malloc) {
+            PyMem_RawFree(bytes);
+        }
+        else {
+            PyMem_Free(bytes);
+        }
+        return -1;
     }
-    return NULL;
+    *str = bytes2;
+    return 0;
 }
 
 
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 9275494e864..a50075eced0 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -20,9 +20,6 @@ extern int winerror_to_errno(int);
 #include <fcntl.h>
 #endif /* HAVE_FCNTL_H */
 
-extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
-                                            size_t *error_pos, int raw_malloc);
-
 #ifdef O_CLOEXEC
 /* Does open() support the O_CLOEXEC flag? Possible values:
 
@@ -69,7 +66,10 @@ _Py_device_encoding(int fd)
     Py_RETURN_NONE;
 }
 
-#if !defined(__APPLE__) && !defined(MS_WINDOWS)
+#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS)
+
+#define USE_FORCE_ASCII
+
 extern int _Py_normalize_encoding(const char *, char *, size_t);
 
 /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
@@ -90,7 +90,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
 
        1: the workaround is used: Py_EncodeLocale() uses
           encode_ascii_surrogateescape() and Py_DecodeLocale() uses
-          decode_ascii_surrogateescape()
+          decode_ascii()
        0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and
           Py_DecodeLocale() uses mbstowcs()
       -1: unknown, need to call check_force_ascii() to get the value
@@ -180,16 +180,15 @@ check_force_ascii(void)
     return 1;
 }
 
-static char*
-encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_malloc)
+static int
+encode_ascii(const wchar_t *text, char **str,
+             size_t *error_pos, const char **reason,
+             int raw_malloc, int surrogateescape)
 {
     char *result = NULL, *out;
     size_t len, i;
     wchar_t ch;
 
-    if (error_pos != NULL)
-        *error_pos = (size_t)-1;
-
     len = wcslen(text);
 
     /* +1 for NULL byte */
@@ -199,8 +198,9 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
     else {
         result = PyMem_Malloc(len + 1);
     }
-    if (result == NULL)
-        return NULL;
+    if (result == NULL) {
+        return -1;
+    }
 
     out = result;
     for (i=0; i<len; i++) {
@@ -210,60 +210,84 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
             /* ASCII character */
             *out++ = (char)ch;
         }
-        else if (0xdc80 <= ch && ch <= 0xdcff) {
+        else if (surrogateescape && 0xdc80 <= ch && ch <= 0xdcff) {
             /* UTF-8b surrogate */
             *out++ = (char)(ch - 0xdc00);
         }
         else {
-            if (error_pos != NULL) {
-                *error_pos = i;
-            }
             if (raw_malloc) {
                 PyMem_RawFree(result);
             }
             else {
                 PyMem_Free(result);
             }
-            return NULL;
+            if (error_pos != NULL) {
+                *error_pos = i;
+            }
+            if (reason) {
+                *reason = "encoding error";
+            }
+            return -2;
         }
     }
     *out = '\0';
-    return result;
+    *str = result;
+    return 0;
 }
-#endif   /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
+#endif   /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
+
 
-#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
-static wchar_t*
-decode_ascii_surrogateescape(const char *arg, size_t *size)
+#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
+static int
+decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
+             const char **reason, int surrogateescape)
 {
     wchar_t *res;
     unsigned char *in;
     wchar_t *out;
     size_t argsize = strlen(arg) + 1;
 
-    if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
-        return NULL;
-    res = PyMem_RawMalloc(argsize*sizeof(wchar_t));
-    if (!res)
-        return NULL;
+    if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+        return -1;
+    }
+    res = PyMem_RawMalloc(argsize * sizeof(wchar_t));
+    if (!res) {
+        return -1;
+    }
 
-    in = (unsigned char*)arg;
     out = res;
-    while(*in)
-        if(*in < 128)
-            *out++ = *in++;
-        else
-            *out++ = 0xdc00 + *in++;
+    for (in = (unsigned char*)arg; *in; in++) {
+        unsigned char ch = *in;
+        if (ch < 128) {
+            *out++ = ch;
+        }
+        else {
+            if (!surrogateescape) {
+                PyMem_RawFree(res);
+                if (wlen) {
+                    *wlen = in - (unsigned char*)arg;
+                }
+                if (reason) {
+                    *reason = "decoding error";
+                }
+                return -2;
+            }
+            *out++ = 0xdc00 + ch;
+        }
+    }
     *out = 0;
-    if (size != NULL)
-        *size = out - res;
-    return res;
+
+    if (wlen != NULL) {
+        *wlen = out - res;
+    }
+    *wstr = res;
+    return 0;
 }
-#endif
+#endif   /* !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII) */
 
-#if !defined(__APPLE__) && !defined(__ANDROID__)
-static wchar_t*
-decode_current_locale(const char* arg, size_t *size)
+static int
+decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
+                      const char **reason, int surrogateescape)
 {
     wchar_t *res;
     size_t argsize;
@@ -284,15 +308,15 @@ decode_current_locale(const char* arg, size_t *size)
     argsize = mbstowcs(NULL, arg, 0);
 #endif
     if (argsize != (size_t)-1) {
-        if (argsize == PY_SSIZE_T_MAX)
-            goto oom;
-        argsize += 1;
-        if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
-            goto oom;
-        res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t));
-        if (!res)
-            goto oom;
-        count = mbstowcs(res, arg, argsize);
+        if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
+            return -1;
+        }
+        res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t));
+        if (!res) {
+            return -1;
+        }
+
+        count = mbstowcs(res, arg, argsize + 1);
         if (count != (size_t)-1) {
             wchar_t *tmp;
             /* Only use the result if it contains no
@@ -301,13 +325,16 @@ decode_current_locale(const char* arg, size_t *size)
                          !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                 ;
             if (*tmp == 0) {
-                if (size != NULL)
-                    *size = count;
-                return res;
+                if (wlen != NULL) {
+                    *wlen = count;
+                }
+                *wstr = res;
+                return 0;
             }
         }
         PyMem_RawFree(res);
     }
+
     /* Conversion failed. Fall back to escaping with surrogateescape. */
 #ifdef HAVE_MBRTOWC
     /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */
@@ -315,30 +342,37 @@ decode_current_locale(const char* arg, size_t *size)
     /* Overallocate; as multi-byte characters are in the argument, the
        actual output could use less memory. */
     argsize = strlen(arg) + 1;
-    if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
-        goto oom;
-    res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t));
-    if (!res)
-        goto oom;
+    if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+        return -1;
+    }
+    res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t));
+    if (!res) {
+        return -1;
+    }
+
     in = (unsigned char*)arg;
     out = res;
     memset(&mbs, 0, sizeof mbs);
     while (argsize) {
         size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
-        if (converted == 0)
+        if (converted == 0) {
             /* Reached end of string; null char stored. */
             break;
+        }
+
         if (converted == (size_t)-2) {
             /* Incomplete character. This should never happen,
                since we provide everything that we have -
                unless there is a bug in the C library, or I
                misunderstood how mbrtowc works. */
-            PyMem_RawFree(res);
-            if (size != NULL)
-                *size = (size_t)-2;
-            return NULL;
+            goto decode_error;
         }
+
         if (converted == (size_t)-1) {
+            if (!surrogateescape) {
+                goto decode_error;
+            }
+
             /* Conversion error. Escape as UTF-8b, and start over
                in the initial shift state. */
             *out++ = 0xdc00 + *in++;
@@ -346,12 +380,18 @@ decode_current_locale(const char* arg, size_t *size)
             memset(&mbs, 0, sizeof mbs);
             continue;
         }
+
         if (Py_UNICODE_IS_SURROGATE(*out)) {
+            if (!surrogateescape) {
+                goto decode_error;
+            }
+
             /* Surrogate character.  Escape the original
                byte sequence with surrogateescape. */
             argsize -= converted;
-            while (converted--)
+            while (converted--) {
                 *out++ = 0xdc00 + *in++;
+            }
             continue;
         }
         /* successfully converted some bytes */
@@ -359,55 +399,80 @@ decode_current_locale(const char* arg, size_t *size)
         argsize -= converted;
         out++;
     }
-    if (size != NULL)
-        *size = out - res;
+    if (wlen != NULL) {
+        *wlen = out - res;
+    }
+    *wstr = res;
+    return 0;
+
+decode_error:
+    PyMem_RawFree(res);
+    if (wlen) {
+        *wlen = in - (unsigned char*)arg;
+    }
+    if (reason) {
+        *reason = "decoding error";
+    }
+    return -2;
 #else   /* HAVE_MBRTOWC */
     /* Cannot use C locale for escaping; manually escape as if charset
        is ASCII (i.e. escape all bytes > 128. This will still roundtrip
        correctly in the locale's charset, which must be an ASCII superset. */
-    res = decode_ascii_surrogateescape(arg, size);
-    if (res == NULL)
-        goto oom;
+    return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
 #endif   /* HAVE_MBRTOWC */
-    return res;
-
-oom:
-    if (size != NULL) {
-        *size = (size_t)-1;
-    }
-    return NULL;
 }
-#endif
 
 
-static wchar_t*
-decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
+/* Decode a byte string from the locale encoding.
+
+   Use the strict error handler if 'surrogateescape' is zero.  Use the
+   surrogateescape error handler if 'surrogateescape' is non-zero: undecodable
+   bytes are decoded as characters in range U+DC80..U+DCFF. If a byte sequence
+   can be decoded as a surrogate character, escape the bytes using the
+   surrogateescape error handler instead of decoding them.
+
+   On success, return 0 and write the newly allocated wide character string into
+   *wstr (use PyMem_RawFree() to free the memory). If wlen is not NULL, write
+   the number of wide characters excluding the null character into *wlen.
+
+   On memory allocation failure, return -1.
+
+   On decoding error, return -2. If wlen is not NULL, write the start of
+   invalid byte sequence in the input string into *wlen. If reason is not NULL,
+   write the decoding error message into *reason.
+
+   Use the _Py_EncodeLocaleEx() function to encode the character string back to
+   a byte string. */
+int
+_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
+                   const char **reason,
+                   int current_locale, int surrogateescape)
 {
+    if (current_locale) {
+        return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
+    }
+
 #if defined(__APPLE__) || defined(__ANDROID__)
-    return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+    return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+                            surrogateescape);
 #else
-    if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
-        return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+    if (Py_UTF8Mode == 1) {
+        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+                                surrogateescape);
     }
 
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
+#ifdef USE_FORCE_ASCII
+    if (force_ascii == -1) {
         force_ascii = check_force_ascii();
+    }
 
     if (force_ascii) {
         /* force ASCII encoding to workaround mbstowcs() issue */
-        wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
-        if (wstr == NULL) {
-            if (size != NULL) {
-                *size = (size_t)-1;
-            }
-            return NULL;
-        }
-        return wstr;
+        return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
     }
 #endif
 
-    return decode_current_locale(arg, size);
+    return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
 #endif   /* __APPLE__ or __ANDROID__ */
 }
 
@@ -432,23 +497,24 @@ decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
    Use the Py_EncodeLocale() function to encode the character string back to a
    byte string. */
 wchar_t*
-Py_DecodeLocale(const char* arg, size_t *size)
+Py_DecodeLocale(const char* arg, size_t *wlen)
 {
-    return decode_locale(arg, size, 0);
-}
-
-
-/* Similar to Py_DecodeLocale() but ignore the UTF-8 mode */
-wchar_t*
-_Py_DecodeCurrentLocale(const char* arg, size_t *size)
-{
-    return decode_locale(arg, size, 1);
+    wchar_t *wstr;
+    int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
+    if (res != 0) {
+        if (wlen != NULL) {
+            *wlen = (size_t)res;
+        }
+        return NULL;
+    }
+    return wstr;
 }
 
 
-#if !defined(__APPLE__) && !defined(__ANDROID__)
-static char*
-encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
+static int
+encode_current_locale(const wchar_t *text, char **str,
+                      size_t *error_pos, const char **reason,
+                      int raw_malloc, int surrogateescape)
 {
     const size_t len = wcslen(text);
     char *result = NULL, *bytes = NULL;
@@ -464,38 +530,37 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
         for (i=0; i < len; i++) {
             c = text[i];
             if (c >= 0xdc80 && c <= 0xdcff) {
+                if (!surrogateescape) {
+                    goto encode_error;
+                }
                 /* UTF-8b surrogate */
                 if (bytes != NULL) {
                     *bytes++ = c - 0xdc00;
                     size--;
                 }
-                else
+                else {
                     size++;
+                }
                 continue;
             }
             else {
                 buf[0] = c;
-                if (bytes != NULL)
+                if (bytes != NULL) {
                     converted = wcstombs(bytes, buf, size);
-                else
+                }
+                else {
                     converted = wcstombs(NULL, buf, 0);
+                }
                 if (converted == (size_t)-1) {
-                    if (raw_malloc) {
-                        PyMem_RawFree(result);
-                    }
-                    else {
-                        PyMem_Free(result);
-                    }
-                    if (error_pos != NULL)
-                        *error_pos = i;
-                    return NULL;
+                    goto encode_error;
                 }
                 if (bytes != NULL) {
                     bytes += converted;
                     size -= converted;
                 }
-                else
+                else {
                     size += converted;
+                }
             }
         }
         if (result != NULL) {
@@ -511,40 +576,80 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
             result = PyMem_Malloc(size);
         }
         if (result == NULL) {
-            if (error_pos != NULL) {
-                *error_pos = (size_t)-1;
-            }
-            return NULL;
+            return -1;
         }
         bytes = result;
     }
-    return result;
+    *str = result;
+    return 0;
+
+encode_error:
+    if (raw_malloc) {
+        PyMem_RawFree(result);
+    }
+    else {
+        PyMem_Free(result);
+    }
+    if (error_pos != NULL) {
+        *error_pos = i;
+    }
+    if (reason) {
+        *reason = "encoding error";
+    }
+    return -2;
 }
-#endif
 
-static char*
-encode_locale(const wchar_t *text, size_t *error_pos,
-              int raw_malloc, int ignore_utf8_mode)
+static int
+encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
+                 const char **reason,
+                 int raw_malloc, int current_locale, int surrogateescape)
 {
+    if (current_locale) {
+        return encode_current_locale(text, str, error_pos, reason,
+                                     raw_malloc, surrogateescape);
+    }
+
 #if defined(__APPLE__) || defined(__ANDROID__)
-    return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
+    return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
+                            raw_malloc, surrogateescape);
 #else   /* __APPLE__ */
-    if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
-        return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
+    if (Py_UTF8Mode == 1) {
+        return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
+                                raw_malloc, surrogateescape);
     }
 
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
+#ifdef USE_FORCE_ASCII
+    if (force_ascii == -1) {
         force_ascii = check_force_ascii();
+    }
 
-    if (force_ascii)
-        return encode_ascii_surrogateescape(text, error_pos, raw_malloc);
+    if (force_ascii) {
+        return encode_ascii(text, str, error_pos, reason,
+                            raw_malloc, surrogateescape);
+    }
 #endif
 
-    return encode_current_locale(text, error_pos, raw_malloc);
+    return encode_current_locale(text, str, error_pos, reason,
+                                 raw_malloc, surrogateescape);
 #endif   /* __APPLE__ or __ANDROID__ */
 }
 
+static char*
+encode_locale(const wchar_t *text, size_t *error_pos,
+              int raw_malloc, int current_locale)
+{
+    char *str;
+    int res = encode_locale_ex(text, &str, error_pos, NULL,
+                               raw_malloc, current_locale, 1);
+    if (res != -2 && error_pos) {
+        *error_pos = (size_t)-1;
+    }
+    if (res != 0) {
+        return NULL;
+    }
+    return str;
+}
+
 /* Encode a wide character string to the locale encoding with the
    surrogateescape error handler: surrogate characters in the range
    U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
@@ -573,11 +678,13 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
 }
 
 
-/* Similar to _Py_EncodeLocaleRaw() but ignore the UTF-8 Mode */
-char*
-_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos)
+int
+_Py_EncodeLocaleEx(const wchar_t *text, char **str,
+                   size_t *error_pos, const char **reason,
+                   int current_locale, int surrogateescape)
 {
-    return encode_locale(text, error_pos, 1, 1);
+    return encode_locale_ex(text, str, error_pos, reason, 1,
+                            current_locale, surrogateescape);
 }
 
 
diff --git a/Python/pathconfig.c b/Python/pathconfig.c
index 9591fcc4911..7ebd69bf622 100644
--- a/Python/pathconfig.c
+++ b/Python/pathconfig.c
@@ -382,8 +382,8 @@ _Py_FindEnvConfigValue(FILE *env_file, const wchar_t *key,
             /* Comment - skip */
             continue;
         }
-        tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n, NULL);
-        if (tmpbuffer != NULL) {
+        tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n);
+        if (tmpbuffer) {
             wchar_t * state;
             wchar_t * tok = wcstok(tmpbuffer, L" \t\r\n", &state);
             if ((tok != NULL) && !wcscmp(tok, key)) {



More information about the Python-checkins mailing list