[Python-checkins] cpython: The locale decoder raises a UnicodeDecodeError instead of an OSError

victor.stinner python-checkins at python.org
Sat Dec 17 07:07:48 CET 2011


http://hg.python.org/cpython/rev/ea421c534305
changeset:   74010:ea421c534305
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Sat Dec 17 07:08:30 2011 +0100
summary:
  The locale decoder raises a UnicodeDecodeError instead of an OSError

Search for the invalid character using mbrtowc().

files:
  Objects/unicodeobject.c |  105 +++++++++++++++++++++++----
  1 files changed, 87 insertions(+), 18 deletions(-)


diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3084,9 +3084,7 @@
 #endif
     char outbuf[MB_LEN_MAX];
     const wchar_t *start, *previous;
-    int save_errno;
-
-    save_errno = errno;
+
 #if SIZEOF_WCHAR_T == 2
     buf[2] = 0;
 #else
@@ -3114,14 +3112,11 @@
         wstr++;
 #endif
         len = wcstombs(outbuf, buf, sizeof(outbuf));
-        if (len == (size_t)-1) {
-            errno = save_errno;
+        if (len == (size_t)-1)
             return previous - start;
-        }
     }
 
     /* failed to find the unencodable character */
-    errno = save_errno;
     return 0;
 }
 
@@ -3199,7 +3194,7 @@
 
         len = wcstombs(NULL, wstr, 0);
         if (len == (size_t)-1) {
-            error_pos = wcstombs_errorpos(wstr);
+            error_pos = (size_t)-1;
             goto encode_error;
         }
 
@@ -3211,7 +3206,7 @@
 
         len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
         if (len2 == (size_t)-1 || len2 > len) {
-            error_pos = wcstombs_errorpos(wstr);
+            error_pos = (size_t)-1;
             goto encode_error;
         }
         PyMem_Free(wstr);
@@ -3221,12 +3216,23 @@
 encode_error:
     errmsg = strerror(errno);
     assert(errmsg != NULL);
+
+    if (error_pos == (size_t)-1)
+        error_pos = wcstombs_errorpos(wstr);
+
     PyMem_Free(wstr);
     Py_XDECREF(bytes);
 
-    if (errmsg != NULL)
-        reason = PyUnicode_DecodeLocale(errmsg, "surrogateescape");
-    else
+    if (errmsg != NULL) {
+        size_t errlen;
+        wstr = _Py_char2wchar(errmsg, &errlen);
+        if (wstr != NULL) {
+            reason = PyUnicode_FromWideChar(wstr, errlen);
+            PyMem_Free(wstr);
+        } else
+            errmsg = NULL;
+    }
+    if (errmsg == NULL)
         reason = PyUnicode_FromString(
             "wcstombs() encountered an unencodable "
             "wide character");
@@ -3376,6 +3382,37 @@
     return NULL;
 }
 
+static size_t
+mbstowcs_errorpos(const char *str, size_t len)
+{
+#ifdef HAVE_MBRTOWC
+    const char *start = str;
+    mbstate_t mbs;
+    size_t converted;
+    wchar_t ch;
+
+    memset(&mbs, 0, sizeof mbs);
+    while (len)
+    {
+        converted = mbrtowc(&ch, (char*)str, len, &mbs);
+        if (converted == 0)
+            /* Reached end of string */
+            break;
+        if (converted == (size_t)-1 || converted == (size_t)-2) {
+            /* Conversion error or incomplete character */
+            return str - start;
+        }
+        else {
+            str += converted;
+            len -= converted;
+        }
+    }
+    /* failed to find the undecodable byte sequence */
+    return 0;
+#endif
+    return 0;
+}
+
 PyObject*
 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                               const char *errors)
@@ -3386,6 +3423,9 @@
     size_t wlen, wlen2;
     PyObject *unicode;
     int surrogateescape;
+    size_t error_pos;
+    char *errmsg;
+    PyObject *reason, *exc;
 
     if (locale_error_handler(errors, &surrogateescape) < 0)
         return NULL;
@@ -3415,10 +3455,8 @@
 #else
         wlen = len;
 #endif
-        if (wlen == (size_t)-1) {
-            PyErr_SetFromErrno(PyExc_OSError);
-            return NULL;
-        }
+        if (wlen == (size_t)-1)
+            goto decode_error;
         if (wlen+1 <= smallbuf_len) {
             wstr = smallbuf;
         }
@@ -3436,8 +3474,7 @@
         if (wlen2 == (size_t)-1) {
             if (wstr != smallbuf)
                 PyMem_Free(wstr);
-            PyErr_SetFromErrno(PyExc_OSError);
-            return NULL;
+            goto decode_error;
         }
 #ifdef HAVE_BROKEN_MBSTOWCS
         assert(wlen2 == wlen);
@@ -3447,6 +3484,38 @@
             PyMem_Free(wstr);
     }
     return unicode;
+
+decode_error:
+    errmsg = strerror(errno);
+    assert(errmsg != NULL);
+
+    error_pos = mbstowcs_errorpos(str, len);
+    if (errmsg != NULL) {
+        size_t errlen;
+        wstr = _Py_char2wchar(errmsg, &errlen);
+        if (wstr != NULL) {
+            reason = PyUnicode_FromWideChar(wstr, errlen);
+            PyMem_Free(wstr);
+        } else
+            errmsg = NULL;
+    }
+    if (errmsg == NULL)
+        reason = PyUnicode_FromString(
+            "mbstowcs() encountered an invalid multibyte sequence");
+    if (reason == NULL)
+        return NULL;
+
+    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
+                                "locale", str, len,
+                                (Py_ssize_t)error_pos,
+                                (Py_ssize_t)(error_pos+1),
+                                reason);
+    Py_DECREF(reason);
+    if (exc != NULL) {
+        PyCodec_StrictErrors(exc);
+        Py_XDECREF(exc);
+    }
+    return NULL;
 }
 
 PyObject*

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list