[Python-checkins] r82037 - in python/branches/py3k: Doc/library/codecs.rst Lib/ctypes/init.py Lib/test/test_codecs.py Misc/NEWS Objects/unicodeobject.c

Thu Jun 17 01:33:54 CEST 2010

Author: victor.stinner
Date: Thu Jun 17 01:33:54 2010
New Revision: 82037

Log:
Issue #850997: mbcs encoding (Windows only) handles errors argument: strict
mode raises unicode errors. The encoder only supports "strict" and "replace"
error handlers, the decoder only supports "strict" and "ignore" error handlers.


Modified:
   python/branches/py3k/Doc/library/codecs.rst
   python/branches/py3k/Lib/ctypes/__init__.py
   python/branches/py3k/Lib/test/test_codecs.py
   python/branches/py3k/Misc/NEWS
   python/branches/py3k/Objects/unicodeobject.c

Modified: python/branches/py3k/Doc/library/codecs.rst
==============================================================================

--- python/branches/py3k/Doc/library/codecs.rst	(original)
+++ python/branches/py3k/Doc/library/codecs.rst	Thu Jun 17 01:33:54 2010
@@ -1223,6 +1223,23 @@
    Convert a label to Unicode, as specified in :rfc:`3490`.
 
 
+:mod:`encodings.mbcs` --- Windows ANSI codepage
+-----------------------------------------------
+
+.. module:: encodings.mbcs
+   :synopsis: Windows ANSI codepage
+
+Encode operand according to the ANSI codepage (CP_ACP). This codec only
+supports ``'strict'`` and ``'replace'`` error handlers to encode, and
+``'strict'`` and ``'ignore'`` error handlers to decode.
+
+Availability: Windows only.
+
+.. versionchanged:: 3.2
+   Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
+   to encode, and ``'ignore'`` to decode.
+
+
 :mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
 -------------------------------------------------------------
 

Modified: python/branches/py3k/Lib/ctypes/__init__.py
==============================================================================
--- python/branches/py3k/Lib/ctypes/__init__.py	(original)
+++ python/branches/py3k/Lib/ctypes/__init__.py	Thu Jun 17 01:33:54 2010
@@ -265,7 +265,7 @@
     pass
 else:
     if _os.name in ("nt", "ce"):
-        set_conversion_mode("mbcs", "ignore")
+        set_conversion_mode("mbcs", "strict")
     else:
         set_conversion_mode("ascii", "strict")
 

Modified: python/branches/py3k/Lib/test/test_codecs.py
==============================================================================
--- python/branches/py3k/Lib/test/test_codecs.py	(original)
+++ python/branches/py3k/Lib/test/test_codecs.py	Thu Jun 17 01:33:54 2010
@@ -1358,11 +1358,6 @@
     "idna",
 ]
 
-# The following encodings only support "strict" mode
-only_strict_mode = [
-    "idna",
-]
-
 class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
     def test_basics(self):
         s = "abc123" # all codecs should be able to encode these
@@ -1437,7 +1432,7 @@
                     result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                     self.assertEqual(result, "")
 
-                if encoding not in only_strict_mode:
+                if encoding not in ("idna", "mbcs"):
                     # check incremental decoder/encoder with errors argument
                     try:
                         encoder = codecs.getincrementalencoder(encoding)("ignore")

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Thu Jun 17 01:33:54 2010
@@ -12,6 +12,11 @@
 Core and Builtins
 -----------------
 
+- Issue #850997: mbcs encoding (Windows only) handles errors argument: strict
+  mode raises unicode errors. The encoder only supports "strict" and "replace"
+  error handlers, the decoder only supports "strict" and "ignore" error
+  handlers.
+
 - Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z"
   formats if the string contains a null byte/character. Write unit tests for
   string formats.

Modified: python/branches/py3k/Objects/unicodeobject.c
==============================================================================
--- python/branches/py3k/Objects/unicodeobject.c	(original)
+++ python/branches/py3k/Objects/unicodeobject.c	Thu Jun 17 01:33:54 2010
@@ -1767,6 +1767,33 @@
     return 0;
 }
 
+/* create or adjust a UnicodeDecodeError */
+static void
+make_decode_exception(PyObject **exceptionObject,
+                      const char *encoding,
+                      const char *input, Py_ssize_t length,
+                      Py_ssize_t startpos, Py_ssize_t endpos,
+                      const char *reason)
+{
+    if (*exceptionObject == NULL) {
+        *exceptionObject = PyUnicodeDecodeError_Create(
+            encoding, input, length, startpos, endpos, reason);
+    }
+    else {
+        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
+            goto onError;
+        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
+            goto onError;
+        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
+            goto onError;
+    }
+    return;
+
+onError:
+    Py_DECREF(*exceptionObject);
+    *exceptionObject = NULL;
+}
+
 /* error handling callback helper:
    build arguments, call the callback and check the arguments,
    if no exception occurred, copy the replacement to the output
@@ -1800,20 +1827,13 @@
             goto onError;
     }
 
-    if (*exceptionObject == NULL) {
-        *exceptionObject = PyUnicodeDecodeError_Create(
-            encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
-        if (*exceptionObject == NULL)
-            goto onError;
-    }
-    else {
-        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
-            goto onError;
-        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
-            goto onError;
-        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
-            goto onError;
-    }
+    make_decode_exception(exceptionObject,
+        encoding,
+        *input, *inend - *input,
+        *startinpos, *endinpos,
+        reason);
+    if (*exceptionObject == NULL)
+        goto onError;
 
     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
     if (restuple == NULL)
@@ -4552,32 +4572,46 @@
 static int decode_mbcs(PyUnicodeObject **v,
                        const char *s, /* MBCS string */
                        int size, /* sizeof MBCS string */
-                       int final)
+                       int final,
+                       const char *errors)
 {
     Py_UNICODE *p;
-    Py_ssize_t n = 0;
-    int usize = 0;
+    Py_ssize_t n;
+    DWORD usize;
+    DWORD flags;
 
     assert(size >= 0);
 
+    /* check and handle 'errors' arg */
+    if (errors==NULL || strcmp(errors, "strict")==0)
+        flags = MB_ERR_INVALID_CHARS;
+    else if (strcmp(errors, "ignore")==0)
+        flags = 0;
+    else {
+        PyErr_Format(PyExc_ValueError,
+                     "mbcs encoding does not support errors='%s'",
+                     errors);
+        return -1;
+    }
+
     /* Skip trailing lead-byte unless 'final' is set */
     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
         --size;
 
     /* First get the size of the result */
     if (size > 0) {
-        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
-        if (usize == 0) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
-            return -1;
-        }
-    }
+        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
+        if (usize==0)
+            goto mbcs_decode_error;
+    } else
+        usize = 0;
 
     if (*v == NULL) {
         /* Create unicode object */
         *v = _PyUnicode_New(usize);
         if (*v == NULL)
             return -1;
+        n = 0;
     }
     else {
         /* Extend unicode object */
@@ -4587,15 +4621,35 @@
     }
 
     /* Do the conversion */
-    if (size > 0) {
+    if (usize > 0) {
         p = PyUnicode_AS_UNICODE(*v) + n;
-        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
-            return -1;
+        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
+            goto mbcs_decode_error;
         }
     }
-
     return size;
+
+mbcs_decode_error:
+    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
+       we raise a UnicodeDecodeError - else it is a 'generic'
+       windows error
+     */
+    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
+        /* Ideally, we should get reason from FormatMessage - this
+           is the Windows 2000 English version of the message
+        */
+        PyObject *exc = NULL;
+        const char *reason = "No mapping for the Unicode character exists "
+                             "in the target multi-byte code page.";
+        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_DECREF(exc);
+        }
+    } else {
+        PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    }
+    return -1;
 }
 
 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
@@ -4612,10 +4666,10 @@
 #ifdef NEED_RETRY
   retry:
     if (size > INT_MAX)
-        done = decode_mbcs(&v, s, INT_MAX, 0);
+        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
     else
 #endif
-        done = decode_mbcs(&v, s, (int)size, !consumed);
+        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
 
     if (done < 0) {
         Py_XDECREF(v);
@@ -4649,20 +4703,45 @@
  */
 static int encode_mbcs(PyObject **repr,
                        const Py_UNICODE *p, /* unicode */
-                       int size) /* size of unicode */
+                       int size, /* size of unicode */
+                       const char* errors)
 {
-    int mbcssize = 0;
-    Py_ssize_t n = 0;
+    BOOL usedDefaultChar = FALSE;
+    BOOL *pusedDefaultChar;
+    int mbcssize;
+    Py_ssize_t n;
+    PyObject *exc = NULL;
+    DWORD flags;
 
     assert(size >= 0);
 
+    /* check and handle 'errors' arg */
+    if (errors==NULL || strcmp(errors, "strict")==0) {
+        flags = WC_NO_BEST_FIT_CHARS;
+        pusedDefaultChar = &usedDefaultChar;
+    } else if (strcmp(errors, "replace")==0) {
+        flags = 0;
+        pusedDefaultChar = NULL;
+    } else {
+         PyErr_Format(PyExc_ValueError,
+                      "mbcs encoding does not support errors='%s'",
+                      errors);
+         return -1;
+    }
+
     /* First get the size of the result */
     if (size > 0) {
-        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
+                                       NULL, pusedDefaultChar);
         if (mbcssize == 0) {
             PyErr_SetFromWindowsErrWithFilename(0, NULL);
             return -1;
         }
+        /* If we used a default char, then we failed! */
+        if (pusedDefaultChar && *pusedDefaultChar)
+            goto mbcs_encode_error;
+    } else {
+        mbcssize = 0;
     }
 
     if (*repr == NULL) {
@@ -4670,6 +4749,7 @@
         *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
         if (*repr == NULL)
             return -1;
+        n = 0;
     }
     else {
         /* Extend string object */
@@ -4681,13 +4761,20 @@
     /* Do the conversion */
     if (size > 0) {
         char *s = PyBytes_AS_STRING(*repr) + n;
-        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
+                                     NULL, pusedDefaultChar)) {
             PyErr_SetFromWindowsErrWithFilename(0, NULL);
             return -1;
         }
+        if (pusedDefaultChar && *pusedDefaultChar)
+            goto mbcs_encode_error;
     }
-
     return 0;
+
+mbcs_encode_error:
+    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
+    Py_XDECREF(exc);
+    return -1;
 }
 
 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
@@ -4700,10 +4787,10 @@
 #ifdef NEED_RETRY
   retry:
     if (size > INT_MAX)
-        ret = encode_mbcs(&repr, p, INT_MAX);
+        ret = encode_mbcs(&repr, p, INT_MAX, errors);
     else
 #endif
-        ret = encode_mbcs(&repr, p, (int)size);
+        ret = encode_mbcs(&repr, p, (int)size, errors);
 
     if (ret < 0) {
         Py_XDECREF(repr);


[Python-checkins] r82037 - in python/branches/py3k: Doc/library/codecs.rst Lib/ctypes/__init__.py Lib/test/test_codecs.py Misc/NEWS Objects/unicodeobject.c

[Python-checkins] r82037 - in python/branches/py3k: Doc/library/codecs.rst Lib/ctypes/init.py Lib/test/test_codecs.py Misc/NEWS Objects/unicodeobject.c