[Python-checkins] cpython: Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore

victor.stinner python-checkins at python.org
Tue Oct 18 21:20:04 CEST 2011


http://hg.python.org/cpython/rev/af0800b986b7
changeset:   72976:af0800b986b7
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Tue Oct 18 21:21:00 2011 +0200
summary:
  Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore
error handlers on all Windows versions. The MBCS codec is now supporting all
error handlers, instead of only replace to encode and ignore to decode.

files:
  Doc/library/codecs.rst  |    7 +-
  Doc/whatsnew/3.3.rst    |    5 +
  Include/unicodeobject.h |   16 +-
  Lib/test/test_codecs.py |  198 +++++++
  Misc/NEWS               |    4 +
  Modules/_codecsmodule.c |   50 +
  Objects/unicodeobject.c |  782 ++++++++++++++++++++++-----
  Python/pythonrun.c      |    5 +-
  8 files changed, 904 insertions(+), 163 deletions(-)


diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1280,12 +1280,13 @@
 .. module:: encodings.mbcs
    :synopsis: Windows ANSI codepage
 
-Encode operand according to the ANSI codepage (CP_ACP). This codec only
-supports ``'strict'`` and ``'replace'`` error handlers to encode, and
-``'strict'`` and ``'ignore'`` error handlers to decode.
+Encode operand according to the ANSI codepage (CP_ACP).
 
 Availability: Windows only.
 
+.. versionchanged:: 3.3
+   Support any error handler.
+
 .. versionchanged:: 3.2
    Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
    to encode, and ``'ignore'`` to decode.
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -197,6 +197,11 @@
 codecs
 ------
 
+The :mod:`~encodings.mbcs` codec has be rewritten to handle correclty
+``replace`` and ``ignore`` error handlers on all Windows versions. The
+:mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of
+only ``replace`` to encode and ``ignore`` to decode.
+
 Multibyte CJK decoders now resynchronize faster. They only ignore the first
 byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
 'replace')`` now returns a ``\n`` after the replacement character.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1466,6 +1466,14 @@
     Py_ssize_t *consumed        /* bytes consumed */
     );
 
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
+    int code_page,              /* code page number */
+    const char *string,         /* encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
     PyObject *unicode           /* Unicode object */
     );
@@ -1473,11 +1481,17 @@
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
     const Py_UNICODE *data,     /* Unicode char buffer */
-    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
+    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
     const char *errors          /* error handling */
     );
 #endif
 
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
+    int code_page,              /* code page number */
+    PyObject *unicode,          /* Unicode object */
+    const char *errors          /* error handling */
+    );
+
 #endif /* HAVE_MBCS */
 
 /* --- Decimal Encoder ---------------------------------------------------- */
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1744,6 +1744,203 @@
             self.assertEqual(sout, b"\x80")
 
 
+class CodePageTest(unittest.TestCase):
+    CP_UTF8 = 65001
+    vista_or_later = (sys.getwindowsversion().major >= 6)
+
+    def test_invalid_code_page(self):
+        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
+        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
+        self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
+        self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')
+
+    def test_code_page_name(self):
+        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
+            codecs.code_page_encode, 932, '\xff')
+        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
+            codecs.code_page_decode, 932, b'\x81\x00')
+        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
+            codecs.code_page_decode, self.CP_UTF8, b'\xff')
+
+    def check_decode(self, cp, tests):
+        for raw, errors, expected in tests:
+            if expected is not None:
+                try:
+                    decoded = codecs.code_page_decode(cp, raw, errors)
+                except UnicodeDecodeError as err:
+                    self.fail('Unable to decode %a from "cp%s" with '
+                              'errors=%r: %s' % (raw, cp, errors, err))
+                self.assertEqual(decoded[0], expected,
+                    '%a.decode("cp%s", %r)=%a != %a'
+                    % (raw, cp, errors, decoded[0], expected))
+                # assert 0 <= decoded[1] <= len(raw)
+                self.assertGreaterEqual(decoded[1], 0)
+                self.assertLessEqual(decoded[1], len(raw))
+            else:
+                self.assertRaises(UnicodeDecodeError,
+                    codecs.code_page_decode, cp, raw, errors)
+
+    def check_encode(self, cp, tests):
+        for text, errors, expected in tests:
+            if expected is not None:
+                try:
+                    encoded = codecs.code_page_encode(cp, text, errors)
+                except UnicodeEncodeError as err:
+                    self.fail('Unable to encode %a to "cp%s" with '
+                              'errors=%r: %s' % (text, cp, errors, err))
+                self.assertEqual(encoded[0], expected,
+                    '%a.encode("cp%s", %r)=%a != %a'
+                    % (text, cp, errors, encoded[0], expected))
+                self.assertEqual(encoded[1], len(text))
+            else:
+                self.assertRaises(UnicodeEncodeError,
+                    codecs.code_page_encode, cp, text, errors)
+
+    def test_cp932(self):
+        self.check_encode(932, (
+            ('abc', 'strict', b'abc'),
+            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
+            # not encodable
+            ('\xff', 'strict', None),
+            ('[\xff]', 'ignore', b'[]'),
+            ('[\xff]', 'replace', b'[y]'),
+            ('[\u20ac]', 'replace', b'[?]'),
+        ))
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
+            # invalid bytes
+            (b'\xff', 'strict', None),
+            (b'\xff', 'ignore', ''),
+            (b'\xff', 'replace', '\ufffd'),
+            (b'\x81\x00abc', 'strict', None),
+            (b'\x81\x00abc', 'ignore', '\x00abc'),
+        ]
+        if self.vista_or_later:
+            tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc'))
+        else:
+            tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc'))
+        self.check_decode(932, tests)
+
+    def test_cp1252(self):
+        self.check_encode(1252, (
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'\xe9\x80'),
+            ('\xff', 'strict', b'\xff'),
+            ('\u0141', 'strict', None),
+            ('\u0141', 'ignore', b''),
+            ('\u0141', 'replace', b'L'),
+        ))
+        self.check_decode(1252, (
+            (b'abc', 'strict', 'abc'),
+            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
+            (b'\xff', 'strict', '\xff'),
+        ))
+
+    def test_cp_utf7(self):
+        cp = 65000
+        self.check_encode(cp, (
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'+AOkgrA-'),
+            ('\U0010ffff', 'strict',  b'+2//f/w-'),
+            ('\udc80', 'strict', b'+3IA-'),
+            ('\ufffd', 'strict', b'+//0-'),
+        ))
+        self.check_decode(cp, (
+            (b'abc', 'strict', 'abc'),
+            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
+            (b'+2//f/w-', 'strict', '\U0010ffff'),
+            (b'+3IA-', 'strict', '\udc80'),
+            (b'+//0-', 'strict', '\ufffd'),
+            # invalid bytes
+            (b'[+/]', 'strict', '[]'),
+            (b'[\xff]', 'strict', '[\xff]'),
+        ))
+
+    def test_cp_utf8(self):
+        cp = self.CP_UTF8
+
+        tests = [
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
+            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+        ]
+        if self.vista_or_later:
+            tests.append(('\udc80', 'strict', None))
+            tests.append(('\udc80', 'ignore', b''))
+            tests.append(('\udc80', 'replace', b'?'))
+        else:
+            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
+        self.check_encode(cp, tests)
+
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
+            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
+            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
+            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
+            # invalid bytes
+            (b'[\xff]', 'strict', None),
+            (b'[\xff]', 'ignore', '[]'),
+            (b'[\xff]', 'replace', '[\ufffd]'),
+        ]
+        if self.vista_or_later:
+            tests.extend((
+                (b'[\xed\xb2\x80]', 'strict', None),
+                (b'[\xed\xb2\x80]', 'ignore', '[]'),
+                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
+            ))
+        else:
+            tests.extend((
+                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
+            ))
+        self.check_decode(cp, tests)
+
+    def test_error_handlers(self):
+        self.check_encode(932, (
+            ('\xff', 'backslashreplace', b'\\xff'),
+            ('\xff', 'xmlcharrefreplace', b'&#255;'),
+        ))
+        self.check_decode(932, (
+            (b'\xff', 'surrogateescape', '\udcff'),
+        ))
+        if self.vista_or_later:
+            self.check_encode(self.CP_UTF8, (
+                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
+            ))
+
+    def test_multibyte_encoding(self):
+        self.check_decode(932, (
+            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
+            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
+        ))
+        self.check_decode(self.CP_UTF8, (
+            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
+            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
+        ))
+        if self.vista_or_later:
+            self.check_encode(self.CP_UTF8, (
+                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
+                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
+            ))
+
+    def test_incremental(self):
+        decoded = codecs.code_page_decode(932,
+                                          b'\xe9\x80\xe9', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('\u9a3e', 2))
+
+        decoded = codecs.code_page_decode(932,
+                                          b'\xe9\x80\xe9\x80', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
+
+        decoded = codecs.code_page_decode(932,
+                                          b'abc', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('abc', 3))
+
+
 def test_main():
     support.run_unittest(
         UTF32Test,
@@ -1772,6 +1969,7 @@
         SurrogateEscapeTest,
         BomTest,
         TransformCodecTest,
+        CodePageTest,
     )
 
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@
 Core and Builtins
 -----------------
 
+- Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore
+  error handlers on all Windows versions. The MBCS codec is now supporting all
+  error handlers, instead of only replace to encode and ignore to decode.
+
 - Issue #13188: When called without an explicit traceback argument,
   generator.throw() now gets the traceback from the passed exception's
   ``__traceback__`` attribute.  Patch by Petri Lehtinen.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -612,6 +612,31 @@
     return codec_tuple(decoded, consumed);
 }
 
+static PyObject *
+code_page_decode(PyObject *self,
+                 PyObject *args)
+{
+    Py_buffer pbuf;
+    const char *errors = NULL;
+    int final = 0;
+    Py_ssize_t consumed;
+    PyObject *decoded = NULL;
+    int code_page;
+
+    if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode",
+                          &code_page, &pbuf, &errors, &final))
+        return NULL;
+    consumed = pbuf.len;
+
+    decoded = PyUnicode_DecodeCodePageStateful(code_page,
+                                               pbuf.buf, pbuf.len, errors,
+                                               final ? NULL : &consumed);
+    PyBuffer_Release(&pbuf);
+    if (decoded == NULL)
+        return NULL;
+    return codec_tuple(decoded, consumed);
+}
+
 #endif /* HAVE_MBCS */
 
 /* --- Encoder ------------------------------------------------------------ */
@@ -1011,6 +1036,29 @@
     return v;
 }
 
+static PyObject *
+code_page_encode(PyObject *self,
+                 PyObject *args)
+{
+    PyObject *str, *v;
+    const char *errors = NULL;
+    int code_page;
+
+    if (!PyArg_ParseTuple(args, "iO|z:code_page_encode",
+                          &code_page, &str, &errors))
+        return NULL;
+
+    str = PyUnicode_FromObject(str);
+    if (str == NULL)
+        return NULL;
+    v = codec_tuple(PyUnicode_EncodeCodePage(code_page,
+                                             str,
+                                             errors),
+                    PyUnicode_GET_LENGTH(str));
+    Py_DECREF(str);
+    return v;
+}
+
 #endif /* HAVE_MBCS */
 
 /* --- Error handler registry --------------------------------------------- */
@@ -1101,6 +1149,8 @@
 #ifdef HAVE_MBCS
     {"mbcs_encode",             mbcs_encode,                    METH_VARARGS},
     {"mbcs_decode",             mbcs_decode,                    METH_VARARGS},
+    {"code_page_encode",        code_page_encode,               METH_VARARGS},
+    {"code_page_decode",        code_page_decode,               METH_VARARGS},
 #endif
     {"register_error",          register_error,                 METH_VARARGS,
         register_error__doc__},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -429,6 +429,10 @@
 }
 #endif
 
+#ifdef HAVE_MBCS
+static OSVERSIONINFOEX winver;
+#endif
+
 /* --- Bloom Filters ----------------------------------------------------- */
 
 /* stuff to implement simple "bloom filters" for Unicode characters.
@@ -6896,130 +6900,307 @@
 #define NEED_RETRY
 #endif
 
-/* XXX This code is limited to "true" double-byte encodings, as
-   a) it assumes an incomplete character consists of a single byte, and
-   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
-   encodings, see IsDBCSLeadByteEx documentation. */
+#ifndef WC_ERR_INVALID_CHARS
+#  define WC_ERR_INVALID_CHARS 0x0080
+#endif
+
+static char*
+code_page_name(UINT code_page, PyObject **obj)
+{
+    *obj = NULL;
+    if (code_page == CP_ACP)
+        return "mbcs";
+    if (code_page == CP_UTF7)
+        return "CP_UTF7";
+    if (code_page == CP_UTF8)
+        return "CP_UTF8";
+
+    *obj = PyBytes_FromFormat("cp%u", code_page);
+    if (*obj == NULL)
+        return NULL;
+    return PyBytes_AS_STRING(*obj);
+}
 
 static int
-is_dbcs_lead_byte(const char *s, int offset)
+is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
 {
     const char *curr = s + offset;
-
-    if (IsDBCSLeadByte(*curr)) {
-        const char *prev = CharPrev(s, curr);
-        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
-    }
+    const char *prev;
+
+    if (!IsDBCSLeadByteEx(code_page, *curr))
+        return 0;
+
+    prev = CharPrevExA(code_page, s, curr, 0);
+    if (prev == curr)
+        return 1;
+    /* FIXME: This code is limited to "true" double-byte encodings,
+       as it assumes an incomplete character consists of a single
+       byte. */
+    if (curr - prev == 2)
+        return 1;
+    if (!IsDBCSLeadByteEx(code_page, *prev))
+        return 1;
     return 0;
 }
 
+static DWORD
+decode_code_page_flags(UINT code_page)
+{
+    if (code_page == CP_UTF7) {
+        /* The CP_UTF7 decoder only supports flags=0 */
+        return 0;
+    }
+    else
+        return MB_ERR_INVALID_CHARS;
+}
+
 /*
- * Decode MBCS string into unicode object. If 'final' is set, converts
- * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
+ * Decode a byte string from a Windows code page into unicode object in strict
+ * mode.
+ *
+ * Returns consumed size if succeed, returns -2 on decode error, or raise a
+ * WindowsError and returns -1 on other error.
  */
 static int
-decode_mbcs(PyUnicodeObject **v,
-            const char *s, /* MBCS string */
-            int size, /* sizeof MBCS string */
-            int final,
-            const char *errors)
-{
-    Py_UNICODE *p;
-    Py_ssize_t n;
-    DWORD usize;
-    DWORD flags;
-
-    assert(size >= 0);
-
-    /* check and handle 'errors' arg */
-    if (errors==NULL || strcmp(errors, "strict")==0)
-        flags = MB_ERR_INVALID_CHARS;
-    else if (strcmp(errors, "ignore")==0)
-        flags = 0;
-    else {
-        PyErr_Format(PyExc_ValueError,
-                     "mbcs encoding does not support errors='%s'",
-                     errors);
-        return -1;
-    }
-
-    /* Skip trailing lead-byte unless 'final' is set */
-    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
-        --size;
+decode_code_page_strict(UINT code_page,
+                        PyUnicodeObject **v,
+                        const char *in,
+                        int insize)
+{
+    const DWORD flags = decode_code_page_flags(code_page);
+    Py_UNICODE *out;
+    DWORD outsize;
 
     /* First get the size of the result */
-    if (size > 0) {
-        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
-        if (usize==0)
-            goto mbcs_decode_error;
-    } else
-        usize = 0;
+    assert(insize > 0);
+    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
+    if (outsize <= 0)
+        goto error;
 
     if (*v == NULL) {
         /* Create unicode object */
-        *v = _PyUnicode_New(usize);
+        *v = _PyUnicode_New(outsize);
         if (*v == NULL)
             return -1;
-        n = 0;
+        out = PyUnicode_AS_UNICODE(*v);
     }
     else {
         /* Extend unicode object */
-        n = PyUnicode_GET_SIZE(*v);
-        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
+        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+        if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
             return -1;
+        out = PyUnicode_AS_UNICODE(*v) + n;
     }
 
     /* Do the conversion */
-    if (usize > 0) {
-        p = PyUnicode_AS_UNICODE(*v) + n;
-        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
-            goto mbcs_decode_error;
-        }
-    }
-    return size;
-
-mbcs_decode_error:
-    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
-       we raise a UnicodeDecodeError - else it is a 'generic'
-       windows error
-     */
-    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
-        /* Ideally, we should get reason from FormatMessage - this
-           is the Windows 2000 English version of the message
-        */
-        PyObject *exc = NULL;
-        const char *reason = "No mapping for the Unicode character exists "
-                             "in the target multi-byte code page.";
-        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
+    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
+    if (outsize <= 0)
+        goto error;
+    return insize;
+
+error:
+    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+        return -2;
+    PyErr_SetFromWindowsErr(0);
+    return -1;
+}
+
+/*
+ * Decode a byte string from a code page into unicode object with an error
+ * handler.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page_errors(UINT code_page,
+                        PyUnicodeObject **v,
+                        const char *in,
+                        int size,
+                        const char *errors)
+{
+    const char *startin = in;
+    const char *endin = in + size;
+    const DWORD flags = decode_code_page_flags(code_page);
+    /* Ideally, we should get reason from FormatMessage. This is the Windows
+       2000 English version of the message. */
+    const char *reason = "No mapping for the Unicode character exists "
+                         "in the target code page.";
+    /* each step cannot decode more than 1 character, but a character can be
+       represented as a surrogate pair */
+    wchar_t buffer[2], *startout, *out;
+    int insize, outsize;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    PyObject *encoding_obj = NULL;
+    char *encoding;
+    DWORD err;
+    int ret = -1;
+
+    assert(size > 0);
+
+    encoding = code_page_name(code_page, &encoding_obj);
+    if (encoding == NULL)
+        return -1;
+
+    if (errors == NULL || strcmp(errors, "strict") == 0) {
+        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
+           UnicodeDecodeError. */
+        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
         if (exc != NULL) {
             PyCodec_StrictErrors(exc);
-            Py_DECREF(exc);
-        }
-    } else {
-        PyErr_SetFromWindowsErrWithFilename(0, NULL);
-    }
-    return -1;
-}
-
-PyObject *
-PyUnicode_DecodeMBCSStateful(const char *s,
-                             Py_ssize_t size,
-                             const char *errors,
-                             Py_ssize_t *consumed)
+            Py_CLEAR(exc);
+        }
+        goto error;
+    }
+
+    if (*v == NULL) {
+        /* Create unicode object */
+        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
+        if (*v == NULL)
+            goto error;
+        startout = PyUnicode_AS_UNICODE(*v);
+    }
+    else {
+        /* Extend unicode object */
+        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
+            goto error;
+        startout = PyUnicode_AS_UNICODE(*v) + n;
+    }
+
+    /* Decode the byte string character per character */
+    out = startout;
+    while (in < endin)
+    {
+        /* Decode a character */
+        insize = 1;
+        do
+        {
+            outsize = MultiByteToWideChar(code_page, flags,
+                                          in, insize,
+                                          buffer, Py_ARRAY_LENGTH(buffer));
+            if (outsize > 0)
+                break;
+            err = GetLastError();
+            if (err != ERROR_NO_UNICODE_TRANSLATION
+                && err != ERROR_INSUFFICIENT_BUFFER)
+            {
+                PyErr_SetFromWindowsErr(0);
+                goto error;
+            }
+            insize++;
+        }
+        /* 4=maximum length of a UTF-8 sequence */
+        while (insize <= 4 && (in + insize) <= endin);
+
+        if (outsize <= 0) {
+            Py_ssize_t startinpos, endinpos, outpos;
+
+            startinpos = in - startin;
+            endinpos = startinpos + 1;
+            outpos = out - PyUnicode_AS_UNICODE(*v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    encoding, reason,
+                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
+                    v, &outpos, &out))
+            {
+                goto error;
+            }
+        }
+        else {
+            in += insize;
+            memcpy(out, buffer, outsize * sizeof(wchar_t));
+            out += outsize;
+        }
+    }
+
+    /* write a NUL character at the end */
+    *out = 0;
+
+    /* Extend unicode object */
+    outsize = out - startout;
+    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
+    if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
+        goto error;
+    ret = 0;
+
+error:
+    Py_XDECREF(encoding_obj);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return ret;
+}
+
+/*
+ * Decode a byte string from a Windows code page into unicode object. If
+ * 'final' is set, converts trailing lead-byte too.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page(UINT code_page,
+                 PyUnicodeObject **v,
+                 const char *s,  int size,
+                 int final, const char *errors)
+{
+    int done;
+
+    /* Skip trailing lead-byte unless 'final' is set */
+    if (size == 0) {
+        if (*v == NULL) {
+            Py_INCREF(unicode_empty);
+            *v = (PyUnicodeObject*)unicode_empty;
+            if (*v == NULL)
+                return -1;
+        }
+        return 0;
+    }
+
+    if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
+        --size;
+
+    done = decode_code_page_strict(code_page, v, s, size);
+    if (done == -2)
+        done = decode_code_page_errors(code_page, v, s, size, errors);
+    return done;
+}
+
+static PyObject *
+decode_code_page_stateful(int code_page,
+                          const char *s,
+                          Py_ssize_t size,
+                          const char *errors,
+                          Py_ssize_t *consumed)
 {
     PyUnicodeObject *v = NULL;
     int done;
 
+    if (code_page < 0) {
+        PyErr_SetString(PyExc_ValueError, "invalid code page number");
+        return NULL;
+    }
+
     if (consumed)
         *consumed = 0;
 
 #ifdef NEED_RETRY
   retry:
     if (size > INT_MAX)
-        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
+        done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
     else
 #endif
-        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
+        done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
 
     if (done < 0) {
         Py_XDECREF(v);
@@ -7036,6 +7217,7 @@
         goto retry;
     }
 #endif
+
 #ifndef DONT_MAKE_RESULT_READY
     if (_PyUnicode_READY_REPLACE(&v)) {
         Py_DECREF(v);
@@ -7047,6 +7229,25 @@
 }
 
 PyObject *
+PyUnicode_DecodeCodePageStateful(int code_page,
+                                 const char *s,
+                                 Py_ssize_t size,
+                                 const char *errors,
+                                 Py_ssize_t *consumed)
+{
+    return decode_code_page_stateful(code_page, s, size, errors, consumed);
+}
+
+PyObject *
+PyUnicode_DecodeMBCSStateful(const char *s,
+                             Py_ssize_t size,
+                             const char *errors,
+                             Py_ssize_t *consumed)
+{
+    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
+}
+
+PyObject *
 PyUnicode_DecodeMBCS(const char *s,
                      Py_ssize_t size,
                      const char *errors)
@@ -7054,105 +7255,342 @@
     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
 }
 
+static DWORD
+encode_code_page_flags(UINT code_page, const char *errors)
+{
+    if (code_page == CP_UTF8) {
+        if (winver.dwMajorVersion >= 6)
+            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
+               and later */
+            return WC_ERR_INVALID_CHARS;
+        else
+            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
+            return 0;
+    }
+    else if (code_page == CP_UTF7) {
+        /* CP_UTF7 only supports flags=0 */
+        return 0;
+    }
+    else {
+        if (errors != NULL && strcmp(errors, "replace") == 0)
+            return 0;
+        else
+            return WC_NO_BEST_FIT_CHARS;
+    }
+}
+
 /*
- * Convert unicode into string object (MBCS).
- * Returns 0 if succeed, -1 otherwise.
+ * Encode a Unicode string to a Windows code page into a byte string in strict
+ * mode.
+ *
+ * Returns consumed characters if succeed, returns -2 on encode error, or raise
+ * a WindowsError and returns -1 on other error.
  */
 static int
-encode_mbcs(PyObject **repr,
-            const Py_UNICODE *p, /* unicode */
-            int size, /* size of unicode */
-            const char* errors)
+encode_code_page_strict(UINT code_page, PyObject **outbytes,
+                        const Py_UNICODE *p, const int size,
+                        const char* errors)
 {
     BOOL usedDefaultChar = FALSE;
-    BOOL *pusedDefaultChar;
-    int mbcssize;
-    Py_ssize_t n;
+    BOOL *pusedDefaultChar = &usedDefaultChar;
+    int outsize;
     PyObject *exc = NULL;
-    DWORD flags;
-
-    assert(size >= 0);
-
-    /* check and handle 'errors' arg */
-    if (errors==NULL || strcmp(errors, "strict")==0) {
-        flags = WC_NO_BEST_FIT_CHARS;
+    const DWORD flags = encode_code_page_flags(code_page, NULL);
+    char *out;
+
+    assert(size > 0);
+
+    if (code_page != CP_UTF8 && code_page != CP_UTF7)
         pusedDefaultChar = &usedDefaultChar;
-    } else if (strcmp(errors, "replace")==0) {
-        flags = 0;
+    else
         pusedDefaultChar = NULL;
-    } else {
-         PyErr_Format(PyExc_ValueError,
-                      "mbcs encoding does not support errors='%s'",
-                      errors);
-         return -1;
-    }
 
     /* First get the size of the result */
-    if (size > 0) {
-        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
-                                       NULL, pusedDefaultChar);
-        if (mbcssize == 0) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    outsize = WideCharToMultiByte(code_page, flags,
+                                  p, size,
+                                  NULL, 0,
+                                  NULL, pusedDefaultChar);
+    if (outsize <= 0)
+        goto error;
+    /* If we used a default char, then we failed! */
+    if (pusedDefaultChar && *pusedDefaultChar)
+        return -2;
+
+    if (*outbytes == NULL) {
+        /* Create string object */
+        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+        if (*outbytes == NULL)
             return -1;
-        }
-        /* If we used a default char, then we failed! */
-        if (pusedDefaultChar && *pusedDefaultChar)
-            goto mbcs_encode_error;
-    } else {
-        mbcssize = 0;
-    }
-
-    if (*repr == NULL) {
-        /* Create string object */
-        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
-        if (*repr == NULL)
-            return -1;
-        n = 0;
+        out = PyBytes_AS_STRING(*outbytes);
     }
     else {
         /* Extend string object */
-        n = PyBytes_Size(*repr);
-        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
+        const Py_ssize_t n = PyBytes_Size(*outbytes);
+        if (outsize > PY_SSIZE_T_MAX - n) {
+            PyErr_NoMemory();
             return -1;
+        }
+        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+            return -1;
+        out = PyBytes_AS_STRING(*outbytes) + n;
     }
 
     /* Do the conversion */
-    if (size > 0) {
-        char *s = PyBytes_AS_STRING(*repr) + n;
-        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
-                                     NULL, pusedDefaultChar)) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
-            return -1;
-        }
-        if (pusedDefaultChar && *pusedDefaultChar)
-            goto mbcs_encode_error;
-    }
+    outsize = WideCharToMultiByte(code_page, flags,
+                                  p, size,
+                                  out, outsize,
+                                  NULL, pusedDefaultChar);
+    if (outsize <= 0)
+        goto error;
+    if (pusedDefaultChar && *pusedDefaultChar)
+        return -2;
     return 0;
 
-mbcs_encode_error:
-    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
+error:
+    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+        return -2;
+    PyErr_SetFromWindowsErr(0);
+    return -1;
+}
+
+/*
+ * Encode a Unicode string to a Windows code page into a byte string using a
+ * error handler.
+ *
+ * Returns consumed characters if succeed, or raise a WindowsError and returns
+ * -1 on other error.
+ */
+static int
+encode_code_page_errors(UINT code_page, PyObject **outbytes,
+                        const Py_UNICODE *in, const int insize,
+                        const char* errors)
+{
+    const DWORD flags = encode_code_page_flags(code_page, errors);
+    const Py_UNICODE *startin = in;
+    const Py_UNICODE *endin = in + insize;
+    /* Ideally, we should get reason from FormatMessage. This is the Windows
+       2000 English version of the message. */
+    const char *reason = "invalid character";
+    /* 4=maximum length of a UTF-8 sequence */
+    char buffer[4];
+    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
+    Py_ssize_t outsize;
+    char *out;
+    int charsize;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    PyObject *encoding_obj = NULL;
+    char *encoding;
+    int err;
+    Py_ssize_t startpos, newpos, newoutsize;
+    PyObject *rep;
+    int ret = -1;
+
+    assert(insize > 0);
+
+    encoding = code_page_name(code_page, &encoding_obj);
+    if (encoding == NULL)
+        return -1;
+
+    if (errors == NULL || strcmp(errors, "strict") == 0) {
+        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
+           then we raise a UnicodeEncodeError. */
+        make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_DECREF(exc);
+        }
+        Py_XDECREF(encoding_obj);
+        return -1;
+    }
+
+    if (code_page != CP_UTF8 && code_page != CP_UTF7)
+        pusedDefaultChar = &usedDefaultChar;
+    else
+        pusedDefaultChar = NULL;
+
+    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
+        PyErr_NoMemory();
+        goto error;
+    }
+    outsize = insize * Py_ARRAY_LENGTH(buffer);
+
+    if (*outbytes == NULL) {
+        /* Create string object */
+        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+        if (*outbytes == NULL)
+            goto error;
+        out = PyBytes_AS_STRING(*outbytes);
+    }
+    else {
+        /* Extend string object */
+        Py_ssize_t n = PyBytes_Size(*outbytes);
+        if (n > PY_SSIZE_T_MAX - outsize) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+            goto error;
+        out = PyBytes_AS_STRING(*outbytes) + n;
+    }
+
+    /* Encode the string character per character */
+    while (in < endin)
+    {
+        if ((in + 2) <= endin
+            && 0xD800 <= in[0] && in[0] <= 0xDBFF
+            && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
+            charsize = 2;
+        else
+            charsize = 1;
+
+        outsize = WideCharToMultiByte(code_page, flags,
+                                      in, charsize,
+                                      buffer, Py_ARRAY_LENGTH(buffer),
+                                      NULL, pusedDefaultChar);
+        if (outsize > 0) {
+            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
+            {
+                in += charsize;
+                memcpy(out, buffer, outsize);
+                out += outsize;
+                continue;
+            }
+        }
+        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+            PyErr_SetFromWindowsErr(0);
+            goto error;
+        }
+
+        charsize = Py_MAX(charsize - 1, 1);
+        startpos = in - startin;
+        rep = unicode_encode_call_errorhandler(
+                  errors, &errorHandler, encoding, reason,
+                  startin, insize, &exc,
+                  startpos, startpos + charsize, &newpos);
+        if (rep == NULL)
+            goto error;
+        in = startin + newpos;
+
+        if (PyBytes_Check(rep)) {
+            outsize = PyBytes_GET_SIZE(rep);
+            if (outsize != 1) {
+                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                out = PyBytes_AS_STRING(*outbytes) + offset;
+            }
+            memcpy(out, PyBytes_AS_STRING(rep), outsize);
+            out += outsize;
+        }
+        else {
+            Py_ssize_t i;
+            enum PyUnicode_Kind kind;
+            void *data;
+
+            if (PyUnicode_READY(rep) < 0) {
+                Py_DECREF(rep);
+                goto error;
+            }
+
+            outsize = PyUnicode_GET_LENGTH(rep);
+            if (outsize != 1) {
+                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                out = PyBytes_AS_STRING(*outbytes) + offset;
+            }
+            kind = PyUnicode_KIND(rep);
+            data = PyUnicode_DATA(rep);
+            for (i=0; i < outsize; i++) {
+                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+                if (ch > 127) {
+                    raise_encode_exception(&exc,
+                        encoding,
+                        startin, insize,
+                        startpos, startpos + charsize,
+                        "unable to encode error handler result to ASCII");
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                *out = (unsigned char)ch;
+                out++;
+            }
+        }
+        Py_DECREF(rep);
+    }
+    /* write a NUL byte */
+    *out = 0;
+    outsize = out - PyBytes_AS_STRING(*outbytes);
+    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
+    if (_PyBytes_Resize(outbytes, outsize) < 0)
+        goto error;
+    ret = 0;
+
+error:
+    Py_XDECREF(encoding_obj);
+    Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
-    return -1;
-}
-
-PyObject *
-PyUnicode_EncodeMBCS(const Py_UNICODE *p,
-                     Py_ssize_t size,
-                     const char *errors)
-{
-    PyObject *repr = NULL;
+    return ret;
+}
+
+/*
+ * Encode a Unicode string to a Windows code page into a byte string.
+ *
+ * Returns consumed characters if succeed, or raise a WindowsError and returns
+ * -1 on other error.
+ */
+static int
+encode_code_page_chunk(UINT code_page, PyObject **outbytes,
+                       const Py_UNICODE *p, int size,
+                       const char* errors)
+{
+    int done;
+
+    if (size == 0) {
+        if (*outbytes == NULL) {
+            *outbytes = PyBytes_FromStringAndSize(NULL, 0);
+            if (*outbytes == NULL)
+                return -1;
+        }
+        return 0;
+    }
+
+    done = encode_code_page_strict(code_page, outbytes, p, size, errors);
+    if (done == -2)
+        done = encode_code_page_errors(code_page, outbytes, p, size, errors);
+    return done;
+}
+
+static PyObject *
+encode_code_page(int code_page,
+                 const Py_UNICODE *p, Py_ssize_t size,
+                 const char *errors)
+{
+    PyObject *outbytes = NULL;
     int ret;
 
+    if (code_page < 0) {
+        PyErr_SetString(PyExc_ValueError, "invalid code page number");
+        return NULL;
+    }
+
 #ifdef NEED_RETRY
   retry:
     if (size > INT_MAX)
-        ret = encode_mbcs(&repr, p, INT_MAX, errors);
+        ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
     else
 #endif
-        ret = encode_mbcs(&repr, p, (int)size, errors);
+        ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
 
     if (ret < 0) {
-        Py_XDECREF(repr);
+        Py_XDECREF(outbytes);
         return NULL;
     }
 
@@ -7164,7 +7602,28 @@
     }
 #endif
 
-    return repr;
+    return outbytes;
+}
+
+PyObject *
+PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+                     Py_ssize_t size,
+                     const char *errors)
+{
+    return encode_code_page(CP_ACP, p, size, errors);
+}
+
+PyObject *
+PyUnicode_EncodeCodePage(int code_page,
+                         PyObject *unicode,
+                         const char *errors)
+{
+    const Py_UNICODE *p;
+    Py_ssize_t size;
+    p = PyUnicode_AsUnicodeAndSize(unicode, &size);
+    if (p == NULL)
+        return NULL;
+    return encode_code_page(code_page, p, size, errors);
 }
 
 PyObject *
@@ -13434,7 +13893,7 @@
 
 /* Initialize the Unicode implementation */
 
-void _PyUnicode_Init(void)
+int _PyUnicode_Init(void)
 {
     int i;
 
@@ -13467,6 +13926,15 @@
         Py_ARRAY_LENGTH(linebreak));
 
     PyType_Ready(&EncodingMapType);
+
+#ifdef HAVE_MBCS
+    winver.dwOSVersionInfoSize = sizeof(winver);
+    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
+        PyErr_SetFromWindowsErr(0);
+        return -1;
+    }
+#endif
+    return 0;
 }
 
 /* Finalize the Unicode implementation */
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -67,7 +67,7 @@
 static void call_py_exitfuncs(void);
 static void wait_for_thread_shutdown(void);
 static void call_ll_exitfuncs(void);
-extern void _PyUnicode_Init(void);
+extern int _PyUnicode_Init(void);
 extern void _PyUnicode_Fini(void);
 extern int _PyLong_Init(void);
 extern void PyLong_Fini(void);
@@ -261,7 +261,8 @@
         Py_FatalError("Py_Initialize: can't make modules_reloading dictionary");
 
     /* Init Unicode implementation; relies on the codec registry */
-    _PyUnicode_Init();
+    if (_PyUnicode_Init() < 0)
+        Py_FatalError("Py_Initialize: can't initialize unicode");
 
     bimod = _PyBuiltin_Init();
     if (bimod == NULL)

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list