[Python-checkins] cpython: Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error

victor.stinner python-checkins at python.org
Mon Oct 5 07:44:01 EDT 2015


https://hg.python.org/cpython/rev/3152e4038d97
changeset:   98537:3152e4038d97
parent:      98535:f51921883f50
user:        Victor Stinner <victor.stinner at gmail.com>
date:        Mon Oct 05 13:43:50 2015 +0200
summary:
  Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
handlers: ``ignore``, ``replace`` and ``surrogateescape``.

files:
  Doc/whatsnew/3.6.rst    |   3 +
  Lib/test/test_codecs.py |  12 +++++++
  Misc/NEWS               |   3 +
  Objects/unicodeobject.c |  48 +++++++++++++++++++++++-----
  4 files changed, 57 insertions(+), 9 deletions(-)


diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -123,6 +123,9 @@
 * The UTF-8 encoder is now up to 75 times as fast for error handlers:
   ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
 
+* The UTF-8 decoder is now up to 15 times as fast for error handlers:
+  ``ignore``, ``replace`` and ``surrogateescape``.
+
 
 Build and C API Changes
 =======================
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -788,6 +788,18 @@
         self.check_state_handling_decode(self.encoding,
                                          u, u.encode(self.encoding))
 
+    def test_decode_error(self):
+        for data, error_handler, expected in (
+            (b'[\x80\xff]', 'ignore', '[]'),
+            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+        ):
+            with self.subTest(data=data, error_handler=error_handler,
+                              expected=expected):
+                self.assertEqual(data.decode(self.encoding, error_handler),
+                                 expected)
+
     def test_lone_surrogates(self):
         super().test_lone_surrogates()
         # not sure if this is making sense for
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@
 Core and Builtins
 -----------------
 
+* Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
+  handlers: ``ignore``, ``replace`` and ``surrogateescape``.
+
 - Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
 
 - Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4714,8 +4714,9 @@
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     const char *errmsg = "";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0) {
         if (consumed)
@@ -4740,6 +4741,7 @@
     while (s < end) {
         Py_UCS4 ch;
         int kind = writer.kind;
+
         if (kind == PyUnicode_1BYTE_KIND) {
             if (PyUnicode_IS_ASCII(writer.buffer))
                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4778,24 +4780,52 @@
             continue;
         }
 
-        if (unicode_decode_call_errorhandler_writer(
-                errors, &errorHandler,
-                "utf-8", errmsg,
-                &starts, &end, &startinpos, &endinpos, &exc, &s,
-                &writer))
-            goto onError;
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler) {
+        case _Py_ERROR_IGNORE:
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_REPLACE:
+            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+                goto onError;
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_SURROGATEESCAPE:
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            for (Py_ssize_t i=startinpos; i<endinpos; i++) {
+                ch = (Py_UCS4)(unsigned char)(starts[i]);
+                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+                                ch + 0xdc00);
+                writer.pos++;
+            }
+            s += (endinpos - startinpos);
+            break;
+
+        default:
+            if (unicode_decode_call_errorhandler_writer(
+                    errors, &error_handler_obj,
+                    "utf-8", errmsg,
+                    &starts, &end, &startinpos, &endinpos, &exc, &s,
+                    &writer))
+                goto onError;
+        }
     }
 
 End:
     if (consumed)
         *consumed = s - starts;
 
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
 onError:
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     _PyUnicodeWriter_Dealloc(&writer);
     return NULL;

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list