[Python-checkins] cpython: Issue #15596: Faster pickling of unicode strings.

antoine.pitrou python-checkins at python.org
Sun Apr 7 17:41:03 CEST 2013


http://hg.python.org/cpython/rev/09a84091ae96
changeset:   83184:09a84091ae96
user:        Antoine Pitrou <solipsis at pitrou.net>
date:        Sun Apr 07 17:38:11 2013 +0200
summary:
  Issue #15596: Faster pickling of unicode strings.

files:
  Misc/NEWS         |    2 +
  Modules/_pickle.c |  116 +++++++++++++++++++++------------
  2 files changed, 77 insertions(+), 41 deletions(-)


diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -307,6 +307,8 @@
 Library
 -------
 
+- Issue #15596: Faster pickling of unicode strings.
+
 - Issue #17572: Avoid chained exceptions while passing bad directives to
   time.strptime().  Initial patch by Claudiu Popa.
 
diff --git a/Modules/_pickle.c b/Modules/_pickle.c
--- a/Modules/_pickle.c
+++ b/Modules/_pickle.c
@@ -1873,63 +1873,97 @@
 }
 
 static int
+write_utf8(PicklerObject *self, char *data, Py_ssize_t size)
+{
+    char pdata[5];
+
+#if SIZEOF_SIZE_T > 4
+    if (size > 0xffffffffUL) {
+        /* string too large */
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot serialize a string larger than 4GB");
+        return -1;
+    }
+#endif
+
+    pdata[0] = BINUNICODE;
+    pdata[1] = (unsigned char)(size & 0xff);
+    pdata[2] = (unsigned char)((size >> 8) & 0xff);
+    pdata[3] = (unsigned char)((size >> 16) & 0xff);
+    pdata[4] = (unsigned char)((size >> 24) & 0xff);
+
+    if (_Pickler_Write(self, pdata, sizeof(pdata)) < 0)
+        return -1;
+
+    if (_Pickler_Write(self, data, size) < 0)
+        return -1;
+
+    return 0;
+}
+
+static int
+write_unicode_binary(PicklerObject *self, PyObject *obj)
+{
+    PyObject *encoded = NULL;
+    Py_ssize_t size;
+    char *data;
+    int r;
+
+    if (PyUnicode_READY(obj))
+        return -1;
+
+    data = PyUnicode_AsUTF8AndSize(obj, &size);
+    if (data != NULL)
+        return write_utf8(self, data, size);
+
+    /* Issue #8383: for strings with lone surrogates, fall back on the
+       "surrogatepass" error handler. */
+    PyErr_Clear();
+    encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass");
+    if (encoded == NULL)
+        return -1;
+
+    r = write_utf8(self, PyBytes_AS_STRING(encoded),
+                   PyBytes_GET_SIZE(encoded));
+    Py_DECREF(encoded);
+    return r;
+}
+
+static int
 save_unicode(PicklerObject *self, PyObject *obj)
 {
-    Py_ssize_t size;
-    PyObject *encoded = NULL;
-
     if (self->bin) {
-        char pdata[5];
-
-        encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass");
-        if (encoded == NULL)
-            goto error;
-
-        size = PyBytes_GET_SIZE(encoded);
-        if (size > 0xffffffffL) {
-            PyErr_SetString(PyExc_OverflowError,
-                            "cannot serialize a string larger than 4 GiB");
-            goto error;          /* string too large */
-        }
-
-        pdata[0] = BINUNICODE;
-        pdata[1] = (unsigned char)(size & 0xff);
-        pdata[2] = (unsigned char)((size >> 8) & 0xff);
-        pdata[3] = (unsigned char)((size >> 16) & 0xff);
-        pdata[4] = (unsigned char)((size >> 24) & 0xff);
-
-        if (_Pickler_Write(self, pdata, 5) < 0)
-            goto error;
-
-        if (_Pickler_Write(self, PyBytes_AS_STRING(encoded), size) < 0)
-            goto error;
+        if (write_unicode_binary(self, obj) < 0)
+            return -1;
     }
     else {
+        PyObject *encoded;
+        Py_ssize_t size;
         const char unicode_op = UNICODE;
 
         encoded = raw_unicode_escape(obj);
         if (encoded == NULL)
-            goto error;
-
-        if (_Pickler_Write(self, &unicode_op, 1) < 0)
-            goto error;
+            return -1;
+
+        if (_Pickler_Write(self, &unicode_op, 1) < 0) {
+            Py_DECREF(encoded);
+            return -1;
+        }
 
         size = PyBytes_GET_SIZE(encoded);
-        if (_Pickler_Write(self, PyBytes_AS_STRING(encoded), size) < 0)
-            goto error;
+        if (_Pickler_Write(self, PyBytes_AS_STRING(encoded), size) < 0) {
+            Py_DECREF(encoded);
+            return -1;
+        }
+        Py_DECREF(encoded);
 
         if (_Pickler_Write(self, "\n", 1) < 0)
-            goto error;
+            return -1;
     }
     if (memo_put(self, obj) < 0)
-        goto error;
-
-    Py_DECREF(encoded);
+        return -1;
+
     return 0;
-
-  error:
-    Py_XDECREF(encoded);
-    return -1;
 }
 
 /* A helper for save_tuple.  Push the len elements in tuple t on the stack. */

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list