[Python-checkins] cpython: str.replace() avoids memory when it's possible

Thu Oct 6 12:37:17 CEST 2011

http://hg.python.org/cpython/rev/79c68caacb73
changeset:   72720:79c68caacb73
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Thu Oct 06 12:31:55 2011 +0200
summary:
  str.replace() avoids memory when it's possible

files:
  Objects/unicodeobject.c |  102 +++++++++++++++++++++++----
  1 files changed, 84 insertions(+), 18 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1741,6 +1741,63 @@
     }
 }
 
+/* Ensure that a string uses the most efficient storage, if it is not the
+   case: create a new string with of the right kind. Write NULL into *p_unicode
+   on error. */
+void
+unicode_adjust_maxchar(PyObject **p_unicode)
+{
+    PyObject *unicode, *copy;
+    Py_UCS4 max_char;
+    Py_ssize_t i, len;
+    unsigned int kind;
+
+    assert(p_unicode != NULL);
+    unicode = *p_unicode;
+    assert(PyUnicode_IS_READY(unicode));
+    if (PyUnicode_IS_ASCII(unicode))
+        return;
+
+    len = PyUnicode_GET_LENGTH(unicode);
+    kind = PyUnicode_KIND(unicode);
+    if (kind == PyUnicode_1BYTE_KIND) {
+        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
+        for (i = 0; i < len; i++) {
+            if (u[i] & 0x80)
+                return;
+        }
+        max_char = 127;
+    }
+    else if (kind == PyUnicode_2BYTE_KIND) {
+        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
+        max_char = 0;
+        for (i = 0; i < len; i++) {
+            if (u[i] > max_char) {
+                max_char = u[i];
+                if (max_char >= 256)
+                    return;
+            }
+        }
+    }
+    else {
+        assert(kind == PyUnicode_4BYTE_KIND);
+        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
+        max_char = 0;
+        for (i = 0; i < len; i++) {
+            if (u[i] > max_char) {
+                max_char = u[i];
+                if (max_char >= 0x10000)
+                    return;
+            }
+        }
+    }
+    assert(max_char > PyUnicode_MAX_CHAR_VALUE(unicode));
+    copy = PyUnicode_New(len, max_char);
+    copy_characters(copy, 0, unicode, 0, len);
+    Py_DECREF(unicode);
+    *p_unicode = copy;
+}
+
 PyObject*
 PyUnicode_Copy(PyObject *unicode)
 {
@@ -9573,14 +9630,16 @@
                     PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
                 }
             if (mayshrink) {
-                PyObject *tmp = u;
-                u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
-                                              PyUnicode_GET_LENGTH(tmp));
-                Py_DECREF(tmp);
+                unicode_adjust_maxchar(&u);
+                if (u == NULL)
+                    goto error;
             }
         } else {
             int rkind = skind;
             char *res;
+            PyObject *rstr;
+            Py_UCS4 maxchar;
+
             if (kind1 < rkind) {
                 /* widen substring */
                 buf1 = _PyUnicode_AsKind(str1, rkind);
@@ -9607,11 +9666,13 @@
                 if (!buf1) goto error;
                 release1 = 1;
             }
-            res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
-            if (!res) {
-                PyErr_NoMemory();
+            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
+            maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
+            rstr = PyUnicode_New(slen, maxchar);
+            if (!rstr)
                 goto error;
-            }
+            res = PyUnicode_DATA(rstr);
+
             memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
             /* change everything in-place, starting with this one */
             memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
@@ -9631,16 +9692,19 @@
                 i += len1;
             }
 
-            u = PyUnicode_FromKindAndData(rkind, res, slen);
-            PyMem_Free(res);
-            if (!u) goto error;
+            u = rstr;
+            unicode_adjust_maxchar(&u);
+            if (!u)
+                goto error;
         }
     } else {
 
         Py_ssize_t n, i, j, ires;
         Py_ssize_t product, new_size;
         int rkind = skind;
+        PyObject *rstr;
         char *res;
+        Py_UCS4 maxchar;
 
         if (kind1 < rkind) {
             buf1 = _PyUnicode_AsKind(str1, rkind);
@@ -9679,9 +9743,12 @@
                             "replace string is too long");
             goto error;
         }
-        res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
-        if (!res)
+        maxchar = PyUnicode_MAX_CHAR_VALUE(self);
+        maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
+        rstr = PyUnicode_New(new_size, maxchar);
+        if (!rstr)
             goto error;
+        res = PyUnicode_DATA(rstr);
         ires = i = 0;
         if (len1 > 0) {
             while (n-- > 0) {
@@ -9731,11 +9798,10 @@
                    sbuf + PyUnicode_KIND_SIZE(rkind, i),
                    PyUnicode_KIND_SIZE(rkind, slen-i));
         }
-        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(str2))
-            u = unicode_fromascii((unsigned char*)res, new_size);
-        else
-            u = PyUnicode_FromKindAndData(rkind, res, new_size);
-        PyMem_Free(res);
+        u = rstr;
+        unicode_adjust_maxchar(&u);
+        if (u == NULL)
+            goto error;
     }
     if (srelease)
         PyMem_FREE(sbuf);

-- 
Repository URL: http://hg.python.org/cpython