[Python-checkins] cpython (merge 3.2 -> default): Issue #12911: Fix memory consumption when calculating the repr() of huge tuples

antoine.pitrou python-checkins at python.org
Thu Oct 6 19:13:46 CEST 2011


http://hg.python.org/cpython/rev/656c13024ede
changeset:   72756:656c13024ede
parent:      72753:9a91ab415109
parent:      72755:f9f782f2369e
user:        Antoine Pitrou <solipsis at pitrou.net>
date:        Thu Oct 06 19:04:12 2011 +0200
summary:
  Issue #12911: Fix memory consumption when calculating the repr() of huge tuples or lists.

This introduces a small private API for this common pattern.
The issue has been discovered thanks to Martin's huge-mem buildbot.

files:
  Include/Python.h           |    2 +-
  Include/accu.h             |   35 +++++++
  Lib/test/test_list.py      |   11 ++
  Lib/test/test_tuple.py     |   10 ++
  Makefile.pre.in            |    2 +
  Misc/NEWS                  |    3 +
  Objects/accu.c             |  114 +++++++++++++++++++++++++
  Objects/listobject.c       |   81 +++++++----------
  Objects/tupleobject.c      |   73 +++++++--------
  PC/VC6/pythoncore.dsp      |    4 +
  PC/VS7.1/pythoncore.vcproj |    3 +
  PC/VS8.0/pythoncore.vcproj |    8 +
  PCbuild/pythoncore.vcproj  |    8 +
  13 files changed, 269 insertions(+), 85 deletions(-)


diff --git a/Include/Python.h b/Include/Python.h
--- a/Include/Python.h
+++ b/Include/Python.h
@@ -101,7 +101,7 @@
 #include "warnings.h"
 #include "weakrefobject.h"
 #include "structseq.h"
-
+#include "accu.h"
 
 #include "codecs.h"
 #include "pyerrors.h"
diff --git a/Include/accu.h b/Include/accu.h
new file mode 100644
--- /dev/null
+++ b/Include/accu.h
@@ -0,0 +1,35 @@
+#ifndef Py_LIMITED_API
+#ifndef Py_ACCU_H
+#define Py_ACCU_H
+
+/*** This is a private API for use by the interpreter and the stdlib.
+ *** Its definition may be changed or removed at any moment.
+ ***/
+
+/*
+ * A two-level accumulator of unicode objects that avoids both the overhead
+ * of keeping a huge number of small separate objects, and the quadratic
+ * behaviour of using a naive repeated concatenation scheme.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    PyObject *large;  /* A list of previously accumulated large strings */
+    PyObject *small;  /* Pending small strings */
+} _PyAccu;
+
+PyAPI_FUNC(int) _PyAccu_Init(_PyAccu *acc);
+PyAPI_FUNC(int) _PyAccu_Accumulate(_PyAccu *acc, PyObject *unicode);
+PyAPI_FUNC(PyObject *) _PyAccu_FinishAsList(_PyAccu *acc);
+PyAPI_FUNC(PyObject *) _PyAccu_Finish(_PyAccu *acc);
+PyAPI_FUNC(void) _PyAccu_Destroy(_PyAccu *acc);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* Py_ACCU_H */
+#endif /* Py_LIMITED_API */
diff --git a/Lib/test/test_list.py b/Lib/test/test_list.py
--- a/Lib/test/test_list.py
+++ b/Lib/test/test_list.py
@@ -59,6 +59,17 @@
         self.assertRaises((MemoryError, OverflowError), mul, lst, n)
         self.assertRaises((MemoryError, OverflowError), imul, lst, n)
 
+    def test_repr_large(self):
+        # Check the repr of large list objects
+        def check(n):
+            l = [0] * n
+            s = repr(l)
+            self.assertEqual(s,
+                '[' + ', '.join(['0'] * n) + ']')
+        check(10)       # check our checking code
+        check(1000000)
+
+
 def test_main(verbose=None):
     support.run_unittest(ListTest)
 
diff --git a/Lib/test/test_tuple.py b/Lib/test/test_tuple.py
--- a/Lib/test/test_tuple.py
+++ b/Lib/test/test_tuple.py
@@ -154,6 +154,16 @@
         # Trying to untrack an unfinished tuple could crash Python
         self._not_tracked(tuple(gc.collect() for i in range(101)))
 
+    def test_repr_large(self):
+        # Check the repr of large list objects
+        def check(n):
+            l = (0,) * n
+            s = repr(l)
+            self.assertEqual(s,
+                '(' + ', '.join(['0'] * n) + ')')
+        check(10)       # check our checking code
+        check(1000000)
+
 def test_main():
     support.run_unittest(TupleTest)
 
diff --git a/Makefile.pre.in b/Makefile.pre.in
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -342,6 +342,7 @@
 # Objects
 OBJECT_OBJS=	\
 		Objects/abstract.o \
+		Objects/accu.o \
 		Objects/boolobject.o \
 		Objects/bytes_methods.o \
 		Objects/bytearrayobject.o \
@@ -661,6 +662,7 @@
 PYTHON_HEADERS= \
 		Include/Python.h \
 		Include/abstract.h \
+		Include/accu.h \
 		Include/asdl.h \
 		Include/ast.h \
                 Include/bltinmodule.h \
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@
 Core and Builtins
 -----------------
 
+- Issue #12911: Fix memory consumption when calculating the repr() of huge
+  tuples or lists.
+
 - PEP 393: flexible string representation. Thanks to Torsten Becker for the
   initial implementation, and Victor Stinner for various bug fixes.
 
diff --git a/Objects/accu.c b/Objects/accu.c
new file mode 100644
--- /dev/null
+++ b/Objects/accu.c
@@ -0,0 +1,114 @@
+/* Accumulator struct implementation */
+
+#include "Python.h"
+
+static PyObject *
+join_list_unicode(PyObject *lst)
+{
+    /* return ''.join(lst) */
+    PyObject *sep, *ret;
+    sep = PyUnicode_FromStringAndSize("", 0);
+    ret = PyUnicode_Join(sep, lst);
+    Py_DECREF(sep);
+    return ret;
+}
+
+int
+_PyAccu_Init(_PyAccu *acc)
+{
+    /* Lazily allocated */
+    acc->large = NULL;
+    acc->small = PyList_New(0);
+    if (acc->small == NULL)
+        return -1;
+    return 0;
+}
+
+static int
+flush_accumulator(_PyAccu *acc)
+{
+    Py_ssize_t nsmall = PyList_GET_SIZE(acc->small);
+    if (nsmall) {
+        int ret;
+        PyObject *joined;
+        if (acc->large == NULL) {
+            acc->large = PyList_New(0);
+            if (acc->large == NULL)
+                return -1;
+        }
+        joined = join_list_unicode(acc->small);
+        if (joined == NULL)
+            return -1;
+        if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) {
+            Py_DECREF(joined);
+            return -1;
+        }
+        ret = PyList_Append(acc->large, joined);
+        Py_DECREF(joined);
+        return ret;
+    }
+    return 0;
+}
+
+int
+_PyAccu_Accumulate(_PyAccu *acc, PyObject *unicode)
+{
+    Py_ssize_t nsmall;
+    assert(PyUnicode_Check(unicode));
+
+    if (PyList_Append(acc->small, unicode))
+        return -1;
+    nsmall = PyList_GET_SIZE(acc->small);
+    /* Each item in a list of unicode objects has an overhead (in 64-bit
+     * builds) of:
+     *   - 8 bytes for the list slot
+     *   - 56 bytes for the header of the unicode object
+     * that is, 64 bytes.  100000 such objects waste more than 6MB
+     * compared to a single concatenated string.
+     */
+    if (nsmall < 100000)
+        return 0;
+    return flush_accumulator(acc);
+}
+
+PyObject *
+_PyAccu_FinishAsList(_PyAccu *acc)
+{
+    int ret;
+    PyObject *res;
+
+    ret = flush_accumulator(acc);
+    Py_CLEAR(acc->small);
+    if (ret) {
+        Py_CLEAR(acc->large);
+        return NULL;
+    }
+    res = acc->large;
+    acc->large = NULL;
+    return res;
+}
+
+PyObject *
+_PyAccu_Finish(_PyAccu *acc)
+{
+    PyObject *list, *res;
+    if (acc->large == NULL) {
+        list = acc->small;
+        acc->small = NULL;
+    }
+    else {
+        list = _PyAccu_FinishAsList(acc);
+        if (!list)
+            return NULL;
+    }
+    res = join_list_unicode(list);
+    Py_DECREF(list);
+    return res;
+}
+
+void
+_PyAccu_Destroy(_PyAccu *acc)
+{
+    Py_CLEAR(acc->small);
+    Py_CLEAR(acc->large);
+}
diff --git a/Objects/listobject.c b/Objects/listobject.c
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -321,70 +321,59 @@
 list_repr(PyListObject *v)
 {
     Py_ssize_t i;
-    PyObject *s, *temp;
-    PyObject *pieces = NULL, *result = NULL;
+    PyObject *s = NULL;
+    _PyAccu acc;
+    static PyObject *sep = NULL;
+
+    if (Py_SIZE(v) == 0) {
+        return PyUnicode_FromString("[]");
+    }
+
+    if (sep == NULL) {
+        sep = PyUnicode_FromString(", ");
+        if (sep == NULL)
+            return NULL;
+    }
 
     i = Py_ReprEnter((PyObject*)v);
     if (i != 0) {
         return i > 0 ? PyUnicode_FromString("[...]") : NULL;
     }
 
-    if (Py_SIZE(v) == 0) {
-        result = PyUnicode_FromString("[]");
-        goto Done;
-    }
+    if (_PyAccu_Init(&acc))
+        goto error;
 
-    pieces = PyList_New(0);
-    if (pieces == NULL)
-        goto Done;
+    s = PyUnicode_FromString("[");
+    if (s == NULL || _PyAccu_Accumulate(&acc, s))
+        goto error;
+    Py_CLEAR(s);
 
     /* Do repr() on each element.  Note that this may mutate the list,
        so must refetch the list size on each iteration. */
     for (i = 0; i < Py_SIZE(v); ++i) {
-        int status;
         if (Py_EnterRecursiveCall(" while getting the repr of a list"))
-            goto Done;
+            goto error;
         s = PyObject_Repr(v->ob_item[i]);
         Py_LeaveRecursiveCall();
-        if (s == NULL)
-            goto Done;
-        status = PyList_Append(pieces, s);
-        Py_DECREF(s);  /* append created a new ref */
-        if (status < 0)
-            goto Done;
+        if (i > 0 && _PyAccu_Accumulate(&acc, sep))
+            goto error;
+        if (s == NULL || _PyAccu_Accumulate(&acc, s))
+            goto error;
+        Py_CLEAR(s);
     }
+    s = PyUnicode_FromString("]");
+    if (s == NULL || _PyAccu_Accumulate(&acc, s))
+        goto error;
+    Py_CLEAR(s);
 
-    /* Add "[]" decorations to the first and last items. */
-    assert(PyList_GET_SIZE(pieces) > 0);
-    s = PyUnicode_FromString("[");
-    if (s == NULL)
-        goto Done;
-    temp = PyList_GET_ITEM(pieces, 0);
-    PyUnicode_AppendAndDel(&s, temp);
-    PyList_SET_ITEM(pieces, 0, s);
-    if (s == NULL)
-        goto Done;
+    Py_ReprLeave((PyObject *)v);
+    return _PyAccu_Finish(&acc);
 
-    s = PyUnicode_FromString("]");
-    if (s == NULL)
-        goto Done;
-    temp = PyList_GET_ITEM(pieces, PyList_GET_SIZE(pieces) - 1);
-    PyUnicode_AppendAndDel(&temp, s);
-    PyList_SET_ITEM(pieces, PyList_GET_SIZE(pieces) - 1, temp);
-    if (temp == NULL)
-        goto Done;
-
-    /* Paste them all together with ", " between. */
-    s = PyUnicode_FromString(", ");
-    if (s == NULL)
-        goto Done;
-    result = PyUnicode_Join(s, pieces);
-    Py_DECREF(s);
-
-Done:
-    Py_XDECREF(pieces);
+error:
+    _PyAccu_Destroy(&acc);
+    Py_XDECREF(s);
     Py_ReprLeave((PyObject *)v);
-    return result;
+    return NULL;
 }
 
 static Py_ssize_t
diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c
--- a/Objects/tupleobject.c
+++ b/Objects/tupleobject.c
@@ -240,13 +240,20 @@
 tuplerepr(PyTupleObject *v)
 {
     Py_ssize_t i, n;
-    PyObject *s, *temp;
-    PyObject *pieces, *result = NULL;
+    PyObject *s = NULL;
+    _PyAccu acc;
+    static PyObject *sep = NULL;
 
     n = Py_SIZE(v);
     if (n == 0)
         return PyUnicode_FromString("()");
 
+    if (sep == NULL) {
+        sep = PyUnicode_FromString(", ");
+        if (sep == NULL)
+            return NULL;
+    }
+
     /* While not mutable, it is still possible to end up with a cycle in a
        tuple through an object that stores itself within a tuple (and thus
        infinitely asks for the repr of itself). This should only be
@@ -256,52 +263,42 @@
         return i > 0 ? PyUnicode_FromString("(...)") : NULL;
     }
 
-    pieces = PyTuple_New(n);
-    if (pieces == NULL)
-        return NULL;
+    if (_PyAccu_Init(&acc))
+        goto error;
+
+    s = PyUnicode_FromString("(");
+    if (s == NULL || _PyAccu_Accumulate(&acc, s))
+        goto error;
+    Py_CLEAR(s);
 
     /* Do repr() on each element. */
     for (i = 0; i < n; ++i) {
         if (Py_EnterRecursiveCall(" while getting the repr of a tuple"))
-            goto Done;
+            goto error;
         s = PyObject_Repr(v->ob_item[i]);
         Py_LeaveRecursiveCall();
-        if (s == NULL)
-            goto Done;
-        PyTuple_SET_ITEM(pieces, i, s);
+        if (i > 0 && _PyAccu_Accumulate(&acc, sep))
+            goto error;
+        if (s == NULL || _PyAccu_Accumulate(&acc, s))
+            goto error;
+        Py_CLEAR(s);
     }
+    if (n > 1)
+        s = PyUnicode_FromString(")");
+    else
+        s = PyUnicode_FromString(",)");
+    if (s == NULL || _PyAccu_Accumulate(&acc, s))
+        goto error;
+    Py_CLEAR(s);
 
-    /* Add "()" decorations to the first and last items. */
-    assert(n > 0);
-    s = PyUnicode_FromString("(");
-    if (s == NULL)
-        goto Done;
-    temp = PyTuple_GET_ITEM(pieces, 0);
-    PyUnicode_AppendAndDel(&s, temp);
-    PyTuple_SET_ITEM(pieces, 0, s);
-    if (s == NULL)
-        goto Done;
+    Py_ReprLeave((PyObject *)v);
+    return _PyAccu_Finish(&acc);
 
-    s = PyUnicode_FromString(n == 1 ? ",)" : ")");
-    if (s == NULL)
-        goto Done;
-    temp = PyTuple_GET_ITEM(pieces, n-1);
-    PyUnicode_AppendAndDel(&temp, s);
-    PyTuple_SET_ITEM(pieces, n-1, temp);
-    if (temp == NULL)
-        goto Done;
-
-    /* Paste them all together with ", " between. */
-    s = PyUnicode_FromString(", ");
-    if (s == NULL)
-        goto Done;
-    result = PyUnicode_Join(s, pieces);
-    Py_DECREF(s);
-
-Done:
-    Py_DECREF(pieces);
+error:
+    _PyAccu_Destroy(&acc);
+    Py_XDECREF(s);
     Py_ReprLeave((PyObject *)v);
-    return result;
+    return NULL;
 }
 
 /* The addend 82520, was selected from the range(0, 1000000) for
diff --git a/PC/VC6/pythoncore.dsp b/PC/VC6/pythoncore.dsp
--- a/PC/VC6/pythoncore.dsp
+++ b/PC/VC6/pythoncore.dsp
@@ -205,6 +205,10 @@
 # End Source File
 # Begin Source File
 
+SOURCE=..\..\Objects\accu.c
+# End Source File
+# Begin Source File
+
 SOURCE=..\..\Parser\acceler.c
 # End Source File
 # Begin Source File
diff --git a/PC/VS7.1/pythoncore.vcproj b/PC/VS7.1/pythoncore.vcproj
--- a/PC/VS7.1/pythoncore.vcproj
+++ b/PC/VS7.1/pythoncore.vcproj
@@ -445,6 +445,9 @@
 			RelativePath="..\..\Objects\abstract.c">
 		</File>
 		<File
+			RelativePath="..\..\Objects\accu.c">
+		</File>
+		<File
 			RelativePath="..\..\Parser\acceler.c">
 		</File>
 		<File
diff --git a/PC/VS8.0/pythoncore.vcproj b/PC/VS8.0/pythoncore.vcproj
--- a/PC/VS8.0/pythoncore.vcproj
+++ b/PC/VS8.0/pythoncore.vcproj
@@ -635,6 +635,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\Include\accu.h"
+				>
+			</File>
+			<File
 				RelativePath="..\..\Include\asdl.h"
 				>
 			</File>
@@ -1447,6 +1451,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\Objects\accu.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\Objects\boolobject.c"
 				>
 			</File>
diff --git a/PCbuild/pythoncore.vcproj b/PCbuild/pythoncore.vcproj
--- a/PCbuild/pythoncore.vcproj
+++ b/PCbuild/pythoncore.vcproj
@@ -635,6 +635,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\Include\accu.h"
+				>
+			</File>
+			<File
 				RelativePath="..\Include\asdl.h"
 				>
 			</File>
@@ -1455,6 +1459,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\Objects\accu.c"
+				>
+			</File>
+			<File
 				RelativePath="..\Objects\boolobject.c"
 				>
 			</File>

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list