[Python-checkins] cpython: Issue #25873: Optimized iterating ElementTree.

serhiy.storchaka python-checkins at python.org
Mon Dec 21 05:44:24 EST 2015


https://hg.python.org/cpython/rev/5a5d5268afd5
changeset:   99655:5a5d5268afd5
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Mon Dec 21 12:43:54 2015 +0200
summary:
  Issue #25873: Optimized iterating ElementTree.
Iterating elements Element.iter() is now 40% faster,
iterating text Element.itertext() is now up to 2.5 times faster.

files:
  Misc/NEWS              |    4 +
  Modules/_elementtree.c |  261 +++++++++++-----------------
  2 files changed, 106 insertions(+), 159 deletions(-)


diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -115,6 +115,10 @@
 Library
 -------
 
+- Issue #25873: Optimized iterating ElementTree.  Iterating elements
+  Element.iter() is now 40% faster, iterating text Element.itertext()
+  is now up to 2.5 times faster.
+
 - Issue #25902: Fixed various refcount issues in ElementTree iteration.
 
 - Issue #22227: The TarFile iterator is reimplemented using generator.
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -1986,22 +1986,22 @@
  * pre-order traversal. To keep track of which sub-element should be returned
  * next, a stack of parents is maintained. This is a standard stack-based
  * iterative pre-order traversal of a tree.
- * The stack is managed using a single-linked list starting at parent_stack.
- * Each stack node contains the saved parent to which we should return after
+ * The stack is managed using a continuous array.
+ * Each stack item contains the saved parent to which we should return after
  * the current one is exhausted, and the next child to examine in that parent.
  */
 typedef struct ParentLocator_t {
     ElementObject *parent;
     Py_ssize_t child_index;
-    struct ParentLocator_t *next;
 } ParentLocator;
 
 typedef struct {
     PyObject_HEAD
     ParentLocator *parent_stack;
+    Py_ssize_t parent_stack_used;
+    Py_ssize_t parent_stack_size;
     ElementObject *root_element;
     PyObject *sought_tag;
-    int root_done;
     int gettext;
 } ElementIterObject;
 
@@ -2009,13 +2009,11 @@
 static void
 elementiter_dealloc(ElementIterObject *it)
 {
-    ParentLocator *p = it->parent_stack;
-    while (p) {
-        ParentLocator *temp = p;
-        Py_XDECREF(p->parent);
-        p = p->next;
-        PyObject_Free(temp);
-    }
+    Py_ssize_t i = it->parent_stack_used;
+    it->parent_stack_used = 0;
+    while (i--)
+        Py_XDECREF(it->parent_stack[i].parent);
+    PyMem_Free(it->parent_stack);
 
     Py_XDECREF(it->sought_tag);
     Py_XDECREF(it->root_element);
@@ -2027,11 +2025,9 @@
 static int
 elementiter_traverse(ElementIterObject *it, visitproc visit, void *arg)
 {
-    ParentLocator *p = it->parent_stack;
-    while (p) {
-        Py_VISIT(p->parent);
-        p = p->next;
-    }
+    Py_ssize_t i = it->parent_stack_used;
+    while (i--)
+        Py_VISIT(it->parent_stack[i].parent);
 
     Py_VISIT(it->root_element);
     Py_VISIT(it->sought_tag);
@@ -2040,17 +2036,25 @@
 
 /* Helper function for elementiter_next. Add a new parent to the parent stack.
  */
-static ParentLocator *
-parent_stack_push_new(ParentLocator *stack, ElementObject *parent)
+static int
+parent_stack_push_new(ElementIterObject *it, ElementObject *parent)
 {
-    ParentLocator *new_node = PyObject_Malloc(sizeof(ParentLocator));
-    if (new_node) {
-        new_node->parent = parent;
-        Py_INCREF(parent);
-        new_node->child_index = 0;
-        new_node->next = stack;
+    ParentLocator *item;
+
+    if (it->parent_stack_used >= it->parent_stack_size) {
+        Py_ssize_t new_size = it->parent_stack_size * 2;  /* never overflow */
+        ParentLocator *parent_stack = it->parent_stack;
+        PyMem_Resize(parent_stack, ParentLocator, new_size);
+        if (parent_stack == NULL)
+            return -1;
+        it->parent_stack = parent_stack;
+        it->parent_stack_size = new_size;
     }
-    return new_node;
+    item = it->parent_stack + it->parent_stack_used++;
+    Py_INCREF(parent);
+    item->parent = parent;
+    item->child_index = 0;
+    return 0;
 }
 
 static PyObject *
@@ -2067,151 +2071,91 @@
      *   - itertext() also has to handle tail, after finishing with all the
      *     children of a node.
      */
-    ElementObject *cur_parent;
-    Py_ssize_t child_index;
     int rc;
     ElementObject *elem;
+    PyObject *text;
 
     while (1) {
         /* Handle the case reached in the beginning and end of iteration, where
-         * the parent stack is empty. The root_done flag gives us indication
-         * whether we've just started iterating (so root_done is 0), in which
-         * case the root is returned. If root_done is 1 and we're here, the
+         * the parent stack is empty. If root_element is NULL and we're here, the
          * iterator is exhausted.
          */
-        if (!it->parent_stack->parent) {
-            if (it->root_done) {
+        if (!it->parent_stack_used) {
+            if (!it->root_element) {
                 PyErr_SetNone(PyExc_StopIteration);
                 return NULL;
-            } else {
-                elem = it->root_element;
-                it->parent_stack = parent_stack_push_new(it->parent_stack,
-                                                         elem);
-                if (!it->parent_stack) {
-                    PyErr_NoMemory();
-                    return NULL;
-                }
-
-                Py_INCREF(elem);
-                it->root_done = 1;
-                rc = (it->sought_tag == Py_None);
-                if (!rc) {
-                    rc = PyObject_RichCompareBool(elem->tag,
-                                                  it->sought_tag, Py_EQ);
-                    if (rc < 0) {
-                        Py_DECREF(elem);
-                        return NULL;
-                    }
-                }
-                if (rc) {
-                    if (it->gettext) {
-                        PyObject *text = element_get_text(elem);
-                        if (!text) {
-                            Py_DECREF(elem);
-                            return NULL;
-                        }
-                        Py_INCREF(text);
-                        Py_DECREF(elem);
-                        rc = PyObject_IsTrue(text);
-                        if (rc > 0)
-                            return text;
-                        Py_DECREF(text);
-                        if (rc < 0)
-                            return NULL;
-                    } else {
-                        return (PyObject *)elem;
-                    }
-                }
-                else {
-                    Py_DECREF(elem);
-                }
             }
+
+            elem = it->root_element;  /* steals a reference */
+            it->root_element = NULL;
         }
-
-        /* See if there are children left to traverse in the current parent. If
-         * yes, visit the next child. If not, pop the stack and try again.
-         */
-        cur_parent = it->parent_stack->parent;
-        child_index = it->parent_stack->child_index;
-        if (cur_parent->extra && child_index < cur_parent->extra->length) {
-            elem = (ElementObject *)cur_parent->extra->children[child_index];
-            it->parent_stack->child_index++;
-            it->parent_stack = parent_stack_push_new(it->parent_stack,
-                                                     elem);
-            if (!it->parent_stack) {
-                PyErr_NoMemory();
-                return NULL;
-            }
-
-            Py_INCREF(elem);
-            if (it->gettext) {
-                PyObject *text = element_get_text(elem);
-                if (!text) {
-                    Py_DECREF(elem);
-                    return NULL;
-                }
-                Py_INCREF(text);
-                Py_DECREF(elem);
-                rc = PyObject_IsTrue(text);
-                if (rc > 0)
-                    return text;
-                Py_DECREF(text);
-                if (rc < 0)
-                    return NULL;
-            } else {
-                rc = (it->sought_tag == Py_None);
-                if (!rc) {
-                    rc = PyObject_RichCompareBool(elem->tag,
-                                                  it->sought_tag, Py_EQ);
-                    if (rc < 0) {
-                        Py_DECREF(elem);
-                        return NULL;
-                    }
-                }
-                if (rc) {
-                    return (PyObject *)elem;
+        else {
+            /* See if there are children left to traverse in the current parent. If
+             * yes, visit the next child. If not, pop the stack and try again.
+             */
+            ParentLocator *item = &it->parent_stack[it->parent_stack_used - 1];
+            Py_ssize_t child_index = item->child_index;
+            ElementObjectExtra *extra;
+            elem = item->parent;
+            extra = elem->extra;
+            if (!extra || child_index >= extra->length) {
+                it->parent_stack_used--;
+                /* Note that extra condition on it->parent_stack_used here;
+                 * this is because itertext() is supposed to only return *inner*
+                 * text, not text following the element it began iteration with.
+                 */
+                if (it->gettext && it->parent_stack_used) {
+                    text = element_get_tail(elem);
+                    goto gettext;
                 }
                 Py_DECREF(elem);
+                continue;
             }
+
+            elem = (ElementObject *)extra->children[child_index];
+            item->child_index++;
+            Py_INCREF(elem);
+        }
+
+        if (parent_stack_push_new(it, elem) < 0) {
+            Py_DECREF(elem);
+            PyErr_NoMemory();
+            return NULL;
+        }
+        if (it->gettext) {
+            text = element_get_text(elem);
+            goto gettext;
+        }
+
+        if (it->sought_tag == Py_None)
+            return (PyObject *)elem;
+
+        rc = PyObject_RichCompareBool(elem->tag, it->sought_tag, Py_EQ);
+        if (rc > 0)
+            return (PyObject *)elem;
+
+        Py_DECREF(elem);
+        if (rc < 0)
+            return NULL;
+        continue;
+
+gettext:
+        if (!text) {
+            Py_DECREF(elem);
+            return NULL;
+        }
+        if (text == Py_None) {
+            Py_DECREF(elem);
         }
         else {
-            PyObject *tail;
-            ParentLocator *next;
-            if (it->gettext) {
-                Py_INCREF(cur_parent);
-                tail = element_get_tail(cur_parent);
-                if (!tail) {
-                    Py_DECREF(cur_parent);
-                    return NULL;
-                }
-                Py_INCREF(tail);
-                Py_DECREF(cur_parent);
-            }
-            else {
-                tail = Py_None;
-                Py_INCREF(tail);
-            }
-            next = it->parent_stack->next;
-            cur_parent = it->parent_stack->parent;
-            PyObject_Free(it->parent_stack);
-            it->parent_stack = next;
-            Py_XDECREF(cur_parent);
-
-            /* Note that extra condition on it->parent_stack->parent here;
-             * this is because itertext() is supposed to only return *inner*
-             * text, not text following the element it began iteration with.
-             */
-            if (it->parent_stack->parent) {
-                rc = PyObject_IsTrue(tail);
-                if (rc > 0)
-                    return tail;
-                Py_DECREF(tail);
-                if (rc < 0)
-                    return NULL;
-            }
-            else {
-                Py_DECREF(tail);
-            }
+            Py_INCREF(text);
+            Py_DECREF(elem);
+            rc = PyObject_IsTrue(text);
+            if (rc > 0)
+                return text;
+            Py_DECREF(text);
+            if (rc < 0)
+                return NULL;
         }
     }
 
@@ -2263,6 +2207,7 @@
     0,                                          /* tp_new */
 };
 
+#define INIT_PARENT_STACK_SIZE 8
 
 static PyObject *
 create_elementiter(ElementObject *self, PyObject *tag, int gettext)
@@ -2275,22 +2220,20 @@
 
     Py_INCREF(tag);
     it->sought_tag = tag;
-    it->root_done = 0;
     it->gettext = gettext;
     Py_INCREF(self);
     it->root_element = self;
 
     PyObject_GC_Track(it);
 
-    it->parent_stack = PyObject_Malloc(sizeof(ParentLocator));
+    it->parent_stack = PyMem_New(ParentLocator, INIT_PARENT_STACK_SIZE);
     if (it->parent_stack == NULL) {
         Py_DECREF(it);
         PyErr_NoMemory();
         return NULL;
     }
-    it->parent_stack->parent = NULL;
-    it->parent_stack->child_index = 0;
-    it->parent_stack->next = NULL;
+    it->parent_stack_used = 0;
+    it->parent_stack_size = INIT_PARENT_STACK_SIZE;
 
     return (PyObject *)it;
 }

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list