[Python-checkins] bpo-37399: Correctly attach tail text to the last element/comment/pi (GH-14856)

Stefan Behnel webhook-mailer at python.org
Wed Jul 24 14:08:08 EDT 2019


https://github.com/python/cpython/commit/c6cb4cdd21c0c3a09b0617dbfaa7053d3bfa6def
commit: c6cb4cdd21c0c3a09b0617dbfaa7053d3bfa6def
branch: master
author: Stefan Behnel <stefan_ml at behnel.de>
committer: GitHub <noreply at github.com>
date: 2019-07-24T20:08:02+02:00
summary:

bpo-37399: Correctly attach tail text to the last element/comment/pi (GH-14856)

* bpo-37399: Correctly attach tail text to the last element/comment/pi, even when comments or pis are discarded.
Also fixes the insertion of PIs when "insert_pis=True" is configured for a TreeBuilder.

files:
M Lib/test/test_xml_etree.py
M Modules/_elementtree.c

diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 61737493a904..b2492cda848f 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -2954,6 +2954,66 @@ def test_treebuilder_pi(self):
         self.assertEqual(b.pi('target'), (len('target'), None))
         self.assertEqual(b.pi('pitarget', ' text '), (len('pitarget'), ' text '))
 
+    def test_late_tail(self):
+        # Issue #37399: The tail of an ignored comment could overwrite the text before it.
+        class TreeBuilderSubclass(ET.TreeBuilder):
+            pass
+
+        xml = "<a>text<!-- comment -->tail</a>"
+        a = ET.fromstring(xml)
+        self.assertEqual(a.text, "texttail")
+
+        parser = ET.XMLParser(target=TreeBuilderSubclass())
+        parser.feed(xml)
+        a = parser.close()
+        self.assertEqual(a.text, "texttail")
+
+        xml = "<a>text<?pi data?>tail</a>"
+        a = ET.fromstring(xml)
+        self.assertEqual(a.text, "texttail")
+
+        xml = "<a>text<?pi data?>tail</a>"
+        parser = ET.XMLParser(target=TreeBuilderSubclass())
+        parser.feed(xml)
+        a = parser.close()
+        self.assertEqual(a.text, "texttail")
+
+    def test_late_tail_mix_pi_comments(self):
+        # Issue #37399: The tail of an ignored comment could overwrite the text before it.
+        # Test appending tails to comments/pis.
+        class TreeBuilderSubclass(ET.TreeBuilder):
+            pass
+
+        xml = "<a>text<?pi1?> <!-- comment -->\n<?pi2?>tail</a>"
+        parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True))
+        parser.feed(xml)
+        a = parser.close()
+        self.assertEqual(a[0].text, ' comment ')
+        self.assertEqual(a[0].tail, '\ntail')
+        self.assertEqual(a.text, "text ")
+
+        parser = ET.XMLParser(target=TreeBuilderSubclass(insert_comments=True))
+        parser.feed(xml)
+        a = parser.close()
+        self.assertEqual(a[0].text, ' comment ')
+        self.assertEqual(a[0].tail, '\ntail')
+        self.assertEqual(a.text, "text ")
+
+        xml = "<a>text<!-- comment -->\n<?pi data?>tail</a>"
+        parser = ET.XMLParser(target=ET.TreeBuilder(insert_pis=True))
+        parser.feed(xml)
+        a = parser.close()
+        self.assertEqual(a[0].text, 'pi data')
+        self.assertEqual(a[0].tail, 'tail')
+        self.assertEqual(a.text, "text\n")
+
+        parser = ET.XMLParser(target=TreeBuilderSubclass(insert_pis=True))
+        parser.feed(xml)
+        a = parser.close()
+        self.assertEqual(a[0].text, 'pi data')
+        self.assertEqual(a[0].tail, 'tail')
+        self.assertEqual(a.text, "text\n")
+
     def test_treebuilder_elementfactory_none(self):
         parser = ET.XMLParser(target=ET.TreeBuilder(element_factory=None))
         parser.feed(self.sample1)
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
index b9b50165bfda..830ce8635eae 100644
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -2399,6 +2399,7 @@ typedef struct {
 
     PyObject *this; /* current node */
     PyObject *last; /* most recently created node */
+    PyObject *last_for_tail; /* most recently created node that takes a tail */
 
     PyObject *data; /* data collector (string or list), or NULL */
 
@@ -2530,6 +2531,7 @@ treebuilder_gc_traverse(TreeBuilderObject *self, visitproc visit, void *arg)
     Py_VISIT(self->root);
     Py_VISIT(self->this);
     Py_VISIT(self->last);
+    Py_VISIT(self->last_for_tail);
     Py_VISIT(self->data);
     Py_VISIT(self->stack);
     Py_VISIT(self->pi_factory);
@@ -2551,6 +2553,7 @@ treebuilder_gc_clear(TreeBuilderObject *self)
     Py_CLEAR(self->stack);
     Py_CLEAR(self->data);
     Py_CLEAR(self->last);
+    Py_CLEAR(self->last_for_tail);
     Py_CLEAR(self->this);
     Py_CLEAR(self->pi_factory);
     Py_CLEAR(self->comment_factory);
@@ -2622,21 +2625,50 @@ _elementtree__set_factories_impl(PyObject *module, PyObject *comment_factory,
 }
 
 static int
-treebuilder_set_element_text_or_tail(PyObject *element, PyObject **data,
-                                     PyObject **dest, _Py_Identifier *name)
+treebuilder_extend_element_text_or_tail(PyObject *element, PyObject **data,
+                                        PyObject **dest, _Py_Identifier *name)
 {
+    /* Fast paths for the "almost always" cases. */
     if (Element_CheckExact(element)) {
-        PyObject *tmp = JOIN_OBJ(*dest);
-        *dest = JOIN_SET(*data, PyList_CheckExact(*data));
-        *data = NULL;
-        Py_DECREF(tmp);
-        return 0;
+        PyObject *dest_obj = JOIN_OBJ(*dest);
+        if (dest_obj == Py_None) {
+            *dest = JOIN_SET(*data, PyList_CheckExact(*data));
+            *data = NULL;
+            Py_DECREF(dest_obj);
+            return 0;
+        }
+        else if (JOIN_GET(*dest)) {
+            if (PyList_SetSlice(dest_obj, PY_SSIZE_T_MAX, PY_SSIZE_T_MAX, *data) < 0) {
+                return -1;
+            }
+            Py_CLEAR(*data);
+            return 0;
+        }
     }
-    else {
-        PyObject *joined = list_join(*data);
+
+    /*  Fallback for the non-Element / non-trivial cases. */
+    {
         int r;
-        if (joined == NULL)
+        PyObject* joined;
+        PyObject* previous = _PyObject_GetAttrId(element, name);
+        if (!previous)
+            return -1;
+        joined = list_join(*data);
+        if (!joined) {
+            Py_DECREF(previous);
             return -1;
+        }
+        if (previous != Py_None) {
+            PyObject *tmp = PyNumber_Add(previous, joined);
+            Py_DECREF(joined);
+            Py_DECREF(previous);
+            if (!tmp)
+                return -1;
+            joined = tmp;
+        } else {
+            Py_DECREF(previous);
+        }
+
         r = _PyObject_SetAttrId(element, name, joined);
         Py_DECREF(joined);
         if (r < 0)
@@ -2649,21 +2681,21 @@ treebuilder_set_element_text_or_tail(PyObject *element, PyObject **data,
 LOCAL(int)
 treebuilder_flush_data(TreeBuilderObject* self)
 {
-    PyObject *element = self->last;
-
     if (!self->data) {
         return 0;
     }
 
-    if (self->this == element) {
+    if (!self->last_for_tail) {
+        PyObject *element = self->last;
         _Py_IDENTIFIER(text);
-        return treebuilder_set_element_text_or_tail(
+        return treebuilder_extend_element_text_or_tail(
                 element, &self->data,
                 &((ElementObject *) element)->text, &PyId_text);
     }
     else {
+        PyObject *element = self->last_for_tail;
         _Py_IDENTIFIER(tail);
-        return treebuilder_set_element_text_or_tail(
+        return treebuilder_extend_element_text_or_tail(
                 element, &self->data,
                 &((ElementObject *) element)->tail, &PyId_tail);
     }
@@ -2739,6 +2771,7 @@ treebuilder_handle_start(TreeBuilderObject* self, PyObject* tag,
     }
 
     this = self->this;
+    Py_CLEAR(self->last_for_tail);
 
     if (this != Py_None) {
         if (treebuilder_add_subelement(this, node) < 0)
@@ -2836,6 +2869,8 @@ treebuilder_handle_end(TreeBuilderObject* self, PyObject* tag)
 
     item = self->last;
     self->last = self->this;
+    Py_INCREF(self->last);
+    Py_XSETREF(self->last_for_tail, self->last);
     self->index--;
     self->this = PyList_GET_ITEM(self->stack, self->index);
     Py_INCREF(self->this);
@@ -2851,7 +2886,7 @@ treebuilder_handle_end(TreeBuilderObject* self, PyObject* tag)
 LOCAL(PyObject*)
 treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text)
 {
-    PyObject* comment = NULL;
+    PyObject* comment;
     PyObject* this;
 
     if (treebuilder_flush_data(self) < 0) {
@@ -2867,6 +2902,8 @@ treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text)
         if (self->insert_comments && this != Py_None) {
             if (treebuilder_add_subelement(this, comment) < 0)
                 goto error;
+            Py_INCREF(comment);
+            Py_XSETREF(self->last_for_tail, comment);
         }
     } else {
         Py_INCREF(text);
@@ -2888,7 +2925,7 @@ treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text)
 LOCAL(PyObject*)
 treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
 {
-    PyObject* pi = NULL;
+    PyObject* pi;
     PyObject* this;
     PyObject* stack[2] = {target, text};
 
@@ -2906,6 +2943,8 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
         if (self->insert_pis && this != Py_None) {
             if (treebuilder_add_subelement(this, pi) < 0)
                 goto error;
+            Py_INCREF(pi);
+            Py_XSETREF(self->last_for_tail, pi);
         }
     } else {
         pi = PyTuple_Pack(2, target, text);
@@ -3495,8 +3534,8 @@ expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
 static void
 expat_comment_handler(XMLParserObject* self, const XML_Char* comment_in)
 {
-    PyObject* comment = NULL;
-    PyObject* res = NULL;
+    PyObject* comment;
+    PyObject* res;
 
     if (PyErr_Occurred())
         return;
@@ -3510,16 +3549,17 @@ expat_comment_handler(XMLParserObject* self, const XML_Char* comment_in)
             return; /* parser will look for errors */
 
         res = treebuilder_handle_comment(target,  comment);
+        Py_XDECREF(res);
+        Py_DECREF(comment);
     } else if (self->handle_comment) {
         comment = PyUnicode_DecodeUTF8(comment_in, strlen(comment_in), "strict");
         if (!comment)
             return;
 
         res = _PyObject_CallOneArg(self->handle_comment, comment);
+        Py_XDECREF(res);
+        Py_DECREF(comment);
     }
-
-    Py_XDECREF(res);
-    Py_DECREF(comment);
 }
 
 static void
@@ -3587,7 +3627,7 @@ static void
 expat_pi_handler(XMLParserObject* self, const XML_Char* target_in,
                  const XML_Char* data_in)
 {
-    PyObject* pi_target = NULL;
+    PyObject* pi_target;
     PyObject* data;
     PyObject* res;
     PyObject* stack[2];
@@ -3599,7 +3639,7 @@ expat_pi_handler(XMLParserObject* self, const XML_Char* target_in,
         /* shortcut */
         TreeBuilderObject *target = (TreeBuilderObject*) self->target;
 
-        if (target->events_append && target->pi_event_obj) {
+        if ((target->events_append && target->pi_event_obj) || target->insert_pis) {
             pi_target = PyUnicode_DecodeUTF8(target_in, strlen(target_in), "strict");
             if (!pi_target)
                 goto error;



More information about the Python-checkins mailing list