[Python-checkins] cpython (3.3): Issue #16986: ElementTree now correctly parses a string input not only when an internal XML encoding is UTF-8 or US-ASCII.

serhiy.storchaka python-checkins at python.org
Wed May 22 16:21:36 CEST 2013


http://hg.python.org/cpython/rev/7781ccae7b9a
changeset:   83890:7781ccae7b9a
branch:      3.3
parent:      83888:039dc6dd2bc0
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Wed May 22 17:07:51 2013 +0300
summary:
  Issue #16986: ElementTree now correctly parses a string input not only when
an internal XML encoding is UTF-8 or US-ASCII.

files:
  Include/pyexpat.h          |   1 +
  Lib/test/test_xml_etree.py |  45 +++++++++++++++++--------
  Misc/NEWS                  |   3 +
  Modules/_elementtree.c     |  39 +++++++++++++++++-----
  Modules/pyexpat.c          |   1 +
  5 files changed, 65 insertions(+), 24 deletions(-)


diff --git a/Include/pyexpat.h b/Include/pyexpat.h
--- a/Include/pyexpat.h
+++ b/Include/pyexpat.h
@@ -45,6 +45,7 @@
     void (*SetUserData)(XML_Parser parser, void *userData);
     void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
                                        XML_StartDoctypeDeclHandler start);
+    enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
     /* always add new stuff to the end! */
 };
 
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -677,15 +677,18 @@
         elem = ET.fromstring("<html><body>text</body></html>")
         self.assertEqual(ET.tostring(elem), b'<html><body>text</body></html>')
 
-    def test_encoding(encoding):
-        def check(encoding):
-            ET.XML("<?xml version='1.0' encoding='%s'?><xml />" % encoding)
-        check("ascii")
-        check("us-ascii")
-        check("iso-8859-1")
-        check("iso-8859-15")
-        check("cp437")
-        check("mac-roman")
+    def test_encoding(self):
+        def check(encoding, body=''):
+            xml = ("<?xml version='1.0' encoding='%s'?><xml>%s</xml>" %
+                   (encoding, body))
+            self.assertEqual(ET.XML(xml.encode(encoding)).text, body)
+            self.assertEqual(ET.XML(xml).text, body)
+        check("ascii", 'a')
+        check("us-ascii", 'a')
+        check("iso-8859-1", '\xbd')
+        check("iso-8859-15", '\u20ac')
+        check("cp437", '\u221a')
+        check("mac-roman", '\u02da')
 
     def test_methods(self):
         # Test serialization methods.
@@ -1842,11 +1845,13 @@
 
 
 class XMLParserTest(unittest.TestCase):
-    sample1 = '<file><line>22</line></file>'
-    sample2 = ('<!DOCTYPE html PUBLIC'
-        ' "-//W3C//DTD XHTML 1.0 Transitional//EN"'
-        ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
-        '<html>text</html>')
+    sample1 = b'<file><line>22</line></file>'
+    sample2 = (b'<!DOCTYPE html PUBLIC'
+        b' "-//W3C//DTD XHTML 1.0 Transitional//EN"'
+        b' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
+        b'<html>text</html>')
+    sample3 = ('<?xml version="1.0" encoding="iso-8859-1"?>\n'
+        '<money value="$\xa3\u20ac\U0001017b">$\xa3\u20ac\U0001017b</money>')
 
     def _check_sample_element(self, e):
         self.assertEqual(e.tag, 'file')
@@ -1882,12 +1887,21 @@
                 _doctype = (name, pubid, system)
 
         parser = MyParserWithDoctype()
-        parser.feed(self.sample2)
+        with self.assertWarns(DeprecationWarning):
+            parser.feed(self.sample2)
         parser.close()
         self.assertEqual(_doctype,
             ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'))
 
+    def test_parse_string(self):
+        parser = ET.XMLParser(target=ET.TreeBuilder())
+        parser.feed(self.sample3)
+        e = parser.close()
+        self.assertEqual(e.tag, 'money')
+        self.assertEqual(e.attrib['value'], '$\xa3\u20ac\U0001017b')
+        self.assertEqual(e.text, '$\xa3\u20ac\U0001017b')
+
 
 class NamespaceParseTest(unittest.TestCase):
     def test_find_with_namespace(self):
@@ -2297,6 +2311,7 @@
         ElementFindTest,
         ElementIterTest,
         TreeBuilderTest,
+        XMLParserTest,
         BugsTest,
         ]
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -24,6 +24,9 @@
 Library
 -------
 
+- Issue #16986: ElementTree now correctly parses a string input not only when
+  an internal XML encoding is UTF-8 or US-ASCII.
+
 - Issue #17812: Fixed quadratic complexity of base64.b32encode().
 
 - Issue #17980: Fix possible abuse of ssl.match_hostname() for denial of
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -3330,7 +3330,7 @@
 }
 
 LOCAL(PyObject*)
-expat_parse(XMLParserObject* self, char* data, int data_len, int final)
+expat_parse(XMLParserObject* self, const char* data, int data_len, int final)
 {
     int ok;
 
@@ -3376,16 +3376,37 @@
 }
 
 static PyObject*
-xmlparser_feed(XMLParserObject* self, PyObject* args)
+xmlparser_feed(XMLParserObject* self, PyObject* arg)
 {
     /* feed data to parser */
 
-    char* data;
-    int data_len;
-    if (!PyArg_ParseTuple(args, "s#:feed", &data, &data_len))
-        return NULL;
-
-    return expat_parse(self, data, data_len, 0);
+    if (PyUnicode_Check(arg)) {
+        Py_ssize_t data_len;
+        const char *data = PyUnicode_AsUTF8AndSize(arg, &data_len);
+        if (data == NULL)
+            return NULL;
+        if (data_len > INT_MAX) {
+            PyErr_SetString(PyExc_OverflowError, "size does not fit in an int");
+            return NULL;
+        }
+        /* Explicitly set UTF-8 encoding. Return code ignored. */
+        (void)EXPAT(SetEncoding)(self->parser, "utf-8");
+        return expat_parse(self, data, (int)data_len, 0);
+    }
+    else {
+        Py_buffer view;
+        PyObject *res;
+        if (PyObject_GetBuffer(arg, &view, PyBUF_SIMPLE) < 0)
+            return NULL;
+        if (view.len > INT_MAX) {
+            PyBuffer_Release(&view);
+            PyErr_SetString(PyExc_OverflowError, "size does not fit in an int");
+            return NULL;
+        }
+        res = expat_parse(self, view.buf, (int)view.len, 0);
+        PyBuffer_Release(&view);
+        return res;
+    }
 }
 
 static PyObject*
@@ -3570,7 +3591,7 @@
 }
 
 static PyMethodDef xmlparser_methods[] = {
-    {"feed", (PyCFunction) xmlparser_feed, METH_VARARGS},
+    {"feed", (PyCFunction) xmlparser_feed, METH_O},
     {"close", (PyCFunction) xmlparser_close, METH_VARARGS},
     {"_parse", (PyCFunction) xmlparser_parse, METH_VARARGS},
     {"_setevents", (PyCFunction) xmlparser_setevents, METH_VARARGS},
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -1937,6 +1937,7 @@
     capi.SetUnknownEncodingHandler = XML_SetUnknownEncodingHandler;
     capi.SetUserData = XML_SetUserData;
     capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
+    capi.SetEncoding = XML_SetEncoding;
 
     /* export using capsule */
     capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list