[pypy-commit] pypy stdlib-3.2.5: Expat parser now correctly works with unicode input, even when the XML internal encoding is not UTF-8

amauryfa noreply at buildbot.pypy.org
Wed Apr 2 02:58:12 CEST 2014


Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: stdlib-3.2.5
Changeset: r70392:73ca2fbe4077
Date: 2014-04-02 02:21 +0200
http://bitbucket.org/pypy/pypy/changeset/73ca2fbe4077/

Log:	Expat parser now works correctly with unicode input, even when the
	XML internal encoding is not UTF-8 (CPython issue 17089)

diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -2,6 +2,7 @@
 from pypy.interpreter.typedef import TypeDef, GetSetProperty
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.unicodehelper import encode_utf8
 from rpython.rlib import rgc, jit
 from rpython.rtyper.lltypesystem import rffi, lltype
 from rpython.rtyper.tool import rffi_platform
@@ -348,6 +349,8 @@
 XML_SetUnknownEncodingHandler = expat_external(
     'XML_SetUnknownEncodingHandler',
     [XML_Parser, callback_type, rffi.VOIDP], lltype.Void)
+XML_SetEncoding = expat_external(
+    'XML_SetEncoding', [XML_Parser, rffi.CCHARP], rffi.INT)
 
 # Declarations of external functions
 
@@ -622,10 +625,17 @@
 
     # Parse methods
 
-    @unwrap_spec(data='bufferstr_or_u', isfinal=bool)
-    def Parse(self, space, data, isfinal=False):
+    @unwrap_spec(isfinal=bool)
+    def Parse(self, space, w_data, isfinal=False):
         """Parse(data[, isfinal])
 Parse XML data.  `isfinal' should be true at end of input."""
+        if space.isinstance_w(w_data, space.w_unicode):
+            u = w_data.unicode_w(space)
+            data = encode_utf8(space, w_data.unicode_w(space))
+            # Explicitly set UTF-8 encoding. Return code ignored.
+            XML_SetEncoding(self.itself, "utf-8")
+        else:
+            data = space.bufferstr_w(w_data)
         res = XML_Parse(self.itself, data, len(data), isfinal)
         if self._exc_info:
             e = self._exc_info
@@ -643,9 +653,8 @@
         eof = False
         while not eof:
             w_data = space.call_method(w_file, 'read', space.wrap(2048))
-            data = space.bytes_w(w_data)
-            eof = len(data) == 0
-            w_res = self.Parse(space, data, isfinal=eof)
+            eof = space.len_w(w_data) == 0
+            w_res = self.Parse(space, w_data, isfinal=eof)
         return w_res
 
     @unwrap_spec(base=str)
diff --git a/pypy/module/pyexpat/test/test_parser.py b/pypy/module/pyexpat/test/test_parser.py
--- a/pypy/module/pyexpat/test/test_parser.py
+++ b/pypy/module/pyexpat/test/test_parser.py
@@ -100,7 +100,7 @@
         p.Parse(xml)
 
     def test_python_encoding(self):
-        # This name is not knonwn by expat
+        # This name is not known by expat
         xml = b"<?xml version='1.0' encoding='latin1'?><s>caf\xe9</s>"
         import pyexpat
         p = pyexpat.ParserCreate()
@@ -110,12 +110,21 @@
         p.Parse(xml)
 
     def test_mbcs(self):
-        xml = "<?xml version='1.0' encoding='gbk'?><p/>"
+        xml = b"<?xml version='1.0' encoding='gbk'?><p/>"
         import pyexpat
         p = pyexpat.ParserCreate()
         exc = raises(ValueError, p.Parse, xml)
         assert str(exc.value) == "multi-byte encodings are not supported"
 
+    def test_parse_str(self):
+        xml = "<?xml version='1.0' encoding='latin1'?><s>caf\xe9</s>"
+        import pyexpat
+        p = pyexpat.ParserCreate()
+        def gotText(text):
+            assert text == "caf\xe9"
+        p.CharacterDataHandler = gotText
+        p.Parse(xml)
+
     def test_decode_error(self):
         xml = b'<fran\xe7ais>Comment \xe7a va ? Tr\xe8s bien ?</fran\xe7ais>'
         import pyexpat


More information about the pypy-commit mailing list