[pypy-commit] pypy stdlib-3.2.5: Expat parser now works correctly with unicode input
amauryfa
noreply at buildbot.pypy.org
Wed Apr 2 02:58:12 CEST 2014
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: stdlib-3.2.5
Changeset: r70392:73ca2fbe4077
Date: 2014-04-02 02:21 +0200
http://bitbucket.org/pypy/pypy/changeset/73ca2fbe4077/
Log: Expat parser now works correctly with unicode input, even when the
XML internal encoding is not UTF-8 (CPython issue 17089)
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -2,6 +2,7 @@
from pypy.interpreter.typedef import TypeDef, GetSetProperty
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.unicodehelper import encode_utf8
from rpython.rlib import rgc, jit
from rpython.rtyper.lltypesystem import rffi, lltype
from rpython.rtyper.tool import rffi_platform
@@ -348,6 +349,8 @@
XML_SetUnknownEncodingHandler = expat_external(
'XML_SetUnknownEncodingHandler',
[XML_Parser, callback_type, rffi.VOIDP], lltype.Void)
+XML_SetEncoding = expat_external(
+ 'XML_SetEncoding', [XML_Parser, rffi.CCHARP], rffi.INT)
# Declarations of external functions
@@ -622,10 +625,17 @@
# Parse methods
- @unwrap_spec(data='bufferstr_or_u', isfinal=bool)
- def Parse(self, space, data, isfinal=False):
+ @unwrap_spec(isfinal=bool)
+ def Parse(self, space, w_data, isfinal=False):
"""Parse(data[, isfinal])
Parse XML data. `isfinal' should be true at end of input."""
+ if space.isinstance_w(w_data, space.w_unicode):
+ u = w_data.unicode_w(space)
+ data = encode_utf8(space, w_data.unicode_w(space))
+ # Explicitly set UTF-8 encoding. Return code ignored.
+ XML_SetEncoding(self.itself, "utf-8")
+ else:
+ data = space.bufferstr_w(w_data)
res = XML_Parse(self.itself, data, len(data), isfinal)
if self._exc_info:
e = self._exc_info
@@ -643,9 +653,8 @@
eof = False
while not eof:
w_data = space.call_method(w_file, 'read', space.wrap(2048))
- data = space.bytes_w(w_data)
- eof = len(data) == 0
- w_res = self.Parse(space, data, isfinal=eof)
+ eof = space.len_w(w_data) == 0
+ w_res = self.Parse(space, w_data, isfinal=eof)
return w_res
@unwrap_spec(base=str)
diff --git a/pypy/module/pyexpat/test/test_parser.py b/pypy/module/pyexpat/test/test_parser.py
--- a/pypy/module/pyexpat/test/test_parser.py
+++ b/pypy/module/pyexpat/test/test_parser.py
@@ -100,7 +100,7 @@
p.Parse(xml)
def test_python_encoding(self):
- # This name is not knonwn by expat
+ # This name is not known by expat
xml = b"<?xml version='1.0' encoding='latin1'?><s>caf\xe9</s>"
import pyexpat
p = pyexpat.ParserCreate()
@@ -110,12 +110,21 @@
p.Parse(xml)
def test_mbcs(self):
- xml = "<?xml version='1.0' encoding='gbk'?><p/>"
+ xml = b"<?xml version='1.0' encoding='gbk'?><p/>"
import pyexpat
p = pyexpat.ParserCreate()
exc = raises(ValueError, p.Parse, xml)
assert str(exc.value) == "multi-byte encodings are not supported"
+ def test_parse_str(self):
+ xml = "<?xml version='1.0' encoding='latin1'?><s>caf\xe9</s>"
+ import pyexpat
+ p = pyexpat.ParserCreate()
+ def gotText(text):
+ assert text == "caf\xe9"
+ p.CharacterDataHandler = gotText
+ p.Parse(xml)
+
def test_decode_error(self):
xml = b'<fran\xe7ais>Comment \xe7a va ? Tr\xe8s bien ?</fran\xe7ais>'
import pyexpat
More information about the pypy-commit
mailing list