[pypy-svn] pypy default: Fix the "replace" error handler: on UnicodeDecodeError, only one U+FFFD is returned, even when the error spans multiple bytes.

amauryfa commits-noreply at bitbucket.org
Mon Feb 14 10:51:23 CET 2011


Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r41892:ed6b9cdcc22f
Date: 2011-02-14 10:50 +0100
http://bitbucket.org/pypy/pypy/changeset/ed6b9cdcc22f/

Log:	Fix the "replace" error handler: on UnicodeDecodeError, return only
	one U+FFFD, even when the error spans multiple bytes.

diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -415,6 +415,13 @@
         assert codecs.replace_errors(UnicodeTranslateError(
             u"\u3042", 0, 1, "ouch")) == (u"\ufffd", 1)
 
+        assert codecs.replace_errors(UnicodeEncodeError(
+            "ascii", u"\u3042\u3042", 0, 2, "ouch")) == (u"??", 2)
+        assert codecs.replace_errors(UnicodeDecodeError(
+            "ascii", "\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
+        assert codecs.replace_errors(UnicodeTranslateError(
+            u"\u3042\u3042", 0, 2, "ouch")) == (u"\ufffd\ufffd", 2)
+
         class BadStartUnicodeEncodeError(UnicodeEncodeError):
             def __init__(self):
                 UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")

diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -525,6 +525,7 @@
         raises(UnicodeError, "\\".decode, "unicode-escape")
 
         raises(UnicodeError, "\xc2".decode, "utf-8")
+        assert '\xe1\x80'.decode('utf-8', 'replace') == u"\ufffd"
 
     def test_repr_bug(self):
         assert (repr(u'\U00090418\u027d\U000582b9\u54c3\U000fcb6e') == 

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -181,8 +181,10 @@
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
         text = '?' * size
         return space.newtuple([space.wrap(text), w_end])
-    elif (space.isinstance_w(w_exc, space.w_UnicodeDecodeError) or
-          space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+    elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
+        text = u'\ufffd'
+        return space.newtuple([space.wrap(text), w_end])
+    elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
         text = u'\ufffd' * size
         return space.newtuple([space.wrap(text), w_end])
     else:


More information about the Pypy-commit mailing list