[pypy-commit] pypy default: errors="replace" in decode.

Armin Rigo noreply at buildbot.pypy.org
Sun Jun 5 11:14:36 CEST 2011


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r44709:ab73d694925f
Date: 2011-06-05 11:06 +0200
http://bitbucket.org/pypy/pypy/changeset/ab73d694925f/

Log:	errors="replace" in decode.

diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -104,7 +104,8 @@
 pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed',
                                          [DECODEBUF_P], rffi.SSIZE_T)
 pypy_cjk_dec_inbuf_add = llexternal('pypy_cjk_dec_inbuf_add',
-                                    [DECODEBUF_P, rffi.SSIZE_T], lltype.Void)
+                                    [DECODEBUF_P, rffi.SSIZE_T, rffi.INT],
+                                    rffi.INT)
 
 def decode(codec, stringdata, errors="strict"):
     inleft = len(stringdata)
@@ -141,9 +142,13 @@
     else:
         raise RuntimeError
     #
-    # if errors == ERROR_REPLACE:...
-    if errors == "ignore":   # or errors == ERROR_REPLACE
-        pypy_cjk_dec_inbuf_add(decodebuf, esize)
+    if errors == "ignore":
+        pypy_cjk_dec_inbuf_add(decodebuf, esize, 0)
+        return     # continue decoding
+    if errors == "replace":
+        e = pypy_cjk_dec_inbuf_add(decodebuf, esize, 1)
+        if e == MBERR_NOMEMORY:
+            raise MemoryError
         return     # continue decoding
     start = pypy_cjk_dec_inbuf_consumed(decodebuf)
     end = start + esize
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -44,6 +44,14 @@
         r = codec.decode("def~{}abc", 'ignore')
         assert r == (u'def\u5fcf', 9)
 
+    def test_decode_hz_replace(self):
+        import _codecs_cn
+        codec = _codecs_cn.getcodec("hz")
+        r = codec.decode("def~{}abc", errors='replace')
+        assert r == (u'def\ufffd\u5fcf', 9)
+        r = codec.decode("def~{}abc", 'replace')
+        assert r == (u'def\ufffd\u5fcf', 9)
+
     def test_encode_hz(self):
         import _codecs_cn
         codec = _codecs_cn.getcodec("hz")
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -41,6 +41,11 @@
     u = decode(c, 'def~{}abc', 'ignore')
     assert u == u'def\u5fcf'
 
+def test_decode_hz_replace():
+    c = getcodec("hz")
+    u = decode(c, 'def~{}abc', 'replace')
+    assert u == u'def\ufffd\u5fcf'
+
 def test_encode_hz():
     c = getcodec("hz")
     s = encode(c, u'foobar')
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -1,6 +1,8 @@
 #include <stdlib.h>
 #include "src/cjkcodecs/multibytecodec.h"
 
+#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
+
 
 struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec,
                                          char *inbuf, Py_ssize_t inlen)
@@ -93,9 +95,18 @@
   return d->inbuf - d->inbuf_start;
 }
 
-void pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s* d, Py_ssize_t skip)
+int pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s* d, Py_ssize_t skip,
+                           int add_replacement_character)
 {
+  if (add_replacement_character)
+    {
+      if (d->outbuf >= d->outbuf_end)
+        if (expand_decodebuffer(d, 1) == -1)
+          return MBERR_NOMEMORY;
+      *d->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER;
+    }
   d->inbuf += skip;
+  return 0;
 }
 
 /************************************************************/
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -102,7 +102,7 @@
 Py_ssize_t pypy_cjk_dec_outlen(struct pypy_cjk_dec_s *);
 Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d);
 Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d);
-void pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s*, Py_ssize_t);
+int pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s*, Py_ssize_t, int);
 
 struct pypy_cjk_enc_s {
   const MultibyteCodec *codec;


More information about the pypy-commit mailing list