[pypy-commit] pypy default: decode(errors="ignore") at the C level

Armin Rigo noreply at buildbot.pypy.org
Sun Jun 5 11:14:33 CEST 2011


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r44707:4ad72b733e1f
Date: 2011-06-05 10:52 +0200
http://bitbucket.org/pypy/pypy/changeset/4ad72b733e1f/

Log:	decode(errors="ignore") at the C level

diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -103,8 +103,10 @@
                                           [DECODEBUF_P], rffi.SSIZE_T)
 pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed',
                                          [DECODEBUF_P], rffi.SSIZE_T)
+pypy_cjk_dec_inbuf_add = llexternal('pypy_cjk_dec_inbuf_add',
+                                    [DECODEBUF_P, rffi.SSIZE_T], lltype.Void)
 
-def decode(codec, stringdata):
+def decode(codec, stringdata, errors="strict"):
     inleft = len(stringdata)
     inbuf = rffi.get_nonmovingbuffer(stringdata)
     try:
@@ -112,10 +114,11 @@
         if not decodebuf:
             raise MemoryError
         try:
-            r = pypy_cjk_dec_chunk(decodebuf)
-            if r != 0:
-                multibytecodec_decerror(decodebuf, r)
-                assert False
+            while True:
+                r = pypy_cjk_dec_chunk(decodebuf)
+                if r == 0:
+                    break
+                multibytecodec_decerror(decodebuf, r, errors)
             src = pypy_cjk_dec_outbuf(decodebuf)
             length = pypy_cjk_dec_outlen(decodebuf)
             return rffi.wcharpsize2unicode(src, length)
@@ -126,7 +129,7 @@
     finally:
         rffi.free_nonmovingbuffer(stringdata, inbuf)
 
-def multibytecodec_decerror(decodebuf, e):
+def multibytecodec_decerror(decodebuf, e, errors):
     if e > 0:
         reason = "illegal multibyte sequence"
         esize = e
@@ -139,7 +142,9 @@
         raise RuntimeError
     #
     # if errors == ERROR_REPLACE:...
-    # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:...
+    if errors == "ignore":   # or errors == ERROR_REPLACE
+        pypy_cjk_dec_inbuf_add(decodebuf, esize)
+        return     # continue decoding
     start = pypy_cjk_dec_inbuf_consumed(decodebuf)
     end = start + esize
     if 1:  # errors == ERROR_STRICT:
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -36,6 +36,11 @@
     assert e.end == 4
     assert e.reason == "illegal multibyte sequence"
 
+def test_decode_hz_ignore():
+    c = getcodec("hz")
+    u = decode(c, 'def~{}abc', 'ignore')
+    assert u == u'def\u5fcf'
+
 def test_encode_hz():
     c = getcodec("hz")
     s = encode(c, u'foobar')
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -93,6 +93,11 @@
   return d->inbuf - d->inbuf_start;
 }
 
+void pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s* d, Py_ssize_t skip)
+{
+  d->inbuf += skip;
+}
+
 /************************************************************/
 
 struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -102,6 +102,7 @@
 Py_ssize_t pypy_cjk_dec_outlen(struct pypy_cjk_dec_s *);
 Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d);
 Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d);
+void pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s*, Py_ssize_t);
 
 struct pypy_cjk_enc_s {
   const MultibyteCodec *codec;


More information about the pypy-commit mailing list