[pypy-commit] pypy default: Implement errors for encode() too.

Sun Jun 5 11:35:19 CEST 2011

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r44712:8f59a7c650d7
Date: 2011-06-05 11:33 +0200
http://bitbucket.org/pypy/pypy/changeset/8f59a7c650d7/

Log:	Implement errors for encode() too.

diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -147,7 +147,7 @@
         return     # continue decoding
     if errors == "replace":
         e = pypy_cjk_dec_inbuf_add(decodebuf, esize, 1)
-        if e == MBERR_NOMEMORY:
+        if rffi.cast(lltype.Signed, e) == MBERR_NOMEMORY:
             raise MemoryError
         return     # continue decoding
     start = pypy_cjk_dec_inbuf_consumed(decodebuf)
@@ -176,8 +176,11 @@
                                           [ENCODEBUF_P], rffi.SSIZE_T)
 pypy_cjk_enc_inbuf_consumed = llexternal('pypy_cjk_enc_inbuf_consumed',
                                          [ENCODEBUF_P], rffi.SSIZE_T)
+pypy_cjk_enc_inbuf_add = llexternal('pypy_cjk_enc_inbuf_add',
+                                    [ENCODEBUF_P, rffi.SSIZE_T, rffi.INT],
+                                    rffi.INT)
 
-def encode(codec, unicodedata):
+def encode(codec, unicodedata, errors="strict"):
     inleft = len(unicodedata)
     inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
     try:
@@ -185,14 +188,16 @@
         if not encodebuf:
             raise MemoryError
         try:
-            r = pypy_cjk_enc_chunk(encodebuf)
-            if r != 0:
-                multibytecodec_encerror(encodebuf, r)
-                assert False
-            r = pypy_cjk_enc_reset(encodebuf)
-            if r != 0:
-                multibytecodec_encerror(encodebuf, r)
-                assert False
+            while True:
+                r = pypy_cjk_enc_chunk(encodebuf)
+                if r == 0:
+                    break
+                multibytecodec_encerror(encodebuf, r, errors)
+            while True:
+                r = pypy_cjk_enc_reset(encodebuf)
+                if r == 0:
+                    break
+                multibytecodec_encerror(encodebuf, r, errors)
             src = pypy_cjk_enc_outbuf(encodebuf)
             length = pypy_cjk_enc_outlen(encodebuf)
             return rffi.charpsize2str(src, length)
@@ -203,7 +208,7 @@
     finally:
         rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
 
-def multibytecodec_encerror(encodebuf, e):
+def multibytecodec_encerror(encodebuf, e, errors):
     if e > 0:
         reason = "illegal multibyte sequence"
         esize = e
@@ -215,9 +220,16 @@
     else:
         raise RuntimeError
     #
-    # if errors == ERROR_REPLACE:...
-    # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:...
+    if errors == 'ignore':
+        pypy_cjk_enc_inbuf_add(encodebuf, esize, 0)
+        return     # continue encoding
+    if errors == "replace":
+        e = pypy_cjk_enc_inbuf_add(encodebuf, esize, 1)
+        if rffi.cast(lltype.Signed, e) == MBERR_NOMEMORY:
+            raise MemoryError
+        return     # continue encoding
     start = pypy_cjk_enc_inbuf_consumed(encodebuf)
     end = start + esize
-    if 1:  # errors == ERROR_STRICT:
-        raise EncodeDecodeError(start, end, reason)
+    if errors != "strict":
+        reason = "not implemented: custom error handlers"   # XXX implement me
+    raise EncodeDecodeError(start, end, reason)
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -35,13 +35,11 @@
 
     @unwrap_spec(input=unicode, errors="str_or_None")
     def encode(self, space, input, errors=None):
-        if errors is not None and errors != 'strict':
-            raise OperationError(space.w_NotImplementedError,    # XXX
-                                 space.wrap("errors='%s' in _multibytecodec"
-                                            % errors))
+        if errors is None:
+            errors = 'strict'
         #
         try:
-            output = c_codecs.encode(self.codec, input)
+            output = c_codecs.encode(self.codec, input, errors)
         except c_codecs.EncodeDecodeError, e:
             raise OperationError(
                 space.w_UnicodeEncodeError,
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -70,3 +70,17 @@
         assert e.start == 3
         assert e.end == 4
         assert e.reason == 'illegal multibyte sequence'
+
+    def test_encode_hz_ignore(self):
+        import _codecs_cn
+        codec = _codecs_cn.getcodec("hz")
+        r = codec.encode(u'abc\u1234def', 'ignore')
+        assert r == ('abcdef', 7)
+        assert type(r[0]) is str
+
+    def test_encode_hz_replace(self):
+        import _codecs_cn
+        codec = _codecs_cn.getcodec("hz")
+        r = codec.encode(u'abc\u1234def', 'replace')
+        assert r == ('abc?def', 7)
+        assert type(r[0]) is str
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -69,6 +69,25 @@
     assert e.end == 4
     assert e.reason == "illegal multibyte sequence"
 
+def test_encode_hz_ignore():
+    c = getcodec("hz")
+    s = encode(c, u'abc\u1234def', 'ignore')
+    assert s == 'abcdef'
+
+def test_encode_hz_replace():
+    c = getcodec("hz")
+    s = encode(c, u'abc\u1234def', 'replace')
+    assert s == 'abc?def'
+
+def test_encode_hz_foobar():
+    # not implemented yet: custom error handlers
+    c = getcodec("hz")
+    e = py.test.raises(EncodeDecodeError, encode,
+                       c, u'abc\u1234def', 'foobar').value
+    assert e.start == 3
+    assert e.end == 4
+    assert e.reason == "not implemented: custom error handlers"
+
 def test_encode_jisx0208():
     c = getcodec('iso2022_jp')
     s = encode(c, u'\u83ca\u5730\u6642\u592b')
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -225,3 +225,34 @@
 {
   return d->inbuf - d->inbuf_start;
 }
+
+int pypy_cjk_enc_inbuf_add(struct pypy_cjk_enc_s* d, Py_ssize_t skip,
+                           int add_replacement_character)
+{
+  if (add_replacement_character)
+    {
+      const Py_UNICODE replchar = '?', *inbuf = &replchar;
+      Py_ssize_t r;
+
+      while (1)
+        {
+          Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
+          r = d->codec->encode(&d->state, d->codec->config,
+                               &inbuf, 1, &d->outbuf, outleft, 0);
+          if (r != MBERR_TOOSMALL)
+            break;
+          /* output buffer too small; grow it and continue. */
+          if (expand_encodebuffer(d, -1) == -1)
+            return MBERR_NOMEMORY;
+        }
+      if (r != 0)
+        {
+          if (d->outbuf >= d->outbuf_end)
+            if (expand_encodebuffer(d, 1) == -1)
+              return MBERR_NOMEMORY;
+          *d->outbuf++ = '?';
+        }
+    }
+  d->inbuf += skip;
+  return 0;
+}
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -120,6 +120,7 @@
 Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *);
 Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d);
 Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d);
+int pypy_cjk_enc_inbuf_add(struct pypy_cjk_enc_s*, Py_ssize_t, int);
 
 /* list of codecs defined in the .c files */