[pypy-commit] pypy default: Custom encode error handlers.

arigo noreply at buildbot.pypy.org
Sun Jun 5 17:22:47 CEST 2011


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r44721:be2600cf63a3
Date: 2011-06-05 17:10 +0200
http://bitbucket.org/pypy/pypy/changeset/be2600cf63a3/

Log:	Custom encode error handlers.

diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -176,11 +176,12 @@
                                           [ENCODEBUF_P], rffi.SSIZE_T)
 pypy_cjk_enc_inbuf_consumed = llexternal('pypy_cjk_enc_inbuf_consumed',
                                          [ENCODEBUF_P], rffi.SSIZE_T)
-pypy_cjk_enc_inbuf_add = llexternal('pypy_cjk_enc_inbuf_add',
-                                    [ENCODEBUF_P, rffi.SSIZE_T, rffi.INT],
-                                    rffi.INT)
+pypy_cjk_enc_replace_on_error = llexternal('pypy_cjk_enc_replace_on_error',
+                                           [ENCODEBUF_P, rffi.CCHARP,
+                                            rffi.SSIZE_T, rffi.SSIZE_T],
+                                           rffi.SSIZE_T)
 
-def encode(codec, unicodedata, errors="strict"):
+def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
     inleft = len(unicodedata)
     inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
     try:
@@ -192,12 +193,14 @@
                 r = pypy_cjk_enc_chunk(encodebuf)
                 if r == 0:
                     break
-                multibytecodec_encerror(encodebuf, r, errors)
+                multibytecodec_encerror(encodebuf, r, errors,
+                                        codec, errorcb, namecb, unicodedata)
             while True:
                 r = pypy_cjk_enc_reset(encodebuf)
                 if r == 0:
                     break
-                multibytecodec_encerror(encodebuf, r, errors)
+                multibytecodec_encerror(encodebuf, r, errors,
+                                        codec, errorcb, namecb, unicodedata)
             src = pypy_cjk_enc_outbuf(encodebuf)
             length = pypy_cjk_enc_outlen(encodebuf)
             return rffi.charpsize2str(src, length)
@@ -208,7 +211,8 @@
     finally:
         rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
 
-def multibytecodec_encerror(encodebuf, e, errors):
+def multibytecodec_encerror(encodebuf, e, errors,
+                            codec, errorcb, namecb, unicodedata):
     if e > 0:
         reason = "illegal multibyte sequence"
         esize = e
@@ -220,16 +224,27 @@
     else:
         raise RuntimeError
     #
-    if errors == 'ignore':
-        pypy_cjk_enc_inbuf_add(encodebuf, esize, rffi.cast(rffi.INT, 0))
-        return     # continue encoding
-    if errors == "replace":
-        e = pypy_cjk_enc_inbuf_add(encodebuf, esize, rffi.cast(rffi.INT, 1))
-        if rffi.cast(lltype.Signed, e) == MBERR_NOMEMORY:
-            raise MemoryError
-        return     # continue decoding
+    # compute the string to use as a replacement -> 'replace', and
+    # the position in 'unicodedata' where encoding resumes -> 'end'
     start = pypy_cjk_enc_inbuf_consumed(encodebuf)
     end = start + esize
-    if errors != "strict":
-        reason = "not implemented: custom error handlers"   # XXX implement me
-    raise EncodeDecodeError(start, end, reason)
+    if errors == "strict":
+        raise EncodeDecodeError(start, end, reason)
+    elif errors == "ignore":
+        replace = ""
+    elif errors == "replace":
+        try:
+            replace = encode(codec, u"?")
+        except EncodeDecodeError:
+            replace = "?"
+    else:
+        assert errorcb != None
+        replace, end = errorcb(errors, namecb, reason,
+                               unicodedata, start, end)
+    inbuf = rffi.get_nonmovingbuffer(replace)
+    try:
+        r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
+    finally:
+        rffi.free_nonmovingbuffer(replace, inbuf)
+    if r == MBERR_NOMEMORY:
+        raise MemoryError
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -3,6 +3,7 @@
 from pypy.interpreter.typedef import TypeDef
 from pypy.interpreter.error import OperationError
 from pypy.module._multibytecodec import c_codecs
+from pypy.module._codecs.interp_codecs import CodecState
 
 
 class MultibyteCodec(Wrappable):
@@ -37,9 +38,11 @@
     def encode(self, space, input, errors=None):
         if errors is None:
             errors = 'strict'
+        state = space.fromcache(CodecState)
         #
         try:
-            output = c_codecs.encode(self.codec, input, errors)
+            output = c_codecs.encode(self.codec, input, errors,
+                                     state.encode_error_handler, self.name)
         except c_codecs.EncodeDecodeError, e:
             raise OperationError(
                 space.w_UnicodeEncodeError,
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -84,3 +84,10 @@
         r = codec.encode(u'abc\u1234def', 'replace')
         assert r == ('abc?def', 7)
         assert type(r[0]) is str
+
+    def test_encode_custom_error_handler(self):
+        import codecs
+        codecs.register_error("test.multi_bad_handler", lambda e: (repl, 1))
+        repl = u"\u2014"
+        s = u"\uDDA1".encode("gbk", "test.multi_bad_handler")
+        assert s == '\xA1\xAA'
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -46,14 +46,6 @@
     u = decode(c, 'def~{}abc', 'replace')
     assert u == u'def\ufffd\u5fcf'
 
-def test_decode_hz_foobar():
-    # not implemented yet: custom error handlers
-    c = getcodec("hz")
-    e = py.test.raises(EncodeDecodeError, decode, c, "~{xyz}", "foobar").value
-    assert e.start == 2
-    assert e.end == 4
-    assert e.reason == "not implemented: custom error handlers"
-
 def test_encode_hz():
     c = getcodec("hz")
     s = encode(c, u'foobar')
@@ -79,15 +71,6 @@
     s = encode(c, u'abc\u1234def', 'replace')
     assert s == 'abc?def'
 
-def test_encode_hz_foobar():
-    # not implemented yet: custom error handlers
-    c = getcodec("hz")
-    e = py.test.raises(EncodeDecodeError, encode,
-                       c, u'abc\u1234def', 'foobar').value
-    assert e.start == 3
-    assert e.end == 4
-    assert e.reason == "not implemented: custom error handlers"
-
 def test_encode_jisx0208():
     c = getcodec('iso2022_jp')
     s = encode(c, u'\u83ca\u5730\u6642\u592b')
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -226,33 +226,18 @@
   return d->inbuf - d->inbuf_start;
 }
 
-int pypy_cjk_enc_inbuf_add(struct pypy_cjk_enc_s* d, Py_ssize_t skip,
-                           int add_replacement_character)
+Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d,
+                                         char *newbuf, Py_ssize_t newlen,
+                                         Py_ssize_t in_offset)
 {
-  if (add_replacement_character)
+  if (newlen > 0)
     {
-      const Py_UNICODE replchar = '?', *inbuf = &replchar;
-      Py_ssize_t r;
-
-      while (1)
-        {
-          Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
-          r = d->codec->encode(&d->state, d->codec->config,
-                               &inbuf, 1, &d->outbuf, outleft, 0);
-          if (r != MBERR_TOOSMALL)
-            break;
-          /* output buffer too small; grow it and continue. */
-          if (expand_encodebuffer(d, -1) == -1)
-            return MBERR_NOMEMORY;
-        }
-      if (r != 0)
-        {
-          if (d->outbuf >= d->outbuf_end)
-            if (expand_encodebuffer(d, 1) == -1)
-              return MBERR_NOMEMORY;
-          *d->outbuf++ = '?';
-        }
+      if (d->outbuf + newlen > d->outbuf_end)
+        if (expand_encodebuffer(d, newlen) == -1)
+          return MBERR_NOMEMORY;
+      memcpy(d->outbuf, newbuf, newlen);
+      d->outbuf += newlen;
     }
-  d->inbuf += skip;
+  d->inbuf = d->inbuf_start + in_offset;
   return 0;
 }
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -120,7 +120,8 @@
 Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *);
 Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d);
 Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d);
-int pypy_cjk_enc_inbuf_add(struct pypy_cjk_enc_s*, Py_ssize_t, int);
+Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d,
+                                         char *, Py_ssize_t, Py_ssize_t);
 
 /* list of codecs defined in the .c files */
 


More information about the pypy-commit mailing list