[pypy-commit] pypy default: Incremental encoder, first try.

arigo noreply at buildbot.pypy.org
Mon Aug 1 16:18:39 CEST 2011


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r46148:67f047ef8c7a
Date: 2011-08-01 13:16 +0200
http://bitbucket.org/pypy/pypy/changeset/67f047ef8c7a/

Log:	Incremental encoder, first try.

diff --git a/pypy/module/_multibytecodec/__init__.py b/pypy/module/_multibytecodec/__init__.py
--- a/pypy/module/_multibytecodec/__init__.py
+++ b/pypy/module/_multibytecodec/__init__.py
@@ -10,11 +10,11 @@
 
         'MultibyteIncrementalDecoder':
             'interp_incremental.MultibyteIncrementalDecoder',
+        'MultibyteIncrementalEncoder':
+            'interp_incremental.MultibyteIncrementalEncoder',
     }
 
     appleveldefs = {
-        'MultibyteIncrementalEncoder':
-            'app_multibytecodec.MultibyteIncrementalEncoder',
         'MultibyteStreamReader':
             'app_multibytecodec.MultibyteStreamReader',
         'MultibyteStreamWriter':
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -183,9 +183,11 @@
 # ____________________________________________________________
 # Encoding
 ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci)
+pypy_cjk_enc_new = llexternal('pypy_cjk_enc_new',
+                               [MULTIBYTECODEC_P], ENCODEBUF_P)
 pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init',
-                               [MULTIBYTECODEC_P, rffi.CWCHARP, rffi.SSIZE_T],
-                               ENCODEBUF_P)
+                               [ENCODEBUF_P, rffi.CWCHARP, rffi.SSIZE_T],
+                               rffi.SSIZE_T)
 pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P],
                                lltype.Void)
 pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk', [ENCODEBUF_P],
@@ -204,39 +206,46 @@
                                            [ENCODEBUF_P, rffi.CCHARP,
                                             rffi.SSIZE_T, rffi.SSIZE_T],
                                            rffi.SSIZE_T)
+pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
+                                   [ENCODEBUF_P], MULTIBYTECODEC_P)
 
 def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+    encodebuf = pypy_cjk_enc_new(codec)
+    if not encodebuf:
+        raise MemoryError
+    try:
+        return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+    finally:
+        pypy_cjk_enc_free(encodebuf)
+
+def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+             namecb=None, ignore_error=0):
     inleft = len(unicodedata)
     inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
     try:
-        encodebuf = pypy_cjk_enc_init(codec, inbuf, inleft)
-        if not encodebuf:
+        if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
             raise MemoryError
-        try:
-            while True:
-                r = pypy_cjk_enc_chunk(encodebuf)
-                if r == 0:
-                    break
-                multibytecodec_encerror(encodebuf, r, errors,
-                                        codec, errorcb, namecb, unicodedata)
-            while True:
-                r = pypy_cjk_enc_reset(encodebuf)
-                if r == 0:
-                    break
-                multibytecodec_encerror(encodebuf, r, errors,
-                                        codec, errorcb, namecb, unicodedata)
-            src = pypy_cjk_enc_outbuf(encodebuf)
-            length = pypy_cjk_enc_outlen(encodebuf)
-            return rffi.charpsize2str(src, length)
-        #
-        finally:
-            pypy_cjk_enc_free(encodebuf)
+        while True:
+            r = pypy_cjk_enc_chunk(encodebuf)
+            if r == 0 or r == ignore_error:
+                break
+            multibytecodec_encerror(encodebuf, r, errors,
+                                    errorcb, namecb, unicodedata)
+        while True:
+            r = pypy_cjk_enc_reset(encodebuf)
+            if r == 0:
+                break
+            multibytecodec_encerror(encodebuf, r, errors,
+                                    errorcb, namecb, unicodedata)
+        src = pypy_cjk_enc_outbuf(encodebuf)
+        length = pypy_cjk_enc_outlen(encodebuf)
+        return rffi.charpsize2str(src, length)
     #
     finally:
         rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
 
 def multibytecodec_encerror(encodebuf, e, errors,
-                            codec, errorcb, namecb, unicodedata):
+                            errorcb, namecb, unicodedata):
     if e > 0:
         reason = "illegal multibyte sequence"
         esize = e
@@ -257,6 +266,7 @@
     elif errors == "ignore":
         replace = ""
     elif errors == "replace":
+        codec = pypy_cjk_enc_getcodec(encodebuf)
         try:
             replace = encode(codec, u"?")
         except EncodeDecodeError:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -1,14 +1,15 @@
 from pypy.rpython.lltypesystem import lltype
 from pypy.module._multibytecodec import c_codecs
 from pypy.module._multibytecodec.interp_multibytecodec import (
-    MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror)
+    MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
+    wrap_unicodeencodeerror)
 from pypy.interpreter.baseobjspace import Wrappable
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.typedef import TypeDef, GetSetProperty
 from pypy.module._codecs.interp_codecs import CodecState
 
 
-class MultibyteIncrementalDecoder(Wrappable):
+class MultibyteIncrementalBase(Wrappable):
 
     def __init__(self, space, errors):
         if errors is None:
@@ -21,6 +22,22 @@
         self.name = codec.name
         self._initialize()
 
+    def __del__(self):
+        self._free()
+
+    def reset_w(self):
+        self._free()
+        self._initialize()
+
+    def fget_errors(self, space):
+        return space.wrap(self.errors)
+
+    def fset_errors(self, space, w_errors):
+        self.errors = space.str_w(w_errors)
+
+
+class MultibyteIncrementalDecoder(MultibyteIncrementalBase):
+
     def _initialize(self):
         self.decodebuf = c_codecs.pypy_cjk_dec_new(self.codec)
         self.pending = ""
@@ -31,13 +48,6 @@
             c_codecs.pypy_cjk_dec_free(self.decodebuf)
             self.decodebuf = lltype.nullptr(c_codecs.DECODEBUF_P.TO)
 
-    def __del__(self):
-        self._free()
-
-    def reset_w(self):
-        self._free()
-        self._initialize()
-
     @unwrap_spec(object=str, final=bool)
     def decode_w(self, object, final=False):
         space = self.space
@@ -57,12 +67,6 @@
         self.pending = object[pos:]
         return space.wrap(output)
 
-    def fget_errors(self, space):
-        return space.wrap(self.errors)
-
-    def fset_errors(self, space, w_errors):
-        self.errors = space.str_w(w_errors)
-
 
 @unwrap_spec(errors="str_or_None")
 def mbidecoder_new(space, w_subtype, errors=None):
@@ -81,6 +85,55 @@
     )
 
 
+class MultibyteIncrementalEncoder(MultibyteIncrementalBase):
+
+    def _initialize(self):
+        self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
+        self.pending = u""
+
+    def _free(self):
+        self.pending = None
+        if self.encodebuf:
+            c_codecs.pypy_cjk_enc_free(self.encodebuf)
+            self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
+
+    @unwrap_spec(object=unicode, final=bool)
+    def encode_w(self, object, final=False):
+        space = self.space
+        state = space.fromcache(CodecState)
+        if len(self.pending) > 0:
+            object = self.pending + object
+        try:
+            output = c_codecs.encodeex(self.encodebuf, object, self.errors,
+                                       state.encode_error_handler, self.name,
+                                       get_ignore_error(final))
+        except c_codecs.EncodeDecodeError, e:
+            raise wrap_unicodeencodeerror(space, e, object, self.name)
+        except RuntimeError:
+            raise wrap_runtimeerror(space)
+        pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
+        assert 0 <= pos <= len(object)
+        self.pending = object[pos:]
+        return space.wrap(output)
+
+
+@unwrap_spec(errors="str_or_None")
+def mbiencoder_new(space, w_subtype, errors=None):
+    r = space.allocate_instance(MultibyteIncrementalEncoder, w_subtype)
+    r.__init__(space, errors)
+    return space.wrap(r)
+
+MultibyteIncrementalEncoder.typedef = TypeDef(
+    'MultibyteIncrementalEncoder',
+    __module__ = '_multibytecodec',
+    __new__ = interp2app(mbiencoder_new),
+    encode  = interp2app(MultibyteIncrementalEncoder.encode_w),
+    reset   = interp2app(MultibyteIncrementalEncoder.reset_w),
+    errors  = GetSetProperty(MultibyteIncrementalEncoder.fget_errors,
+                             MultibyteIncrementalEncoder.fset_errors),
+    )
+
+
 def get_ignore_error(final):
     if final:
         return 0    # don't ignore any error
diff --git a/pypy/module/_multibytecodec/test/test_app_incremental.py b/pypy/module/_multibytecodec/test/test_app_incremental.py
--- a/pypy/module/_multibytecodec/test/test_app_incremental.py
+++ b/pypy/module/_multibytecodec/test/test_app_incremental.py
@@ -13,6 +13,15 @@
 
             return IncrementalHzDecoder
         """)
+        cls.w_IncrementalHzEncoder = cls.space.appexec([], """():
+            import _codecs_cn
+            from _multibytecodec import MultibyteIncrementalEncoder
+
+            class IncrementalHzEncoder(MultibyteIncrementalEncoder):
+                codec = _codecs_cn.getcodec('hz')
+
+            return IncrementalHzEncoder
+        """)
 
     def test_decode_hz(self):
         d = self.IncrementalHzDecoder()
@@ -74,3 +83,14 @@
         d.errors = "replace"
         r = d.decode("~{abc", True)
         assert r == u'\u5f95\ufffd'
+
+    def test_decode_hz_buffer_grow(self):
+        d = self.IncrementalHzDecoder()
+        for i in range(13):
+            r = d.decode("a" * (2**i))
+            assert r == unicode("a" * (2**i))
+
+    def test_encode_hz(self):
+        e = self.IncrementalHzEncoder()
+        r = e.encode("abcd")
+        assert r == u'abcd'
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -119,34 +119,40 @@
 
 /************************************************************/
 
-struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
-                                         Py_UNICODE *inbuf, Py_ssize_t inlen)
+struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec)
 {
-  Py_ssize_t outlen;
   struct pypy_cjk_enc_s *d = malloc(sizeof(struct pypy_cjk_enc_s));
   if (!d)
     return NULL;
   if (codec->encinit != NULL && codec->encinit(&d->state, codec->config) != 0)
-    goto errorexit;
+    {
+      free(d);
+      return NULL;
+    }
+  d->codec = codec;
+  d->outbuf_start = NULL;
+  return d;
+}
 
-  d->codec = codec;
+Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
+                             Py_UNICODE *inbuf, Py_ssize_t inlen)
+{
+  Py_ssize_t outlen;
   d->inbuf_start = inbuf;
   d->inbuf = inbuf;
   d->inbuf_end = inbuf + inlen;
-
-  if (inlen > (PY_SSIZE_T_MAX - 16) / 2)
-    goto errorexit;
-  outlen = inlen * 2 + 16;
-  d->outbuf_start = malloc(outlen);
-  if (!d->outbuf_start)
-    goto errorexit;
+  if (d->outbuf_start == NULL)
+    {
+      if (inlen > (PY_SSIZE_T_MAX - 16) / 2)
+        return -1;
+      outlen = inlen * 2 + 16;
+      d->outbuf_start = malloc(outlen);
+      if (d->outbuf_start == NULL)
+        return -1;
+      d->outbuf_end = d->outbuf_start + outlen;
+    }
   d->outbuf = d->outbuf_start;
-  d->outbuf_end = d->outbuf_start + outlen;
-  return d;
-
- errorexit:
-  free(d);
-  return NULL;
+  return 0;
 }
 
 void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d)
@@ -249,3 +255,8 @@
   d->inbuf = d->inbuf_start + in_offset;
   return 0;
 }
+
+const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *d)
+{
+  return d->codec;
+}
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -113,8 +113,9 @@
   unsigned char *outbuf_start, *outbuf, *outbuf_end;
 };
 
-struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
-                                         Py_UNICODE *inbuf, Py_ssize_t inlen);
+struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec);
+Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
+                             Py_UNICODE *inbuf, Py_ssize_t inlen);
 void pypy_cjk_enc_free(struct pypy_cjk_enc_s *);
 Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *);
 Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *);
@@ -124,6 +125,7 @@
 Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d);
 Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d,
                                          char *, Py_ssize_t, Py_ssize_t);
+const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *);
 
 /* list of codecs defined in the .c files */
 


More information about the pypy-commit mailing list