[pypy-commit] pypy default: Incremental encoder, first try.
arigo
noreply at buildbot.pypy.org
Mon Aug 1 16:18:39 CEST 2011
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r46148:67f047ef8c7a
Date: 2011-08-01 13:16 +0200
http://bitbucket.org/pypy/pypy/changeset/67f047ef8c7a/
Log: Incremental encoder, first try.
diff --git a/pypy/module/_multibytecodec/__init__.py b/pypy/module/_multibytecodec/__init__.py
--- a/pypy/module/_multibytecodec/__init__.py
+++ b/pypy/module/_multibytecodec/__init__.py
@@ -10,11 +10,11 @@
'MultibyteIncrementalDecoder':
'interp_incremental.MultibyteIncrementalDecoder',
+ 'MultibyteIncrementalEncoder':
+ 'interp_incremental.MultibyteIncrementalEncoder',
}
appleveldefs = {
- 'MultibyteIncrementalEncoder':
- 'app_multibytecodec.MultibyteIncrementalEncoder',
'MultibyteStreamReader':
'app_multibytecodec.MultibyteStreamReader',
'MultibyteStreamWriter':
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -183,9 +183,11 @@
# ____________________________________________________________
# Encoding
ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci)
+pypy_cjk_enc_new = llexternal('pypy_cjk_enc_new',
+ [MULTIBYTECODEC_P], ENCODEBUF_P)
pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init',
- [MULTIBYTECODEC_P, rffi.CWCHARP, rffi.SSIZE_T],
- ENCODEBUF_P)
+ [ENCODEBUF_P, rffi.CWCHARP, rffi.SSIZE_T],
+ rffi.SSIZE_T)
pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P],
lltype.Void)
pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk', [ENCODEBUF_P],
@@ -204,39 +206,46 @@
[ENCODEBUF_P, rffi.CCHARP,
rffi.SSIZE_T, rffi.SSIZE_T],
rffi.SSIZE_T)
+pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
+ [ENCODEBUF_P], MULTIBYTECODEC_P)
def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+ encodebuf = pypy_cjk_enc_new(codec)
+ if not encodebuf:
+ raise MemoryError
+ try:
+ return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+ finally:
+ pypy_cjk_enc_free(encodebuf)
+
+def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+ namecb=None, ignore_error=0):
inleft = len(unicodedata)
inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
try:
- encodebuf = pypy_cjk_enc_init(codec, inbuf, inleft)
- if not encodebuf:
+ if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
raise MemoryError
- try:
- while True:
- r = pypy_cjk_enc_chunk(encodebuf)
- if r == 0:
- break
- multibytecodec_encerror(encodebuf, r, errors,
- codec, errorcb, namecb, unicodedata)
- while True:
- r = pypy_cjk_enc_reset(encodebuf)
- if r == 0:
- break
- multibytecodec_encerror(encodebuf, r, errors,
- codec, errorcb, namecb, unicodedata)
- src = pypy_cjk_enc_outbuf(encodebuf)
- length = pypy_cjk_enc_outlen(encodebuf)
- return rffi.charpsize2str(src, length)
- #
- finally:
- pypy_cjk_enc_free(encodebuf)
+ while True:
+ r = pypy_cjk_enc_chunk(encodebuf)
+ if r == 0 or r == ignore_error:
+ break
+ multibytecodec_encerror(encodebuf, r, errors,
+ errorcb, namecb, unicodedata)
+ while True:
+ r = pypy_cjk_enc_reset(encodebuf)
+ if r == 0:
+ break
+ multibytecodec_encerror(encodebuf, r, errors,
+ errorcb, namecb, unicodedata)
+ src = pypy_cjk_enc_outbuf(encodebuf)
+ length = pypy_cjk_enc_outlen(encodebuf)
+ return rffi.charpsize2str(src, length)
#
finally:
rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
def multibytecodec_encerror(encodebuf, e, errors,
- codec, errorcb, namecb, unicodedata):
+ errorcb, namecb, unicodedata):
if e > 0:
reason = "illegal multibyte sequence"
esize = e
@@ -257,6 +266,7 @@
elif errors == "ignore":
replace = ""
elif errors == "replace":
+ codec = pypy_cjk_enc_getcodec(encodebuf)
try:
replace = encode(codec, u"?")
except EncodeDecodeError:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -1,14 +1,15 @@
from pypy.rpython.lltypesystem import lltype
from pypy.module._multibytecodec import c_codecs
from pypy.module._multibytecodec.interp_multibytecodec import (
- MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror)
+ MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
+ wrap_unicodeencodeerror)
from pypy.interpreter.baseobjspace import Wrappable
from pypy.interpreter.gateway import interp2app, unwrap_spec
from pypy.interpreter.typedef import TypeDef, GetSetProperty
from pypy.module._codecs.interp_codecs import CodecState
-class MultibyteIncrementalDecoder(Wrappable):
+class MultibyteIncrementalBase(Wrappable):
def __init__(self, space, errors):
if errors is None:
@@ -21,6 +22,22 @@
self.name = codec.name
self._initialize()
+ def __del__(self):
+ self._free()
+
+ def reset_w(self):
+ self._free()
+ self._initialize()
+
+ def fget_errors(self, space):
+ return space.wrap(self.errors)
+
+ def fset_errors(self, space, w_errors):
+ self.errors = space.str_w(w_errors)
+
+
+class MultibyteIncrementalDecoder(MultibyteIncrementalBase):
+
def _initialize(self):
self.decodebuf = c_codecs.pypy_cjk_dec_new(self.codec)
self.pending = ""
@@ -31,13 +48,6 @@
c_codecs.pypy_cjk_dec_free(self.decodebuf)
self.decodebuf = lltype.nullptr(c_codecs.DECODEBUF_P.TO)
- def __del__(self):
- self._free()
-
- def reset_w(self):
- self._free()
- self._initialize()
-
@unwrap_spec(object=str, final=bool)
def decode_w(self, object, final=False):
space = self.space
@@ -57,12 +67,6 @@
self.pending = object[pos:]
return space.wrap(output)
- def fget_errors(self, space):
- return space.wrap(self.errors)
-
- def fset_errors(self, space, w_errors):
- self.errors = space.str_w(w_errors)
-
@unwrap_spec(errors="str_or_None")
def mbidecoder_new(space, w_subtype, errors=None):
@@ -81,6 +85,55 @@
)
+class MultibyteIncrementalEncoder(MultibyteIncrementalBase):
+
+ def _initialize(self):
+ self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
+ self.pending = u""
+
+ def _free(self):
+ self.pending = None
+ if self.encodebuf:
+ c_codecs.pypy_cjk_enc_free(self.encodebuf)
+ self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
+
+ @unwrap_spec(object=unicode, final=bool)
+ def encode_w(self, object, final=False):
+ space = self.space
+ state = space.fromcache(CodecState)
+ if len(self.pending) > 0:
+ object = self.pending + object
+ try:
+ output = c_codecs.encodeex(self.encodebuf, object, self.errors,
+ state.encode_error_handler, self.name,
+ get_ignore_error(final))
+ except c_codecs.EncodeDecodeError, e:
+ raise wrap_unicodeencodeerror(space, e, object, self.name)
+ except RuntimeError:
+ raise wrap_runtimeerror(space)
+ pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
+ assert 0 <= pos <= len(object)
+ self.pending = object[pos:]
+ return space.wrap(output)
+
+
+@unwrap_spec(errors="str_or_None")
+def mbiencoder_new(space, w_subtype, errors=None):
+ r = space.allocate_instance(MultibyteIncrementalEncoder, w_subtype)
+ r.__init__(space, errors)
+ return space.wrap(r)
+
+MultibyteIncrementalEncoder.typedef = TypeDef(
+ 'MultibyteIncrementalEncoder',
+ __module__ = '_multibytecodec',
+ __new__ = interp2app(mbiencoder_new),
+ encode = interp2app(MultibyteIncrementalEncoder.encode_w),
+ reset = interp2app(MultibyteIncrementalEncoder.reset_w),
+ errors = GetSetProperty(MultibyteIncrementalEncoder.fget_errors,
+ MultibyteIncrementalEncoder.fset_errors),
+ )
+
+
def get_ignore_error(final):
if final:
return 0 # don't ignore any error
diff --git a/pypy/module/_multibytecodec/test/test_app_incremental.py b/pypy/module/_multibytecodec/test/test_app_incremental.py
--- a/pypy/module/_multibytecodec/test/test_app_incremental.py
+++ b/pypy/module/_multibytecodec/test/test_app_incremental.py
@@ -13,6 +13,15 @@
return IncrementalHzDecoder
""")
+ cls.w_IncrementalHzEncoder = cls.space.appexec([], """():
+ import _codecs_cn
+ from _multibytecodec import MultibyteIncrementalEncoder
+
+ class IncrementalHzEncoder(MultibyteIncrementalEncoder):
+ codec = _codecs_cn.getcodec('hz')
+
+ return IncrementalHzEncoder
+ """)
def test_decode_hz(self):
d = self.IncrementalHzDecoder()
@@ -74,3 +83,14 @@
d.errors = "replace"
r = d.decode("~{abc", True)
assert r == u'\u5f95\ufffd'
+
+ def test_decode_hz_buffer_grow(self):
+ d = self.IncrementalHzDecoder()
+ for i in range(13):
+ r = d.decode("a" * (2**i))
+ assert r == unicode("a" * (2**i))
+
+ def test_encode_hz(self):
+ e = self.IncrementalHzEncoder()
+ r = e.encode("abcd")
+ assert r == u'abcd'
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -119,34 +119,40 @@
/************************************************************/
-struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
- Py_UNICODE *inbuf, Py_ssize_t inlen)
+struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec)
{
- Py_ssize_t outlen;
struct pypy_cjk_enc_s *d = malloc(sizeof(struct pypy_cjk_enc_s));
if (!d)
return NULL;
if (codec->encinit != NULL && codec->encinit(&d->state, codec->config) != 0)
- goto errorexit;
+ {
+ free(d);
+ return NULL;
+ }
+ d->codec = codec;
+ d->outbuf_start = NULL;
+ return d;
+}
- d->codec = codec;
+Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
+ Py_UNICODE *inbuf, Py_ssize_t inlen)
+{
+ Py_ssize_t outlen;
d->inbuf_start = inbuf;
d->inbuf = inbuf;
d->inbuf_end = inbuf + inlen;
-
- if (inlen > (PY_SSIZE_T_MAX - 16) / 2)
- goto errorexit;
- outlen = inlen * 2 + 16;
- d->outbuf_start = malloc(outlen);
- if (!d->outbuf_start)
- goto errorexit;
+ if (d->outbuf_start == NULL)
+ {
+ if (inlen > (PY_SSIZE_T_MAX - 16) / 2)
+ return -1;
+ outlen = inlen * 2 + 16;
+ d->outbuf_start = malloc(outlen);
+ if (d->outbuf_start == NULL)
+ return -1;
+ d->outbuf_end = d->outbuf_start + outlen;
+ }
d->outbuf = d->outbuf_start;
- d->outbuf_end = d->outbuf_start + outlen;
- return d;
-
- errorexit:
- free(d);
- return NULL;
+ return 0;
}
void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d)
@@ -249,3 +255,8 @@
d->inbuf = d->inbuf_start + in_offset;
return 0;
}
+
+const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *d)
+{
+ return d->codec;
+}
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -113,8 +113,9 @@
unsigned char *outbuf_start, *outbuf, *outbuf_end;
};
-struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
- Py_UNICODE *inbuf, Py_ssize_t inlen);
+struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec);
+Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
+ Py_UNICODE *inbuf, Py_ssize_t inlen);
void pypy_cjk_enc_free(struct pypy_cjk_enc_s *);
Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *);
Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *);
@@ -124,6 +125,7 @@
Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d);
Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d,
char *, Py_ssize_t, Py_ssize_t);
+const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *);
/* list of codecs defined in the .c files */
More information about the pypy-commit
mailing list