[pypy-commit] pypy default: Implement errors for encode() too.
Armin Rigo
noreply at buildbot.pypy.org
Sun Jun 5 11:35:19 CEST 2011
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r44712:8f59a7c650d7
Date: 2011-06-05 11:33 +0200
http://bitbucket.org/pypy/pypy/changeset/8f59a7c650d7/
Log: Implement errors for encode() too.
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -147,7 +147,7 @@
return # continue decoding
if errors == "replace":
e = pypy_cjk_dec_inbuf_add(decodebuf, esize, 1)
- if e == MBERR_NOMEMORY:
+ if rffi.cast(lltype.Signed, e) == MBERR_NOMEMORY:
raise MemoryError
return # continue decoding
start = pypy_cjk_dec_inbuf_consumed(decodebuf)
@@ -176,8 +176,11 @@
[ENCODEBUF_P], rffi.SSIZE_T)
pypy_cjk_enc_inbuf_consumed = llexternal('pypy_cjk_enc_inbuf_consumed',
[ENCODEBUF_P], rffi.SSIZE_T)
+pypy_cjk_enc_inbuf_add = llexternal('pypy_cjk_enc_inbuf_add',
+ [ENCODEBUF_P, rffi.SSIZE_T, rffi.INT],
+ rffi.INT)
-def encode(codec, unicodedata):
+def encode(codec, unicodedata, errors="strict"):
inleft = len(unicodedata)
inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
try:
@@ -185,14 +188,16 @@
if not encodebuf:
raise MemoryError
try:
- r = pypy_cjk_enc_chunk(encodebuf)
- if r != 0:
- multibytecodec_encerror(encodebuf, r)
- assert False
- r = pypy_cjk_enc_reset(encodebuf)
- if r != 0:
- multibytecodec_encerror(encodebuf, r)
- assert False
+ while True:
+ r = pypy_cjk_enc_chunk(encodebuf)
+ if r == 0:
+ break
+ multibytecodec_encerror(encodebuf, r, errors)
+ while True:
+ r = pypy_cjk_enc_reset(encodebuf)
+ if r == 0:
+ break
+ multibytecodec_encerror(encodebuf, r, errors)
src = pypy_cjk_enc_outbuf(encodebuf)
length = pypy_cjk_enc_outlen(encodebuf)
return rffi.charpsize2str(src, length)
@@ -203,7 +208,7 @@
finally:
rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
-def multibytecodec_encerror(encodebuf, e):
+def multibytecodec_encerror(encodebuf, e, errors):
if e > 0:
reason = "illegal multibyte sequence"
esize = e
@@ -215,9 +220,16 @@
else:
raise RuntimeError
#
- # if errors == ERROR_REPLACE:...
- # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:...
+ if errors == 'ignore':
+ pypy_cjk_enc_inbuf_add(encodebuf, esize, 0)
+ return # continue encoding
+ if errors == "replace":
+ e = pypy_cjk_enc_inbuf_add(encodebuf, esize, 1)
+ if rffi.cast(lltype.Signed, e) == MBERR_NOMEMORY:
+ raise MemoryError
+ return # continue encoding
start = pypy_cjk_enc_inbuf_consumed(encodebuf)
end = start + esize
- if 1: # errors == ERROR_STRICT:
- raise EncodeDecodeError(start, end, reason)
+ if errors != "strict":
+ reason = "not implemented: custom error handlers" # XXX implement me
+ raise EncodeDecodeError(start, end, reason)
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -35,13 +35,11 @@
@unwrap_spec(input=unicode, errors="str_or_None")
def encode(self, space, input, errors=None):
- if errors is not None and errors != 'strict':
- raise OperationError(space.w_NotImplementedError, # XXX
- space.wrap("errors='%s' in _multibytecodec"
- % errors))
+ if errors is None:
+ errors = 'strict'
#
try:
- output = c_codecs.encode(self.codec, input)
+ output = c_codecs.encode(self.codec, input, errors)
except c_codecs.EncodeDecodeError, e:
raise OperationError(
space.w_UnicodeEncodeError,
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -70,3 +70,17 @@
assert e.start == 3
assert e.end == 4
assert e.reason == 'illegal multibyte sequence'
+
+ def test_encode_hz_ignore(self):
+ import _codecs_cn
+ codec = _codecs_cn.getcodec("hz")
+ r = codec.encode(u'abc\u1234def', 'ignore')
+ assert r == ('abcdef', 7)
+ assert type(r[0]) is str
+
+ def test_encode_hz_replace(self):
+ import _codecs_cn
+ codec = _codecs_cn.getcodec("hz")
+ r = codec.encode(u'abc\u1234def', 'replace')
+ assert r == ('abc?def', 7)
+ assert type(r[0]) is str
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -69,6 +69,25 @@
assert e.end == 4
assert e.reason == "illegal multibyte sequence"
+def test_encode_hz_ignore():
+ c = getcodec("hz")
+ s = encode(c, u'abc\u1234def', 'ignore')
+ assert s == 'abcdef'
+
+def test_encode_hz_replace():
+ c = getcodec("hz")
+ s = encode(c, u'abc\u1234def', 'replace')
+ assert s == 'abc?def'
+
+def test_encode_hz_foobar():
+ # not implemented yet: custom error handlers
+ c = getcodec("hz")
+ e = py.test.raises(EncodeDecodeError, encode,
+ c, u'abc\u1234def', 'foobar').value
+ assert e.start == 3
+ assert e.end == 4
+ assert e.reason == "not implemented: custom error handlers"
+
def test_encode_jisx0208():
c = getcodec('iso2022_jp')
s = encode(c, u'\u83ca\u5730\u6642\u592b')
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -225,3 +225,34 @@
{
return d->inbuf - d->inbuf_start;
}
+
+int pypy_cjk_enc_inbuf_add(struct pypy_cjk_enc_s* d, Py_ssize_t skip,
+ int add_replacement_character)
+{
+ if (add_replacement_character)
+ {
+ const Py_UNICODE replchar = '?', *inbuf = &replchar;
+ Py_ssize_t r;
+
+ while (1)
+ {
+ Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
+ r = d->codec->encode(&d->state, d->codec->config,
+ &inbuf, 1, &d->outbuf, outleft, 0);
+ if (r != MBERR_TOOSMALL)
+ break;
+ /* output buffer too small; grow it and continue. */
+ if (expand_encodebuffer(d, -1) == -1)
+ return MBERR_NOMEMORY;
+ }
+ if (r != 0)
+ {
+ if (d->outbuf >= d->outbuf_end)
+ if (expand_encodebuffer(d, 1) == -1)
+ return MBERR_NOMEMORY;
+ *d->outbuf++ = '?';
+ }
+ }
+ d->inbuf += skip;
+ return 0;
+}
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -120,6 +120,7 @@
Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *);
Py_ssize_t pypy_cjk_enc_inbuf_remaining(struct pypy_cjk_enc_s *d);
Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d);
+int pypy_cjk_enc_inbuf_add(struct pypy_cjk_enc_s*, Py_ssize_t, int);
/* list of codecs defined in the .c files */
More information about the pypy-commit
mailing list