[Python-checkins] cpython: Issue #20574: Implement incremental decoder for cp65001 code
victor.stinner
python-checkins at python.org
Mon Mar 17 23:12:27 CET 2014
http://hg.python.org/cpython/rev/08f9b881f78c
changeset: 89832:08f9b881f78c
user: Victor Stinner <victor.stinner at gmail.com>
date: Mon Mar 17 23:08:06 2014 +0100
summary:
Issue #20574: Implement incremental decoder for cp65001 code
(Windows code page 65001, Microsoft UTF-8).
files:
Lib/encodings/cp65001.py | 9 ++++--
Lib/test/test_codecs.py | 12 ++-----
Misc/NEWS | 3 ++
Objects/unicodeobject.c | 41 ++++++---------------------
4 files changed, 22 insertions(+), 43 deletions(-)
diff --git a/Lib/encodings/cp65001.py b/Lib/encodings/cp65001.py
--- a/Lib/encodings/cp65001.py
+++ b/Lib/encodings/cp65001.py
@@ -11,20 +11,23 @@
### Codec APIs
encode = functools.partial(codecs.code_page_encode, 65001)
-decode = functools.partial(codecs.code_page_decode, 65001)
+_decode = functools.partial(codecs.code_page_decode, 65001)
+
+def decode(input, errors='strict'):
+ return codecs.code_page_decode(65001, input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return encode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
- _buffer_decode = decode
+ _buffer_decode = _decode
class StreamWriter(codecs.StreamWriter):
encode = encode
class StreamReader(codecs.StreamReader):
- decode = decode
+ decode = _decode
### encodings module API
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -890,10 +890,6 @@
"\U00010fff\uD800")
self.assertTrue(codecs.lookup_error("surrogatepass"))
- def test_readline(self):
- self.skipTest("issue #20571: code page 65001 codec does not "
- "support partial decoder yet")
-
class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7"
@@ -2750,15 +2746,15 @@
self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
codecs.code_page_encode, 932, '\xff')
self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
- codecs.code_page_decode, 932, b'\x81\x00')
+ codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
- codecs.code_page_decode, self.CP_UTF8, b'\xff')
+ codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
def check_decode(self, cp, tests):
for raw, errors, expected in tests:
if expected is not None:
try:
- decoded = codecs.code_page_decode(cp, raw, errors)
+ decoded = codecs.code_page_decode(cp, raw, errors, True)
except UnicodeDecodeError as err:
self.fail('Unable to decode %a from "cp%s" with '
'errors=%r: %s' % (raw, cp, errors, err))
@@ -2770,7 +2766,7 @@
self.assertLessEqual(decoded[1], len(raw))
else:
self.assertRaises(UnicodeDecodeError,
- codecs.code_page_decode, cp, raw, errors)
+ codecs.code_page_decode, cp, raw, errors, True)
def check_encode(self, cp, tests):
for text, errors, expected in tests:
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -13,6 +13,9 @@
Library
-------
+- Issue #20574: Implement incremental decoder for cp65001 code (Windows code
+ page 65001, Microsoft UTF-8).
+
- Issue #20879: Delay the initialization of encoding and decoding tables for
base32, ascii85 and base85 codecs in the base64 module, and delay the
initialization of the unquote_to_bytes() table of the urllib.parse module, to
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6817,28 +6817,6 @@
return PyBytes_AS_STRING(*obj);
}
-static int
-is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
-{
- const char *curr = s + offset;
- const char *prev;
-
- if (!IsDBCSLeadByteEx(code_page, *curr))
- return 0;
-
- prev = CharPrevExA(code_page, s, curr, 0);
- if (prev == curr)
- return 1;
- /* FIXME: This code is limited to "true" double-byte encodings,
- as it assumes an incomplete character consists of a single
- byte. */
- if (curr - prev == 2)
- return 1;
- if (!IsDBCSLeadByteEx(code_page, *prev))
- return 1;
- return 0;
-}
-
static DWORD
decode_code_page_flags(UINT code_page)
{
@@ -6913,7 +6891,7 @@
decode_code_page_errors(UINT code_page,
PyObject **v,
const char *in, const int size,
- const char *errors)
+ const char *errors, int final)
{
const char *startin = in;
const char *endin = in + size;
@@ -6940,7 +6918,7 @@
if (encoding == NULL)
return -1;
- if (errors == NULL || strcmp(errors, "strict") == 0) {
+ if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
/* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
UnicodeDecodeError. */
make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
@@ -7003,6 +6981,10 @@
if (outsize <= 0) {
Py_ssize_t startinpos, endinpos, outpos;
+ /* last character in partial decode? */
+ if (in + insize >= endin && !final)
+ break;
+
startinpos = in - startin;
endinpos = startinpos + 1;
outpos = out - PyUnicode_AS_UNICODE(*v);
@@ -7031,7 +7013,7 @@
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
if (unicode_resize(v, outsize) < 0)
goto error;
- ret = size;
+ ret = in - startin;
error:
Py_XDECREF(encoding_obj);
@@ -7072,24 +7054,19 @@
done = 1;
}
- /* Skip trailing lead-byte unless 'final' is set */
- if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
- --chunk_size;
-
if (chunk_size == 0 && done) {
if (v != NULL)
break;
_Py_RETURN_UNICODE_EMPTY();
}
-
converted = decode_code_page_strict(code_page, &v,
s, chunk_size);
if (converted == -2)
converted = decode_code_page_errors(code_page, &v,
s, chunk_size,
- errors);
- assert(converted != 0);
+ errors, final);
+ assert(converted != 0 || done);
if (converted < 0) {
Py_XDECREF(v);
--
Repository URL: http://hg.python.org/cpython
More information about the Python-checkins
mailing list