[Python-checkins] bpo-36312: Fix decoders for some code pages. (GH-12369)
Serhiy Storchaka
webhook-mailer at python.org
Wed Mar 20 15:45:22 EDT 2019
https://github.com/python/cpython/commit/c1e2c288f41cdc1c6e6e09d9a5277a58232ceb03
commit: c1e2c288f41cdc1c6e6e09d9a5277a58232ceb03
branch: master
author: Serhiy Storchaka <storchaka at gmail.com>
committer: GitHub <noreply at github.com>
date: 2019-03-20T21:45:18+02:00
summary:
bpo-36312: Fix decoders for some code pages. (GH-12369)
files:
A Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst
M Lib/test/test_codecs.py
M Objects/unicodeobject.c
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index e8c7d76544e1..331449397e37 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3066,6 +3066,15 @@ def test_multibyte_encoding(self):
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
))
+ def test_code_page_decode_flags(self):
+ # Issue #36312: For some code pages (e.g. UTF-7) flags for
+ # MultiByteToWideChar() must be set to 0.
+ for cp in (50220, 50221, 50222, 50225, 50227, 50229,
+ *range(57002, 57011+1), 65000):
+ self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3))
+ self.assertEqual(codecs.code_page_decode(42, b'abc'),
+ ('\uf061\uf062\uf063', 3))
+
def test_incremental(self):
decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
self.assertEqual(decoded, ('', 0))
diff --git a/Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst b/Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst
new file mode 100644
index 000000000000..8b325db3a989
--- /dev/null
+++ b/Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst
@@ -0,0 +1,2 @@
+Fixed decoders for the following code pages: 50220, 50221, 50222, 50225,
+50227, 50229, 57002 through 57011, 65000 and 42.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 6e83ed6bdd43..8ab3943e61b2 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -7083,15 +7083,21 @@ decode_code_page_strict(UINT code_page,
const char *in,
int insize)
{
- const DWORD flags = decode_code_page_flags(code_page);
+ DWORD flags = MB_ERR_INVALID_CHARS;
wchar_t *out;
DWORD outsize;
/* First get the size of the result */
assert(insize > 0);
- outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
- if (outsize <= 0)
- goto error;
+ while ((outsize = MultiByteToWideChar(code_page, flags,
+ in, insize, NULL, 0)) <= 0)
+ {
+ if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
+ goto error;
+ }
+ /* For some code pages (e.g. UTF-7) flags must be set to 0. */
+ flags = 0;
+ }
/* Extend a wchar_t* buffer */
Py_ssize_t n = *bufsize; /* Get the current length */
@@ -7129,7 +7135,7 @@ decode_code_page_errors(UINT code_page,
{
const char *startin = in;
const char *endin = in + size;
- const DWORD flags = decode_code_page_flags(code_page);
+ DWORD flags = MB_ERR_INVALID_CHARS;
/* Ideally, we should get reason from FormatMessage. This is the Windows
2000 English version of the message. */
const char *reason = "No mapping for the Unicode character exists "
@@ -7187,6 +7193,11 @@ decode_code_page_errors(UINT code_page,
if (outsize > 0)
break;
err = GetLastError();
+ if (err == ERROR_INVALID_FLAGS && flags) {
+ /* For some code pages (e.g. UTF-7) flags must be set to 0. */
+ flags = 0;
+ continue;
+ }
if (err != ERROR_NO_UNICODE_TRANSLATION
&& err != ERROR_INSUFFICIENT_BUFFER)
{
More information about the Python-checkins mailing list