[Python-checkins] bpo-36311: Fixes decoding multibyte characters around chunk boundaries and improves decoding performance (GH-15083)

Wed Aug 21 19:22:37 EDT 2019

https://github.com/python/cpython/commit/7ebdda0dbee7df6f0c945a7e1e623e47676e112d
commit: 7ebdda0dbee7df6f0c945a7e1e623e47676e112d
branch: master
author: Steve Dower <steve.dower at python.org>
committer: GitHub <noreply at github.com>
date: 2019-08-21T16:22:33-07:00
summary:

bpo-36311: Fixes decoding multibyte characters around chunk boundaries and improves decoding performance (GH-15083)

files:
A Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst
M Lib/test/test_codecs.py
M Objects/unicodeobject.c

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index b187ca650dc6..ba7f4847468a 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3075,13 +3075,13 @@ def test_mbcs_alias(self):
             self.assertEqual(codec.name, 'mbcs')
 
     @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
-    def test_large_input(self):
+    def test_large_input(self, size):
         # Test input longer than INT_MAX.
         # Input should contain undecodable bytes before and after
         # the INT_MAX limit.
-        encoded = (b'01234567' * (2**28-1) +
+        encoded = (b'01234567' * ((size//8)-1) +
                    b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
-        self.assertEqual(len(encoded), 2**31+2)
+        self.assertEqual(len(encoded), size+2)
         decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
         self.assertEqual(decoded[1], len(encoded))
         del encoded
@@ -3092,6 +3092,20 @@ def test_large_input(self):
                          '\udc85\udc86\udcea\udceb\udcec'
                          '\udcef\udcfc\udcfd\udcfe\udcff')
 
+    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
+    def test_large_utf8_input(self, size):
+        # Test input longer than INT_MAX.
+        # Input should contain a decodable multi-byte character
+        # surrounding INT_MAX
+        encoded = (b'0123456\xed\x84\x80' * (size//8))
+        self.assertEqual(len(encoded), size // 8 * 10)
+        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
+        self.assertEqual(decoded[1], len(encoded))
+        del encoded
+        self.assertEqual(len(decoded[0]), size)
+        self.assertEqual(decoded[0][:10], '0123456\ud10001')
+        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
+
 
 class ASCIITest(unittest.TestCase):
     def test_encode(self):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst b/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst
new file mode 100644
index 000000000000..c45f2224237b
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-08-02-15-01-33.bpo-36311.uY5vt-.rst	
@@ -0,0 +1,2 @@
+Decoding bytes objects larger than 2GiB is faster and no longer fails when a
+multibyte characters spans a chunk boundary.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5545eae79505..aa933773233b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -7186,6 +7186,12 @@ PyUnicode_AsASCIIString(PyObject *unicode)
 #define NEED_RETRY
 #endif
 
+/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
+   transcoding from UTF-16), but INT_MAX / 4 perfoms better in
+   both cases also and avoids partial characters overrunning the
+   length limit in MultiByteToWideChar on Windows */
+#define DECODING_CHUNK_SIZE (INT_MAX/4)
+
 #ifndef WC_ERR_INVALID_CHARS
 #  define WC_ERR_INVALID_CHARS 0x0080
 #endif
@@ -7422,8 +7428,8 @@ decode_code_page_stateful(int code_page,
     do
     {
 #ifdef NEED_RETRY
-        if (size > INT_MAX) {
-            chunk_size = INT_MAX;
+        if (size > DECODING_CHUNK_SIZE) {
+            chunk_size = DECODING_CHUNK_SIZE;
             final = 0;
             done = 0;
         }
@@ -7827,10 +7833,8 @@ encode_code_page(int code_page,
     do
     {
 #ifdef NEED_RETRY
-        /* UTF-16 encoding may double the size, so use only INT_MAX/2
-           chunks. */
-        if (len > INT_MAX/2) {
-            chunk_len = INT_MAX/2;
+        if (len > DECODING_CHUNK_SIZE) {
+            chunk_len = DECODING_CHUNK_SIZE;
             done = 0;
         }
         else