[Python-checkins] bpo-32503: Avoid creating too small frames in pickles. (#5127)

Serhiy Storchaka webhook-mailer at python.org
Sat Jan 20 09:42:47 EST 2018


https://github.com/python/cpython/commit/1211c9a9897a174b7261ca258cabf289815a40d8
commit: 1211c9a9897a174b7261ca258cabf289815a40d8
branch: master
author: Serhiy Storchaka <storchaka at gmail.com>
committer: GitHub <noreply at github.com>
date: 2018-01-20T16:42:44+02:00
summary:

bpo-32503: Avoid creating too small frames in pickles. (#5127)

files:
A Misc/NEWS.d/next/Library/2018-01-07-09-22-26.bpo-32503.ViMxpD.rst
M Lib/pickle.py
M Lib/test/pickletester.py
M Modules/_pickle.c

diff --git a/Lib/pickle.py b/Lib/pickle.py
index 301e8cf5589..e6d003787ba 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -183,6 +183,7 @@ def __init__(self, value):
 
 class _Framer:
 
+    _FRAME_SIZE_MIN = 4
     _FRAME_SIZE_TARGET = 64 * 1024
 
     def __init__(self, file_write):
@@ -203,11 +204,12 @@ def commit_frame(self, force=False):
             if f.tell() >= self._FRAME_SIZE_TARGET or force:
                 data = f.getbuffer()
                 write = self.file_write
-                # Issue a single call to the write method of the underlying
-                # file object for the frame opcode with the size of the
-                # frame. The concatenation is expected to be less expensive
-                # than issuing an additional call to write.
-                write(FRAME + pack("<Q", len(data)))
+                if len(data) >= self._FRAME_SIZE_MIN:
+                    # Issue a single call to the write method of the underlying
+                    # file object for the frame opcode with the size of the
+                    # frame. The concatenation is expected to be less expensive
+                    # than issuing an additional call to write.
+                    write(FRAME + pack("<Q", len(data)))
 
                 # Issue a separate call to write to append the frame
                 # contents without concatenation to the above to avoid a
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
index 062ec5ae72b..b84b87861d0 100644
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -2037,6 +2037,7 @@ def test_setitems_on_non_dicts(self):
 
     # Exercise framing (proto >= 4) for significant workloads
 
+    FRAME_SIZE_MIN = 4
     FRAME_SIZE_TARGET = 64 * 1024
 
     def check_frame_opcodes(self, pickled):
@@ -2047,36 +2048,43 @@ def check_frame_opcodes(self, pickled):
         framed by default and are therefore considered a frame by themselves in
         the following consistency check.
         """
-        last_arg = last_pos = last_frame_opcode_size = None
-        frameless_opcode_sizes = {
-            'BINBYTES': 5,
-            'BINUNICODE': 5,
-            'BINBYTES8': 9,
-            'BINUNICODE8': 9,
-        }
+        frame_end = frameless_start = None
+        frameless_opcodes = {'BINBYTES', 'BINUNICODE', 'BINBYTES8', 'BINUNICODE8'}
         for op, arg, pos in pickletools.genops(pickled):
-            if op.name in frameless_opcode_sizes:
-                if len(arg) > self.FRAME_SIZE_TARGET:
-                    frame_opcode_size = frameless_opcode_sizes[op.name]
-                    arg = len(arg)
-                else:
-                    continue
-            elif op.name == 'FRAME':
-                frame_opcode_size = 9
-            else:
-                continue
-
-            if last_pos is not None:
-                # The previous frame's size should be equal to the number
-                # of bytes up to the current frame.
-                frame_size = pos - last_pos - last_frame_opcode_size
-                self.assertEqual(frame_size, last_arg)
-            last_arg, last_pos = arg, pos
-            last_frame_opcode_size = frame_opcode_size
-        # The last frame's size should be equal to the number of bytes up
-        # to the pickle's end.
-        frame_size = len(pickled) - last_pos - last_frame_opcode_size
-        self.assertEqual(frame_size, last_arg)
+            if frame_end is not None:
+                self.assertLessEqual(pos, frame_end)
+                if pos == frame_end:
+                    frame_end = None
+
+            if frame_end is not None:  # framed
+                self.assertNotEqual(op.name, 'FRAME')
+                if op.name in frameless_opcodes:
+                    # Only short bytes and str objects should be written
+                    # in a frame
+                    self.assertLessEqual(len(arg), self.FRAME_SIZE_TARGET)
+
+            else:  # not framed
+                if (op.name == 'FRAME' or
+                    (op.name in frameless_opcodes and
+                     len(arg) > self.FRAME_SIZE_TARGET)):
+                    # Frame or large bytes or str object
+                    if frameless_start is not None:
+                        # Only short data should be written outside of a frame
+                        self.assertLess(pos - frameless_start,
+                                        self.FRAME_SIZE_MIN)
+                        frameless_start = None
+                elif frameless_start is None and op.name != 'PROTO':
+                    frameless_start = pos
+
+            if op.name == 'FRAME':
+                self.assertGreaterEqual(arg, self.FRAME_SIZE_MIN)
+                frame_end = pos + 9 + arg
+
+        pos = len(pickled)
+        if frame_end is not None:
+            self.assertEqual(frame_end, pos)
+        elif frameless_start is not None:
+            self.assertLess(pos - frameless_start, self.FRAME_SIZE_MIN)
 
     def test_framing_many_objects(self):
         obj = list(range(10**5))
@@ -2095,7 +2103,8 @@ def test_framing_many_objects(self):
 
     def test_framing_large_objects(self):
         N = 1024 * 1024
-        obj = [b'x' * N, b'y' * N, 'z' * N]
+        small_items = [[i] for i in range(10)]
+        obj = [b'x' * N, *small_items, b'y' * N, 'z' * N]
         for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
             for fast in [False, True]:
                 with self.subTest(proto=proto, fast=fast):
@@ -2119,12 +2128,9 @@ def test_framing_large_objects(self):
                     # Perform full equality check if the lengths match.
                     self.assertEqual(obj, unpickled)
                     n_frames = count_opcode(pickle.FRAME, pickled)
-                    if not fast:
-                        # One frame per memoize for each large object.
-                        self.assertGreaterEqual(n_frames, len(obj))
-                    else:
-                        # One frame at the beginning and one at the end.
-                        self.assertGreaterEqual(n_frames, 2)
+                    # A single frame for small objects between
+                    # first two large objects.
+                    self.assertEqual(n_frames, 1)
                     self.check_frame_opcodes(pickled)
 
     def test_optional_frames(self):
@@ -2152,7 +2158,9 @@ def remove_frames(pickled, keep_frame=None):
 
         frame_size = self.FRAME_SIZE_TARGET
         num_frames = 20
-        obj = [bytes([i]) * frame_size for i in range(num_frames)]
+        # Large byte objects (dict values) intermitted with small objects
+        # (dict keys)
+        obj = {i: bytes([i]) * frame_size for i in range(num_frames)}
 
         for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
             pickled = self.dumps(obj, proto)
diff --git a/Misc/NEWS.d/next/Library/2018-01-07-09-22-26.bpo-32503.ViMxpD.rst b/Misc/NEWS.d/next/Library/2018-01-07-09-22-26.bpo-32503.ViMxpD.rst
new file mode 100644
index 00000000000..609c59308e6
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-01-07-09-22-26.bpo-32503.ViMxpD.rst
@@ -0,0 +1 @@
+Pickling with protocol 4 no longer creates too small frames.
diff --git a/Modules/_pickle.c b/Modules/_pickle.c
index f8cbea12e80..f06a87d04f2 100644
--- a/Modules/_pickle.c
+++ b/Modules/_pickle.c
@@ -119,8 +119,8 @@ enum {
     /* Prefetch size when unpickling (disabled on unpeekable streams) */
     PREFETCH = 8192 * 16,
 
+    FRAME_SIZE_MIN = 4,
     FRAME_SIZE_TARGET = 64 * 1024,
-
     FRAME_HEADER_SIZE = 9
 };
 
@@ -949,13 +949,6 @@ _write_size64(char *out, size_t value)
     }
 }
 
-static void
-_Pickler_WriteFrameHeader(PicklerObject *self, char *qdata, size_t frame_len)
-{
-    qdata[0] = FRAME;
-    _write_size64(qdata + 1, frame_len);
-}
-
 static int
 _Pickler_CommitFrame(PicklerObject *self)
 {
@@ -966,7 +959,14 @@ _Pickler_CommitFrame(PicklerObject *self)
         return 0;
     frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE;
     qdata = PyBytes_AS_STRING(self->output_buffer) + self->frame_start;
-    _Pickler_WriteFrameHeader(self, qdata, frame_len);
+    if (frame_len >= FRAME_SIZE_MIN) {
+        qdata[0] = FRAME;
+        _write_size64(qdata + 1, frame_len);
+    }
+    else {
+        memmove(qdata, qdata + FRAME_HEADER_SIZE, frame_len);
+        self->output_len -= FRAME_HEADER_SIZE;
+    }
     self->frame_start = -1;
     return 0;
 }



More information about the Python-checkins mailing list