[Python-checkins] cpython (3.4): Issue #19858: pickletools.optimize() now aware of the MEMOIZE opcode, can

serhiy.storchaka python-checkins at python.org
Tue Dec 16 17:06:54 CET 2014


https://hg.python.org/cpython/rev/c49b7acba06f
changeset:   93908:c49b7acba06f
branch:      3.4
parent:      93906:3ffa8438d274
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Tue Dec 16 18:00:56 2014 +0200
summary:
  Issue #19858: pickletools.optimize() is now aware of the MEMOIZE opcode, can
produce a more compact result, and no longer produces invalid output if the
input data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.

files:
  Lib/pickletools.py           |  69 +++++++++++++++--------
  Lib/test/test_pickletools.py |  43 ++++++++++++++
  Misc/NEWS                    |   4 +
  3 files changed, 92 insertions(+), 24 deletions(-)


diff --git a/Lib/pickletools.py b/Lib/pickletools.py
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -2282,40 +2282,61 @@
 
 def optimize(p):
     'Optimize a pickle string by removing unused PUT opcodes'
-    not_a_put = object()
-    gets = { not_a_put }    # set of args used by a GET opcode
-    opcodes = []            # (startpos, stoppos, putid)
+    put = 'PUT'
+    get = 'GET'
+    oldids = set()          # set of all PUT ids
+    newids = {}             # set of ids used by a GET opcode
+    opcodes = []            # (op, idx) or (pos, end_pos)
     proto = 0
+    protoheader = b''
     for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True):
         if 'PUT' in opcode.name:
-            opcodes.append((pos, end_pos, arg))
+            oldids.add(arg)
+            opcodes.append((put, arg))
+        elif opcode.name == 'MEMOIZE':
+            idx = len(oldids)
+            oldids.add(idx)
+            opcodes.append((put, idx))
         elif 'FRAME' in opcode.name:
             pass
+        elif 'GET' in opcode.name:
+            if opcode.proto > proto:
+                proto = opcode.proto
+            newids[arg] = None
+            opcodes.append((get, arg))
+        elif opcode.name == 'PROTO':
+            if arg > proto:
+                proto = arg
+            if pos == 0:
+                protoheader = p[pos: end_pos]
+            else:
+                opcodes.append((pos, end_pos))
         else:
-            if 'GET' in opcode.name:
-                gets.add(arg)
-            elif opcode.name == 'PROTO':
-                assert pos == 0, pos
-                proto = arg
-            opcodes.append((pos, end_pos, not_a_put))
-            prevpos, prevarg = pos, None
+            opcodes.append((pos, end_pos))
+    del oldids
 
     # Copy the opcodes except for PUTS without a corresponding GET
     out = io.BytesIO()
-    opcodes = iter(opcodes)
-    if proto >= 2:
-        # Write the PROTO header before any framing
-        start, stop, _ = next(opcodes)
-        out.write(p[start:stop])
-    buf = pickle._Framer(out.write)
+    # Write the PROTO header before any framing
+    out.write(protoheader)
+    pickler = pickle._Pickler(out, proto)
     if proto >= 4:
-        buf.start_framing()
-    for start, stop, putid in opcodes:
-        if putid in gets:
-            buf.commit_frame()
-            buf.write(p[start:stop])
-    if proto >= 4:
-        buf.end_framing()
+        pickler.framer.start_framing()
+    idx = 0
+    for op, arg in opcodes:
+        if op is put:
+            if arg not in newids:
+                continue
+            data = pickler.put(idx)
+            newids[arg] = idx
+            idx += 1
+        elif op is get:
+            data = pickler.get(newids[arg])
+        else:
+            data = p[op:arg]
+        pickler.framer.commit_frame()
+        pickler.write(data)
+    pickler.framer.end_framing()
     return out.getvalue()
 
 ##############################################################################
diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py
--- a/Lib/test/test_pickletools.py
+++ b/Lib/test/test_pickletools.py
@@ -1,3 +1,4 @@
+import struct
 import pickle
 import pickletools
 from test import support
@@ -15,6 +16,48 @@
     # Test relies on precise output of dumps()
     test_pickle_to_2x = None
 
+    def test_optimize_long_binget(self):
+        data = [str(i) for i in range(257)]
+        data.append(data[-1])
+        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+            pickled = pickle.dumps(data, proto)
+            unpickled = pickle.loads(pickled)
+            self.assertEqual(unpickled, data)
+            self.assertIs(unpickled[-1], unpickled[-2])
+
+            pickled2 = pickletools.optimize(pickled)
+            unpickled2 = pickle.loads(pickled2)
+            self.assertEqual(unpickled2, data)
+            self.assertIs(unpickled2[-1], unpickled2[-2])
+            self.assertNotIn(pickle.LONG_BINGET, pickled2)
+            self.assertNotIn(pickle.LONG_BINPUT, pickled2)
+
+    def test_optimize_binput_and_memoize(self):
+        pickled = (b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00'
+                   b']\x94(\x8c\x04spamq\x01\x8c\x03ham\x94h\x02e.')
+        #    0: \x80 PROTO      4
+        #    2: \x95 FRAME      21
+        #   11: ]    EMPTY_LIST
+        #   12: \x94 MEMOIZE
+        #   13: (    MARK
+        #   14: \x8c     SHORT_BINUNICODE 'spam'
+        #   20: q        BINPUT     1
+        #   22: \x8c     SHORT_BINUNICODE 'ham'
+        #   27: \x94     MEMOIZE
+        #   28: h        BINGET     2
+        #   30: e        APPENDS    (MARK at 13)
+        #   31: .    STOP
+        self.assertIn(pickle.BINPUT, pickled)
+        unpickled = pickle.loads(pickled)
+        self.assertEqual(unpickled, ['spam', 'ham', 'ham'])
+        self.assertIs(unpickled[1], unpickled[2])
+
+        pickled2 = pickletools.optimize(pickled)
+        unpickled2 = pickle.loads(pickled2)
+        self.assertEqual(unpickled2, ['spam', 'ham', 'ham'])
+        self.assertIs(unpickled2[1], unpickled2[2])
+        self.assertNotIn(pickle.BINPUT, pickled2)
+
 
 def test_main():
     support.run_unittest(OptimizedPickleTests)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -41,6 +41,10 @@
 Library
 -------
 
+- Issue #19858:  pickletools.optimize() now aware of the MEMOIZE opcode, can
+  produce more compact result and no longer produces invalid output if input
+  data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.
+
 - Issue #22095: Fixed HTTPConnection.set_tunnel with default port.  The port
   value in the host header was set to "None".  Patch by Demian Brecht.
 

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list