[Python-3000-checkins] r54846 - in python/branches/p3yk/Lib: io.py test/test_io.py

guido.van.rossum python-3000-checkins at python.org
Tue Apr 17 04:38:06 CEST 2007


Author: guido.van.rossum
Date: Tue Apr 17 04:38:04 2007
New Revision: 54846

Modified:
   python/branches/p3yk/Lib/io.py
   python/branches/p3yk/Lib/test/test_io.py
Log:
Instead of pickling the whole decoder, use the new getstate/setstate API.


Modified: python/branches/p3yk/Lib/io.py
==============================================================================
--- python/branches/p3yk/Lib/io.py	(original)
+++ python/branches/p3yk/Lib/io.py	Tue Apr 17 04:38:04 2007
@@ -18,7 +18,7 @@
 XXX whenever an argument is None, use the default value
 XXX read/write ops should check readable/writable
 XXX buffered readinto should work with arbitrary buffer objects
-XXX use incremental encoder for text output, at least for UTF-16
+XXX use incremental encoder for text output, at least for UTF-16 and UTF-8-SIG
 """
 
 __author__ = ("Guido van Rossum <guido at python.org>, "
@@ -36,11 +36,6 @@
 import _fileio
 import warnings
 
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
 # XXX Shouldn't we use st_blksize whenever we can?
 DEFAULT_BUFFER_SIZE = 8 * 1024  # bytes
 
@@ -957,17 +952,16 @@
         self._newline = newline or os.linesep
         self._fix_newlines = newline is None
         self._decoder = None
-        self._decoder_in_rest_pickle = None
         self._pending = ""
         self._snapshot = None
         self._seekable = self._telling = self.buffer.seekable()
 
     # A word about _snapshot.  This attribute is either None, or a
-    # tuple (decoder_pickle, readahead, pending) where decoder_pickle
-    # is a pickled decoder state, readahead is the chunk of bytes that
-    # was read, and pending is the characters that were rendered by
-    # the decoder after feeding it those bytes.  We use this to
-    # reconstruct intermediate decoder states in tell().
+    # tuple (decoder_state, readahead, pending) where decoder_state is
+    # the second (integer) item of decoder.getstate(), readahead is the
+    # decoder's buffered bytes plus the chunk of bytes that was read, and
+    # pending is the characters the decoder rendered from those bytes.  We
+    # use this to reconstruct intermediate decoder states in tell().
 
     def _seekable(self):
         return self._seekable
@@ -1005,10 +999,6 @@
             raise IOError("Can't find an incremental decoder for encoding %s" %
                           self._encoding)
         decoder = self._decoder = make_decoder()  # XXX: errors
-        if isinstance(decoder, codecs.BufferedIncrementalDecoder):
-            # XXX Hack: make the codec use bytes instead of strings
-            decoder.buffer = b""
-        self._decoder_in_rest_pickle = pickle.dumps(decoder, 2)  # For tell()
         return decoder
 
     def _read_chunk(self):
@@ -1017,15 +1007,13 @@
             readahead = self.buffer.read1(self._CHUNK_SIZE)
             pending = self._decoder.decode(readahead, not readahead)
             return readahead, pending
-        decoder_state = pickle.dumps(self._decoder, 2)
+        decoder_buffer, decoder_state = self._decoder.getstate()
         readahead = self.buffer.read1(self._CHUNK_SIZE)
         pending = self._decoder.decode(readahead, not readahead)
-        self._snapshot = (decoder_state, readahead, pending)
+        self._snapshot = (decoder_state, decoder_buffer + readahead, pending)
         return readahead, pending
 
     def _encode_decoder_state(self, ds, pos):
-        if ds == self._decoder_in_rest_pickle:
-            return pos
         x = 0
         for i in bytes(ds):
             x = x<<8 | i
@@ -1048,7 +1036,8 @@
             raise IOError("Telling position disabled by next() call")
         self.flush()
         position = self.buffer.tell()
-        if self._decoder is None or self._snapshot is None:
+        decoder = self._decoder
+        if decoder is None or self._snapshot is None:
             assert self._pending == ""
             return position
         decoder_state, readahead, pending = self._snapshot
@@ -1056,15 +1045,21 @@
         needed = len(pending) - len(self._pending)
         if not needed:
             return self._encode_decoder_state(decoder_state, position)
-        decoder = pickle.loads(decoder_state)
-        n = 0
-        bb = bytes(1)
-        for i, bb[0] in enumerate(readahead):
-            n += len(decoder.decode(bb))
-            if n >= needed:
-                decoder_state = pickle.dumps(decoder, 2)
-                return self._encode_decoder_state(decoder_state, position+i+1)
-        raise IOError("Can't reconstruct logical file position")
+        saved_state = decoder.getstate()
+        try:
+            decoder.setstate(("", decoder_state))
+            n = 0
+            bb = bytes(1)
+            for i, bb[0] in enumerate(readahead):
+                n += len(decoder.decode(bb))
+                if n >= needed:
+                    decoder_buffer, decoder_state = decoder.getstate()
+                    return self._encode_decoder_state(
+                        decoder_state,
+                        position + (i+1) - len(decoder_buffer))
+            raise IOError("Can't reconstruct logical file position")
+        finally:
+            decoder.setstate(saved_state)
 
     def seek(self, pos, whence=0):
         if not self._seekable:
@@ -1097,12 +1092,11 @@
             self._pending = ""
             self._decoder = None
             return pos
-        decoder = pickle.loads(ds)
+        decoder = self._decoder or self._get_decoder()
+        decoder.set_state(("", ds))
         self.buffer.seek(pos)
         self._snapshot = (ds, b"", "")
         self._pending = ""
-        if not self._decoder_in_rest_pickle:
-            self._get_decoder()  # For its side effect
         self._decoder = decoder
         return orig_pos
 

Modified: python/branches/p3yk/Lib/test/test_io.py
==============================================================================
--- python/branches/p3yk/Lib/test/test_io.py	(original)
+++ python/branches/p3yk/Lib/test/test_io.py	Tue Apr 17 04:38:04 2007
@@ -581,6 +581,36 @@
         self.assertEquals(f.tell(), p2)
         f.close()
 
+    def testSeeking(self):
+        chunk_size = io.TextIOWrapper._CHUNK_SIZE
+        prefix_size = chunk_size - 2
+        u_prefix = u"a" * prefix_size
+        prefix = bytes(u_prefix.encode("utf-8"))
+        self.assertEquals(len(u_prefix), len(prefix))
+        u_suffix = u"\u8888\n"
+        suffix = bytes(u_suffix.encode("utf-8"))
+        line = prefix + suffix
+        f = io.open(test_support.TESTFN, "wb")
+        f.write(line*2)
+        f.close()
+        f = io.open(test_support.TESTFN, "r", encoding="utf-8")
+        s = f.read(prefix_size)
+        self.assertEquals(s, prefix)
+        self.assertEquals(f.tell(), prefix_size)
+        self.assertEquals(f.readline(), u_suffix)
+
+    def testSeekingToo(self):
+        # Regression test for a specific bug
+        data = b'\xe0\xbf\xbf\n'
+        f = io.open(test_support.TESTFN, "wb")
+        f.write(data)
+        f.close()
+        f = io.open(test_support.TESTFN, "r", encoding="utf-8")
+        f._CHUNK_SIZE  # Just test that it exists
+        f._CHUNK_SIZE = 2
+        f.readline()
+        f.tell()
+
     def timingTest(self):
         timer = time.time
         enc = "utf8"


More information about the Python-3000-checkins mailing list