[Python-3000-checkins] r54742 - in python/branches/p3yk/Lib: io.py test/test_io.py

guido.van.rossum python-3000-checkins at python.org
Wed Apr 11 03:09:07 CEST 2007


Author: guido.van.rossum
Date: Wed Apr 11 03:09:03 2007
New Revision: 54742

Modified:
   python/branches/p3yk/Lib/io.py
   python/branches/p3yk/Lib/test/test_io.py
Log:
Checkpoint so I can continue to work on this at a different box.
There is somewhat working (but slow) code supporting seek/tell for text files,
but extensive testing exposes a bug I can't nail down.


Modified: python/branches/p3yk/Lib/io.py
==============================================================================
--- python/branches/p3yk/Lib/io.py	(original)
+++ python/branches/p3yk/Lib/io.py	Wed Apr 11 03:09:03 2007
@@ -13,8 +13,9 @@
 
 XXX need to default buffer size to 1 if isatty()
 XXX need to support 1 meaning line-buffered
-XXX change behavior of blocking I/O
 XXX don't use assert to validate input requirements
+XXX whenever an argument is None, use the default value
+XXX read/write ops should check readable/writable
 """
 
 __author__ = ("Guido van Rossum <guido at python.org>, "
@@ -29,9 +30,11 @@
 import os
 import sys
 import codecs
+import pickle
 import _fileio
 import warnings
 
+# XXX Shouldn't we use st_blksize whenever we can?
 DEFAULT_BUFFER_SIZE = 8 * 1024  # bytes
 
 
@@ -44,18 +47,22 @@
         self.characters_written = characters_written
 
 
-def open(file, mode="r", buffering=None, *, encoding=None):
+def open(file, mode="r", buffering=None, *, encoding=None, newline=None):
     """Replacement for the built-in open function.
 
     Args:
       file: string giving the name of the file to be opened;
-            or integer file descriptor of the file to be wrapped (*)
-      mode: optional mode string; see below
+            or integer file descriptor of the file to be wrapped (*).
+      mode: optional mode string; see below.
       buffering: optional int >= 0 giving the buffer size; values
                  can be: 0 = unbuffered, 1 = line buffered,
-                 larger = fully buffered
-      encoding: optional string giving the text encoding (*must* be given
-                as a keyword argument)
+                 larger = fully buffered.
+    Keywords (for text modes only; *must* be given as keyword arguments):
+      encoding: optional string giving the text encoding.
+      newline: optional newlines specifier; must be None, '\n' or '\r\n';
+               specifies the line ending expected on input and written on
+               output.  If None, use universal newlines on input and
+               use os.linesep on output.
 
     (*) If a file descriptor is given, it is closed when the returned
     I/O object is closed.  If you don't want this to happen, use
@@ -79,6 +86,7 @@
       binary stream, a buffered binary stream, or a buffered text
       stream, open for reading and/or writing.
     """
+    # XXX Don't use asserts for these checks; raise TypeError or ValueError
     assert isinstance(file, (basestring, int)), repr(file)
     assert isinstance(mode, basestring), repr(mode)
     assert buffering is None or isinstance(buffering, int), repr(buffering)
@@ -101,7 +109,9 @@
     if not (reading or writing or appending):
         raise ValueError("must have exactly one of read/write/append mode")
     if binary and encoding is not None:
-        raise ValueError("binary mode doesn't take an encoding")
+        raise ValueError("binary mode doesn't take an encoding argument")
+    if binary and newline is not None:
+        raise ValueError("binary mode doesn't take a newline argument")
     raw = FileIO(file,
                  (reading and "r" or "") +
                  (writing and "w" or "") +
@@ -132,9 +142,7 @@
         buffer = BufferedReader(raw, buffering)
     if binary:
         return buffer
-    # XXX What about newline conventions?
-    textio = TextIOWrapper(buffer, encoding)
-    return textio
+    return TextIOWrapper(buffer, encoding, newline)
 
 
 class IOBase:
@@ -795,6 +803,8 @@
     """Base class for text I/O.
 
     This class provides a character and line based interface to stream I/O.
+
+    There is no readinto() method, as character strings are immutable.
     """
 
     def read(self, n: int = -1) -> str:
@@ -805,10 +815,18 @@
         """
         self._unsupported("read")
 
-    def write(self, s: str):
-        """write(s: str) -> None.  Write string s to stream."""
+    def write(self, s: str) -> int:
+        """write(s: str) -> int.  Write string s to stream."""
         self._unsupported("write")
 
+    def truncate(self, pos: int = None) -> int:
+        """truncate(pos: int = None) -> int.  Truncate size to pos."""
+        self.flush()
+        if pos is None:
+            pos = self.tell()
+        self.seek(pos)
+        return self.buffer.truncate()
+
     def readline(self) -> str:
         """readline() -> str.  Read until newline or EOF.
 
@@ -816,12 +834,12 @@
         """
         self._unsupported("readline")
 
-    def __iter__(self):
+    def __iter__(self) -> "TextIOBase":  # That's a forward reference
         """__iter__() -> Iterator.  Return line iterator (actually just self).
         """
         return self
 
-    def next(self):
+    def next(self) -> str:
         """Same as readline() except raises StopIteration on immediate EOF."""
         line = self.readline()
         if not line:
@@ -855,11 +873,11 @@
     Character and line based layer over a BufferedIOBase object.
     """
 
-    # XXX tell(), seek()
+    _CHUNK_SIZE = 64
 
     def __init__(self, buffer, encoding=None, newline=None):
-        if newline not in (None, '\n', '\r\n'):
-            raise IOError("illegal newline %s" % newline) # XXX: ValueError?
+        if newline not in (None, "\n", "\r\n"):
+            raise ValueError("illegal newline value: %r" % (newline,))
         if encoding is None:
             # XXX This is questionable
             encoding = sys.getfilesystemencoding() or "latin-1"
@@ -869,7 +887,20 @@
         self._newline = newline or os.linesep
         self._fix_newlines = newline is None
         self._decoder = None
-        self._pending = ''
+        self._decoder_in_rest_pickle = None
+        self._pending = ""
+        self._snapshot = None
+        self._seekable = self.buffer.seekable()
+
+    # A word about _snapshot.  This attribute is either None, or a
+    # tuple (position, decoder_pickle, readahead) where position is a
+    # position of the underlying buffer, decoder_pickle is a pickled
+    # decoder state, and readahead is the chunk of bytes that was read
+    # from that position.  We use this to reconstruct intermediate
+    # decoder states in tell().
+
+    def _seekable(self):
+        return self._seekable
 
     def flush(self):
         self.buffer.flush()
@@ -886,35 +917,124 @@
         return self.buffer.fileno()
 
     def write(self, s: str):
+        # XXX What if we were just reading?
         b = s.encode(self._encoding)
         if isinstance(b, str):
             b = bytes(b)
         n = self.buffer.write(b)
         if "\n" in s:
             self.flush()
-        return n
+        self._snapshot = self._decoder = None
+        return len(s)
 
     def _get_decoder(self):
         make_decoder = codecs.getincrementaldecoder(self._encoding)
         if make_decoder is None:
-            raise IOError(".readline() not supported for encoding %s" %
+            raise IOError("Can't find an incremental decoder for encoding %s" %
                           self._encoding)
         decoder = self._decoder = make_decoder()  # XXX: errors
         if isinstance(decoder, codecs.BufferedIncrementalDecoder):
             # XXX Hack: make the codec use bytes instead of strings
             decoder.buffer = b""
+        self._decoder_in_rest_pickle = pickle.dumps(decoder, 2)  # For tell()
         return decoder
 
+    def _read_chunk(self):
+        if not self._seekable:
+            return self.buffer.read(self._CHUNK_SIZE)
+        assert self._decoder is not None
+        position = self.buffer.tell()
+        decoder_state = pickle.dumps(self._decoder, 2)
+        readahead = self.buffer.read(self._CHUNK_SIZE)
+        self._snapshot = (position, decoder_state, readahead)
+        return readahead
+
+    def _encode_decoder_state(self, ds, pos):
+        if ds == self._decoder_in_rest_pickle:
+            return pos
+        x = 0
+        for i in bytes(ds):
+            x = x<<8 | i
+        return (x<<64) | pos
+
+    def _decode_decoder_state(self, pos):
+        x, pos = divmod(pos, 1<<64)
+        if not x:
+            return None, pos
+        b = b""
+        while x:
+            b.append(x&0xff)
+            x >>= 8
+        return str(b[::-1]), pos
+
+    def tell(self):
+        if not self._seekable:
+            raise IOError("Underlying stream is not seekable")
+        self.flush()
+        if self._decoder is None or self._snapshot is None:
+            assert self._pending == ""
+            return self.buffer.tell()
+        position, decoder_state, readahead = self._snapshot
+        decoder = pickle.loads(decoder_state)
+        characters = ""
+        sequence = []
+        for i, b in enumerate(readahead):
+            c = decoder.decode(bytes([b]))
+            if c:
+                characters += c
+                sequence.append((characters, i+1, pickle.dumps(decoder, 2)))
+        for ch, i, st in sequence:
+            if ch + self._pending == characters:
+                return self._encode_decoder_state(st, position + i)
+        raise IOError("Can't reconstruct logical file position")
+
+    def seek(self, pos, whence=0):
+        if not self._seekable:
+            raise IOError("Underlying stream is not seekable")
+        if whence == 1:
+            if pos != 0:
+                raise IOError("Can't do nonzero cur-relative seeks")
+            return self.tell()
+        if whence == 2:
+            if pos != 0:
+                raise IOError("Can't do nonzero end-relative seeks")
+            self.flush()
+            pos = self.buffer.seek(0, 2)
+            self._snapshot = None
+            self._pending = ""
+            self._decoder = None
+            return pos
+        if whence != 0:
+            raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
+                             (whence,))
+        if pos < 0:
+            raise ValueError("Negative seek position %r" % (pos,))
+        orig_pos = pos
+        ds, pos = self._decode_decoder_state(pos)
+        if not ds:
+            self.buffer.seek(pos)
+            self._snapshot = None
+            self._pending = ""
+            self._decoder = None
+            return pos
+        decoder = pickle.loads(ds)
+        self.buffer.seek(pos)
+        self._snapshot = (pos, ds, "")
+        self._pending = ""
+        self._decoder = None
+        return orig_pos
+
     def read(self, n: int = -1):
         decoder = self._decoder or self._get_decoder()
         res = self._pending
         if n < 0:
             res += decoder.decode(self.buffer.read(), True)
             self._pending = ""
+            self._snapshot = None
             return res
         else:
             while len(res) < n:
-                data = self.buffer.read(64)
+                data = self._read_chunk()
                 res += decoder.decode(data, not data)
                 if not data:
                     break
@@ -923,7 +1043,7 @@
 
     def readline(self, limit=None):
         if limit is not None:
-            # XXX Hack to support limit arg
+            # XXX Hack to support limit argument, for backwards compatibility
             line = self.readline()
             if len(line) <= limit:
                 return line
@@ -951,7 +1071,7 @@
 
                 # We've seen \r - is it standalone, \r\n or \r at end of line?
                 if endpos + 1 < len(line):
-                    if line[endpos+1] == '\n':
+                    if line[endpos+1] == "\n":
                         ending = "\r\n"
                     else:
                         ending = "\r"
@@ -963,7 +1083,7 @@
 
             # No line ending seen yet - get more data
             while True:
-                data = self.buffer.read(64)
+                data = self._read_chunk()
                 more_line = decoder.decode(data, not data)
                 if more_line or not data:
                     break

Modified: python/branches/p3yk/Lib/test/test_io.py
==============================================================================
--- python/branches/p3yk/Lib/test/test_io.py	(original)
+++ python/branches/p3yk/Lib/test/test_io.py	Wed Apr 11 03:09:03 2007
@@ -93,6 +93,32 @@
         self.assertEqual(f.truncate(12), 12)
         self.assertEqual(f.tell(), 12)
 
+    def read_ops(self, f, buffered=False):
+        data = f.read(5)
+        self.assertEqual(data, b"hello")
+        self.assertEqual(f.readinto(data), 5)
+        self.assertEqual(data, b" worl")
+        self.assertEqual(f.readinto(data), 2)
+        self.assertEqual(len(data), 5)
+        self.assertEqual(data[:2], b"d\n")
+        self.assertEqual(f.seek(0), 0)
+        self.assertEqual(f.read(20), b"hello world\n")
+        self.assertEqual(f.read(1), b"")
+        self.assertEqual(f.readinto(b"x"), 0)
+        self.assertEqual(f.seek(-6, 2), 6)
+        self.assertEqual(f.read(5), b"world")
+        self.assertEqual(f.read(0), b"")
+        self.assertEqual(f.readinto(b""), 0)
+        self.assertEqual(f.seek(-6, 1), 5)
+        self.assertEqual(f.read(5), b" worl")
+        self.assertEqual(f.tell(), 10)
+        if buffered:
+            f.seek(0)
+            self.assertEqual(f.read(), b"hello world\n")
+            f.seek(6)
+            self.assertEqual(f.read(), b"world\n")
+            self.assertEqual(f.read(), b"")
+
     LARGE = 2**31
 
     def large_file_ops(self, f):
@@ -112,24 +138,6 @@
         self.assertEqual(f.seek(-1, 2), self.LARGE)
         self.assertEqual(f.read(2), b"x")
 
-    def read_ops(self, f):
-        data = f.read(5)
-        self.assertEqual(data, b"hello")
-        n = f.readinto(data)
-        self.assertEqual(n, 5)
-        self.assertEqual(data, b" worl")
-        n = f.readinto(data)
-        self.assertEqual(n, 2)
-        self.assertEqual(len(data), 5)
-        self.assertEqual(data[:2], b"d\n")
-        f.seek(0)
-        self.assertEqual(f.read(20), b"hello world\n")
-        f.seek(-6, 2)
-        self.assertEqual(f.read(5), b"world")
-        f.seek(-6, 1)
-        self.assertEqual(f.read(5), b" worl")
-        self.assertEqual(f.tell(), 10)
-
     def test_raw_file_io(self):
         f = io.open(test_support.TESTFN, "wb", buffering=0)
         self.assertEqual(f.readable(), False)
@@ -155,7 +163,7 @@
         self.assertEqual(f.readable(), True)
         self.assertEqual(f.writable(), False)
         self.assertEqual(f.seekable(), True)
-        self.read_ops(f)
+        self.read_ops(f, True)
         f.close()
 
     def test_raw_bytes_io(self):
@@ -164,7 +172,7 @@
         data = f.getvalue()
         self.assertEqual(data, b"hello world\n")
         f = io.BytesIO(data)
-        self.read_ops(f)
+        self.read_ops(f, True)
 
     def test_large_file_ops(self):
         # On Windows and Mac OSX this test consumes large resources; it takes
@@ -445,6 +453,10 @@
 
 
 class TextIOWrapperTest(unittest.TestCase):
+
+##     def tearDown(self):
+##         test_support.unlink(test_support.TESTFN)
+
     def testNewlines(self):
         input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ]
 
@@ -486,6 +498,62 @@
                             self.assertEquals(got_line, exp_line)
                         self.assertEquals(len(got_lines), len(exp_lines))
 
+    # Systematic tests of the text I/O API
+
+    def testBasicIO(self):
+        for chunksize in (1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65):
+            for enc in "ascii", "latin1", "utf8" :# , "utf-16-be", "utf-16-le":
+                f = io.open(test_support.TESTFN, "w+", encoding=enc)
+                f._CHUNK_SIZE = chunksize
+                self.assertEquals(f.write("abc"), 3)
+                f.close()
+                f = io.open(test_support.TESTFN, "r+", encoding=enc)
+                f._CHUNK_SIZE = chunksize
+                self.assertEquals(f.tell(), 0)
+                self.assertEquals(f.read(), "abc")
+                cookie = f.tell()
+                self.assertEquals(f.seek(0), 0)
+                self.assertEquals(f.read(2), "ab")
+                self.assertEquals(f.read(1), "c")
+                self.assertEquals(f.read(1), "")
+                self.assertEquals(f.read(), "")
+                self.assertEquals(f.tell(), cookie)
+                self.assertEquals(f.seek(0), 0)
+                self.assertEquals(f.seek(0, 2), cookie)
+                self.assertEquals(f.write("def"), 3)
+                self.assertEquals(f.seek(cookie), cookie)
+                self.assertEquals(f.read(), "def")
+                if enc.startswith("utf"):
+                    self.multi_line_test(f, enc)
+                f.close()
+
+    def multi_line_test(self, f, enc):
+        f.seek(0)
+        f.truncate()
+        sample = u"s\xff\u0fff\uffff"
+        wlines = []
+        for size in (0, 1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65,
+                     100, 200, 300, 400, 500, 1000):
+            chars = []
+            for i in xrange(size):
+                chars.append(sample[i % len(sample)])
+            line = u"".join(chars) + "\n"
+            wlines.append((f.tell(), line))
+            f.write(line)
+        wendpos = f.tell()
+        f.seek(0)
+        rlines = []
+        while True:
+            pos = f.tell()
+            line = f.readline()
+            if not line:
+                rendpos = pos
+                break
+            rlines.append((pos, line))
+        self.assertEquals(rendpos, wendpos)
+        self.assertEquals(rlines, wlines)
+
+
 # XXX Tests for open()
 
 def test_main():


More information about the Python-3000-checkins mailing list