[Python-3000-checkins] r65264 - in python/branches/py3k: Lib/io.py Misc/NEWS

antoine.pitrou python-3000-checkins at python.org
Mon Jul 28 21:46:11 CEST 2008


Author: antoine.pitrou
Date: Mon Jul 28 21:46:11 2008
New Revision: 65264

Log:
#2523: binary buffered reading is quadratic



Modified:
   python/branches/py3k/Lib/io.py
   python/branches/py3k/Misc/NEWS

Modified: python/branches/py3k/Lib/io.py
==============================================================================
--- python/branches/py3k/Lib/io.py	(original)
+++ python/branches/py3k/Lib/io.py	Mon Jul 28 21:46:11 2008
@@ -893,8 +893,12 @@
         """
         raw._checkReadable()
         _BufferedIOMixin.__init__(self, raw)
-        self._read_buf = b""
         self.buffer_size = buffer_size
+        self._reset_read_buf()
+
+    def _reset_read_buf(self):
+        self._read_buf = b""
+        self._read_pos = 0
 
     def read(self, n=None):
         """Read n bytes.
@@ -904,25 +908,50 @@
         mode. If n is negative, read until EOF or until read() would
         block.
         """
-        if n is None:
-            n = -1
         nodata_val = b""
-        while n < 0 or len(self._read_buf) < n:
-            to_read = max(self.buffer_size,
-                          n if n is not None else 2*len(self._read_buf))
-            current = self.raw.read(to_read)
-            if current in (b"", None):
-                nodata_val = current
+        empty_values = (b"", None)
+        buf = self._read_buf
+        pos = self._read_pos
+
+        # Special case for when the number of bytes to read is unspecified.
+        if n is None or n == -1:
+            self._reset_read_buf()
+            chunks = [buf[pos:]]  # Strip the consumed bytes.
+            current_size = 0
+            while True:
+                # Read until EOF or until read() would block.
+                chunk = self.raw.read()
+                if chunk in empty_values:
+                    nodata_val = chunk
+                    break
+                current_size += len(chunk)
+                chunks.append(chunk)
+            return b"".join(chunks) or nodata_val
+
+        # The number of bytes to read is specified, return at most n bytes.
+        avail = len(buf) - pos  # Length of the available buffered data.
+        if n <= avail:
+            # Fast path: the data to read is fully buffered.
+            self._read_pos += n
+            return buf[pos:pos+n]
+        # Slow path: read from the stream until enough bytes are read,
+        # or until an EOF occurs or until read() would block.
+        chunks = [buf[pos:]]
+        wanted = max(self.buffer_size, n)
+        while avail < n:
+            chunk = self.raw.read(wanted)
+            if chunk in empty_values:
+                nodata_val = chunk
                 break
-            self._read_buf += current
-        if self._read_buf:
-            if n < 0:
-                n = len(self._read_buf)
-            out = self._read_buf[:n]
-            self._read_buf = self._read_buf[n:]
-        else:
-            out = nodata_val
-        return out
+            avail += len(chunk)
+            chunks.append(chunk)
+        # n is more than avail only when an EOF occurred or when
+        # read() would have blocked.
+        n = min(n, avail)
+        out = b"".join(chunks)
+        self._read_buf = out[n:]  # Save the extra data in the buffer.
+        self._read_pos = 0
+        return out[:n] if out else nodata_val
 
     def peek(self, n=0):
         """Returns buffered bytes without advancing the position.
@@ -932,13 +961,14 @@
         than self.buffer_size.
         """
         want = min(n, self.buffer_size)
-        have = len(self._read_buf)
+        have = len(self._read_buf) - self._read_pos
         if have < want:
             to_read = self.buffer_size - have
             current = self.raw.read(to_read)
             if current:
-                self._read_buf += current
-        return self._read_buf
+                self._read_buf = self._read_buf[self._read_pos:] + current
+                self._read_pos = 0
+        return self._read_buf[self._read_pos:]
 
     def read1(self, n):
         """Reads up to n bytes, with at most one read() system call."""
@@ -947,16 +977,16 @@
         if n <= 0:
             return b""
         self.peek(1)
-        return self.read(min(n, len(self._read_buf)))
+        return self.read(min(n, len(self._read_buf) - self._read_pos))
 
     def tell(self):
-        return self.raw.tell() - len(self._read_buf)
+        return self.raw.tell() - len(self._read_buf) + self._read_pos
 
     def seek(self, pos, whence=0):
         if whence == 1:
-            pos -= len(self._read_buf)
+            pos -= len(self._read_buf) - self._read_pos
         pos = self.raw.seek(pos, whence)
-        self._read_buf = b""
+        self._reset_read_buf()
         return pos
 
 
@@ -1125,14 +1155,14 @@
         # First do the raw seek, then empty the read buffer, so that
         # if the raw seek fails, we don't lose buffered data forever.
         pos = self.raw.seek(pos, whence)
-        self._read_buf = b""
+        self._reset_read_buf()
         return pos
 
     def tell(self):
-        if (self._write_buf):
+        if self._write_buf:
             return self.raw.tell() + len(self._write_buf)
         else:
-            return self.raw.tell() - len(self._read_buf)
+            return BufferedReader.tell(self)
 
     def truncate(self, pos=None):
         if pos is None:
@@ -1161,8 +1191,9 @@
 
     def write(self, b):
         if self._read_buf:
-            self.raw.seek(-len(self._read_buf), 1) # Undo readahead
-            self._read_buf = b""
+            # Undo readahead
+            self.raw.seek(self._read_pos - len(self._read_buf), 1)
+            self._reset_read_buf()
         return BufferedWriter.write(self, b)
 
 

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Mon Jul 28 21:46:11 2008
@@ -22,6 +22,9 @@
   file name rather than a ZipInfo instance, so files are extracted with
   mode 0600 rather than 000 under Unix.
 
+- Issue #2523: Fix quadratic behaviour when read()ing a binary file without
+  asking for a specific length.
+
 
 What's new in Python 3.0b2?
 ===========================


More information about the Python-3000-checkins mailing list