[Python-checkins] r64754 - in python/branches/release25-maint: Lib/socket.py Lib/test/test_socket.py Misc/NEWS

gregory.p.smith python-checkins at python.org
Sun Jul 6 06:04:08 CEST 2008


Author: gregory.p.smith
Date: Sun Jul  6 06:04:07 2008
New Revision: 64754

Log:
Backport r62627 + r62744 from trunk to fix issue 2632.
- Issue #2632: Prevent socket.read(bignumber) from over allocating memory
  in the common case when the data is returned from the underlying socket
  in increments much smaller than bignumber.


Modified:
   python/branches/release25-maint/Lib/socket.py
   python/branches/release25-maint/Lib/test/test_socket.py
   python/branches/release25-maint/Misc/NEWS

Modified: python/branches/release25-maint/Lib/socket.py
==============================================================================
--- python/branches/release25-maint/Lib/socket.py	(original)
+++ python/branches/release25-maint/Lib/socket.py	Sun Jul  6 06:04:07 2008
@@ -56,6 +56,11 @@
 import os, sys
 
 try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
+try:
     from errno import EBADF
 except ImportError:
     EBADF = 9
@@ -211,6 +216,9 @@
             bufsize = self.default_bufsize
         self.bufsize = bufsize
         self.softspace = False
+        # _rbufsize is the suggested recv buffer size.  It is *strictly*
+        # obeyed within readline() for recv calls.  If it is larger than
+        # default_bufsize it will be used for recv calls within read().
         if bufsize == 0:
             self._rbufsize = 1
         elif bufsize == 1:
@@ -218,7 +226,11 @@
         else:
             self._rbufsize = bufsize
         self._wbufsize = bufsize
-        self._rbuf = "" # A string
+        # We use StringIO for the read buffer to avoid holding a list
+        # of variously sized string objects which have been known to
+        # fragment the heap due to how they are malloc()ed and often
+        # realloc()ed down much smaller than their original allocation.
+        self._rbuf = StringIO()
         self._wbuf = [] # A list of strings
         self._close = close
 
@@ -276,56 +288,85 @@
         return buf_len
 
     def read(self, size=-1):
-        data = self._rbuf
+        # Use max, disallow tiny reads in a loop as they are very inefficient.
+        # We never leave read() with any leftover data from a new recv() call
+        # in our internal buffer.
+        rbufsize = max(self._rbufsize, self.default_bufsize)
+        # Our use of StringIO rather than lists of string objects returned by
+        # recv() minimizes memory usage and fragmentation that occurs when
+        # rbufsize is large compared to the typical return value of recv().
+        buf = self._rbuf
+        buf.seek(0, 2)  # seek end
         if size < 0:
             # Read until EOF
-            buffers = []
-            if data:
-                buffers.append(data)
-            self._rbuf = ""
-            if self._rbufsize <= 1:
-                recv_size = self.default_bufsize
-            else:
-                recv_size = self._rbufsize
+            self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.
             while True:
-                data = self._sock.recv(recv_size)
+                data = self._sock.recv(rbufsize)
                 if not data:
                     break
-                buffers.append(data)
-            return "".join(buffers)
+                buf.write(data)
+            return buf.getvalue()
         else:
             # Read until size bytes or EOF seen, whichever comes first
-            buf_len = len(data)
+            buf_len = buf.tell()
             if buf_len >= size:
-                self._rbuf = data[size:]
-                return data[:size]
-            buffers = []
-            if data:
-                buffers.append(data)
-            self._rbuf = ""
+                # Already have size bytes in our buffer?  Extract and return.
+                buf.seek(0)
+                rv = buf.read(size)
+                self._rbuf = StringIO()
+                self._rbuf.write(buf.read())
+                return rv
+
+            self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.
             while True:
                 left = size - buf_len
-                recv_size = min(self._rbufsize, left)
-                data = self._sock.recv(recv_size)
+                # recv() will malloc the amount of memory given as its
+                # parameter even though it often returns much less data
+                # than that.  The returned data string is short lived
+                # as we copy it into a StringIO and free it.  This avoids
+                # fragmentation issues on many platforms.
+                data = self._sock.recv(left)
                 if not data:
                     break
-                buffers.append(data)
                 n = len(data)
-                if n >= left:
-                    self._rbuf = data[left:]
-                    buffers[-1] = data[:left]
+                if n == size and not buf_len:
+                    # Shortcut.  Avoid buffer data copies when:
+                    # - We have no data in our buffer.
+                    # AND
+                    # - Our call to recv returned exactly the
+                    #   number of bytes we were asked to read.
+                    return data
+                if n == left:
+                    buf.write(data)
+                    del data  # explicit free
                     break
+                assert n <= left, "recv(%d) returned %d bytes" % (left, n)
+                buf.write(data)
                 buf_len += n
-            return "".join(buffers)
+                del data  # explicit free
+                #assert buf_len == buf.tell()
+            return buf.getvalue()
 
     def readline(self, size=-1):
-        data = self._rbuf
+        buf = self._rbuf
+        buf.seek(0, 2)  # seek end
+        if buf.tell() > 0:
+            # check if we already have it in our buffer
+            buf.seek(0)
+            bline = buf.readline(size)
+            if bline.endswith('\n') or len(bline) == size:
+                self._rbuf = StringIO()
+                self._rbuf.write(buf.read())
+                return bline
+            del bline
         if size < 0:
             # Read until \n or EOF, whichever comes first
             if self._rbufsize <= 1:
                 # Speed up unbuffered case
-                assert data == ""
-                buffers = []
+                buf.seek(0)
+                buffers = [buf.read()]
+                self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.
+                data = None
                 recv = self._sock.recv
                 while data != "\n":
                     data = recv(1)
@@ -333,61 +374,64 @@
                         break
                     buffers.append(data)
                 return "".join(buffers)
-            nl = data.find('\n')
-            if nl >= 0:
-                nl += 1
-                self._rbuf = data[nl:]
-                return data[:nl]
-            buffers = []
-            if data:
-                buffers.append(data)
-            self._rbuf = ""
+
+            buf.seek(0, 2)  # seek end
+            self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.
             while True:
                 data = self._sock.recv(self._rbufsize)
                 if not data:
                     break
-                buffers.append(data)
                 nl = data.find('\n')
                 if nl >= 0:
                     nl += 1
-                    self._rbuf = data[nl:]
-                    buffers[-1] = data[:nl]
+                    buf.write(buffer(data, 0, nl))
+                    self._rbuf.write(buffer(data, nl))
+                    del data
                     break
-            return "".join(buffers)
+                buf.write(data)
+            return buf.getvalue()
         else:
             # Read until size bytes or \n or EOF seen, whichever comes first
-            nl = data.find('\n', 0, size)
-            if nl >= 0:
-                nl += 1
-                self._rbuf = data[nl:]
-                return data[:nl]
-            buf_len = len(data)
+            buf.seek(0, 2)  # seek end
+            buf_len = buf.tell()
             if buf_len >= size:
-                self._rbuf = data[size:]
-                return data[:size]
-            buffers = []
-            if data:
-                buffers.append(data)
-            self._rbuf = ""
+                buf.seek(0)
+                rv = buf.read(size)
+                self._rbuf = StringIO()
+                self._rbuf.write(buf.read())
+                return rv
+            self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.
             while True:
                 data = self._sock.recv(self._rbufsize)
                 if not data:
                     break
-                buffers.append(data)
                 left = size - buf_len
+                # did we just receive a newline?
                 nl = data.find('\n', 0, left)
                 if nl >= 0:
                     nl += 1
-                    self._rbuf = data[nl:]
-                    buffers[-1] = data[:nl]
-                    break
+                    # save the excess data to _rbuf
+                    self._rbuf.write(buffer(data, nl))
+                    if buf_len:
+                        buf.write(buffer(data, 0, nl))
+                        break
+                    else:
+                        # Shortcut.  Avoid data copy through buf when returning
+                        # a substring of our first recv().
+                        return data[:nl]
                 n = len(data)
+                if n == size and not buf_len:
+                    # Shortcut.  Avoid data copy through buf when
+                    # returning exactly all of our first recv().
+                    return data
                 if n >= left:
-                    self._rbuf = data[left:]
-                    buffers[-1] = data[:left]
+                    buf.write(buffer(data, 0, left))
+                    self._rbuf.write(buffer(data, left))
                     break
+                buf.write(data)
                 buf_len += n
-            return "".join(buffers)
+                #assert buf_len == buf.tell()
+            return buf.getvalue()
 
     def readlines(self, sizehint=0):
         total = 0

Modified: python/branches/release25-maint/Lib/test/test_socket.py
==============================================================================
--- python/branches/release25-maint/Lib/test/test_socket.py	(original)
+++ python/branches/release25-maint/Lib/test/test_socket.py	Sun Jul  6 06:04:07 2008
@@ -762,6 +762,33 @@
         self.cli_file.write(MSG)
         self.cli_file.flush()
 
+    def testReadlineAfterRead(self):
+        a_baloo_is = self.serv_file.read(len("A baloo is"))
+        self.assertEqual("A baloo is", a_baloo_is)
+        _a_bear = self.serv_file.read(len(" a bear"))
+        self.assertEqual(" a bear", _a_bear)
+        line = self.serv_file.readline()
+        self.assertEqual("\n", line)
+        line = self.serv_file.readline()
+        self.assertEqual("A BALOO IS A BEAR.\n", line)
+        line = self.serv_file.readline()
+        self.assertEqual(MSG, line)
+
+    def _testReadlineAfterRead(self):
+        self.cli_file.write("A baloo is a bear\n")
+        self.cli_file.write("A BALOO IS A BEAR.\n")
+        self.cli_file.write(MSG)
+        self.cli_file.flush()
+
+    def testReadlineAfterReadNoNewline(self):
+        end_of_ = self.serv_file.read(len("End Of "))
+        self.assertEqual("End Of ", end_of_)
+        line = self.serv_file.readline()
+        self.assertEqual("Line", line)
+
+    def _testReadlineAfterReadNoNewline(self):
+        self.cli_file.write("End Of Line")
+
     def testClosedAttr(self):
         self.assert_(not self.serv_file.closed)
 

Modified: python/branches/release25-maint/Misc/NEWS
==============================================================================
--- python/branches/release25-maint/Misc/NEWS	(original)
+++ python/branches/release25-maint/Misc/NEWS	Sun Jul  6 06:04:07 2008
@@ -102,6 +102,9 @@
   stdout and stderr fds rather than leaving them open until the
   instance is destroyed.
 
+- Issue #2632: Prevent socket.read(bignumber) from over allocating memory
+  in the common case when the data is returned from the underlying socket
+  in increments much smaller than bignumber.
 
 Extension Modules
 -----------------


More information about the Python-checkins mailing list