[Python-checkins] python/dist/src/Lib codecs.py,1.35,1.35.2.1

doerwalter at users.sourceforge.net doerwalter at users.sourceforge.net
Tue Dec 21 23:35:26 CET 2004


Update of /cvsroot/python/python/dist/src/Lib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv4847/Lib

Modified Files:
      Tag: release24-maint
	codecs.py 
Log Message:
Backport of checkin:
The changes to the stateful codecs in 2.4 resulted in StreamReader.readline()
trying to return a complete line even if a size parameter was given (see
http://www.python.org/sf/1076985). This leads to buffer overflows with long
source lines under Windows if e.g. cp1252 is used as the source encoding.
This patch restores readline() behaviour that is closer to Python 2.3: if a
size parameter is given, read() is called only once.

As a side effect of this, readline() now supports all types of linebreaks
supported by unicode.splitlines().

Note that the tokenizer is still broken and it's possible to provoke segfaults
(see http://www.python.org/sf/1089395).


Index: codecs.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/codecs.py,v
retrieving revision 1.35
retrieving revision 1.35.2.1
diff -u -d -r1.35 -r1.35.2.1
--- codecs.py	17 Oct 2004 23:51:21 -0000	1.35
+++ codecs.py	21 Dec 2004 22:35:23 -0000	1.35.2.1
@@ -230,6 +230,7 @@
         self.errors = errors
         self.bytebuffer = ""
         self.charbuffer = u""
+        self.atcr = False
 
     def decode(self, input, errors='strict'):
         raise NotImplementedError
@@ -256,41 +257,39 @@
             definition of the encoding and the given size, e.g.  if
             optional encoding endings or state markers are available
             on the stream, these should be read too.
-
         """
         # read until we get the required number of characters (if available)
-        done = False
         while True:
             # can the request can be satisfied from the character buffer?
             if chars < 0:
                 if self.charbuffer:
-                    done = True
+                    break
             else:
                 if len(self.charbuffer) >= chars:
-                    done = True
-            if done:
-                if chars < 0:
-                    result = self.charbuffer
-                    self.charbuffer = u""
-                    break
-                else:
-                    result = self.charbuffer[:chars]
-                    self.charbuffer = self.charbuffer[chars:]
                     break
             # we need more data
             if size < 0:
                 newdata = self.stream.read()
             else:
                 newdata = self.stream.read(size)
+            # decode bytes (those remaining from the last call included)
             data = self.bytebuffer + newdata
-            object, decodedbytes = self.decode(data, self.errors)
+            newchars, decodedbytes = self.decode(data, self.errors)
             # keep undecoded bytes until the next call
             self.bytebuffer = data[decodedbytes:]
             # put new characters in the character buffer
-            self.charbuffer += object
+            self.charbuffer += newchars
             # there was no data available
             if not newdata:
-                done = True
+                break
+        if chars < 0:
+            # Return everything we've got
+            result = self.charbuffer
+            self.charbuffer = u""
+        else:
+            # Return the first chars characters
+            result = self.charbuffer[:chars]
+            self.charbuffer = self.charbuffer[chars:]
         return result
 
     def readline(self, size=None, keepends=True):
@@ -302,24 +301,36 @@
             read() method.
 
         """
-        if size is None:
-            size = 10
+        readsize = size or 72
         line = u""
+        # If size is given, we call read() only once
         while True:
-            data = self.read(size)
+            data = self.read(readsize)
+            if self.atcr and data.startswith(u"\n"):
+                data = data[1:]
+            if data:
+                self.atcr = data.endswith(u"\r")
             line += data
-            pos = line.find("\n")
-            if pos>=0:
-                self.charbuffer = line[pos+1:] + self.charbuffer
-                if keepends:
-                    line = line[:pos+1]
-                else:
-                    line = line[:pos]
-                return line
-            elif not data:
-                return line
-            if size<8000:
-                size *= 2
+            lines = line.splitlines(True)
+            if lines:
+                line0withend = lines[0]
+                line0withoutend = lines[0].splitlines(False)[0]
+                if line0withend != line0withoutend: # We really have a line end
+                    # Put the rest back together and keep it until the next call
+                    self.charbuffer = u"".join(lines[1:]) + self.charbuffer
+                    if keepends:
+                        line = line0withend
+                    else:
+                        line = line0withoutend
+                break
+            # we didn't get anything or this was our only try
+            elif not data or size is not None:
+                if line and not keepends:
+                    line = line.splitlines(False)[0]
+                break
+            if readsize<8000:
+                readsize *= 2
+        return line
 
     def readlines(self, sizehint=None, keepends=True):
 



More information about the Python-checkins mailing list