[Python-checkins] python/dist/src/Lib codecs.py,1.33,1.34

doerwalter at users.sourceforge.net doerwalter at users.sourceforge.net
Tue Sep 7 22:24:23 CEST 2004


Update of /cvsroot/python/python/dist/src/Lib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv7801/Lib

Modified Files:
	codecs.py 
Log Message:
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support
decoding incomplete input (when the input stream is temporarily exhausted).
codecs.StreamReader now implements buffering, which enables proper
readline support for the UTF-16 decoders. codecs.StreamReader.read()
has a new argument chars which specifies the number of characters to
return. codecs.StreamReader.readline() and codecs.StreamReader.readlines()
have a new argument keepends. Trailing "\n"s will be stripped from the lines
if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and
PyUnicode_DecodeUTF16Stateful.


Index: codecs.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/codecs.py,v
retrieving revision 1.33
retrieving revision 1.34
diff -u -d -r1.33 -r1.34
--- codecs.py	26 Feb 2004 15:22:17 -0000	1.33
+++ codecs.py	7 Sep 2004 20:24:04 -0000	1.34
@@ -228,12 +228,22 @@
         """
         self.stream = stream
         self.errors = errors
+        self.bytebuffer = ""
+        self.charbuffer = u""
 
-    def read(self, size=-1):
+    def decode(self, input, errors='strict'):
+        raise NotImplementedError
+
+    def read(self, size=-1, chars=-1):
 
         """ Decodes data from the stream self.stream and returns the
             resulting object.
 
+            chars indicates the number of characters to read from the
+            stream. read() will never return more than chars
+            characters, but it might return less, if there are not enough
+            characters available.
+
             size indicates the approximate maximum number of bytes to
             read from the stream for decoding purposes. The decoder
             can modify this setting as appropriate. The default value
@@ -248,54 +258,70 @@
             on the stream, these should be read too.
 
         """
-        # Unsliced reading:
-        if size < 0:
-            return self.decode(self.stream.read(), self.errors)[0]
-
-        # Sliced reading:
-        read = self.stream.read
-        decode = self.decode
-        data = read(size)
-        i = 0
-        while 1:
-            try:
-                object, decodedbytes = decode(data, self.errors)
-            except ValueError, why:
-                # This method is slow but should work under pretty much
-                # all conditions; at most 10 tries are made
-                i = i + 1
-                newdata = read(1)
-                if not newdata or i > 10:
-                    raise
-                data = data + newdata
+        # read until we get the required number of characters (if available)
+        done = False
+        while True:
+            # can the request can be satisfied from the character buffer?
+            if chars < 0:
+                if self.charbuffer:
+                    done = True
             else:
-                return object
+                if len(self.charbuffer) >= chars:
+                    done = True
+            if done:
+                if chars < 0:
+                    result = self.charbuffer
+                    self.charbuffer = u""
+                    break
+                else:
+                    result = self.charbuffer[:chars]
+                    self.charbuffer = self.charbuffer[chars:]
+                    break
+            # we need more data
+            if size < 0:
+                newdata = self.stream.read()
+            else:
+                newdata = self.stream.read(size)
+            data = self.bytebuffer + newdata
+            object, decodedbytes = self.decode(data, self.errors)
+            # keep undecoded bytes until the next call
+            self.bytebuffer = data[decodedbytes:]
+            # put new characters in the character buffer
+            self.charbuffer += object
+            # there was no data available
+            if not newdata:
+                done = True
+        return result
 
-    def readline(self, size=None):
+    def readline(self, size=None, keepends=True):
 
         """ Read one line from the input stream and return the
             decoded data.
 
-            Note: Unlike the .readlines() method, this method inherits
-            the line breaking knowledge from the underlying stream's
-            .readline() method -- there is currently no support for
-            line breaking using the codec decoder due to lack of line
-            buffering. Subclasses should however, if possible, try to
-            implement this method using their own knowledge of line
-            breaking.
-
-            size, if given, is passed as size argument to the stream's
-            .readline() method.
+            size, if given, is passed as size argument to the
+            read() method.
 
         """
         if size is None:
-            line = self.stream.readline()
-        else:
-            line = self.stream.readline(size)
-        return self.decode(line, self.errors)[0]
-
+            size = 10
+        line = u""
+        while True:
+            data = self.read(size)
+            line += data
+            pos = line.find("\n")
+            if pos>=0:
+                self.charbuffer = line[pos+1:] + self.charbuffer
+                if keepends:
+                    line = line[:pos+1]
+                else:
+                    line = line[:pos]
+                return line
+            elif not data:
+                return line
+            if size<8000:
+                size *= 2
 
-    def readlines(self, sizehint=None):
+    def readlines(self, sizehint=None, keepends=True):
 
         """ Read all lines available on the input stream
             and return them as list of lines.
@@ -307,8 +333,8 @@
             way to finding the true end-of-line.
 
         """
-        data = self.stream.read()
-        return self.decode(data, self.errors)[0].splitlines(1)
+        data = self.read()
+        return self.splitlines(keepends)
 
     def reset(self):
 



More information about the Python-checkins mailing list