[Patches] Unicode Patch Set 2000-04-13

M.-A. Lemburg mal@lemburg.com
Thu, 13 Apr 2000 13:49:05 +0200



The attached patch set contains the following fixes:

Patch Set Contents:
-------------------

Lib/test/output/test_unicode:

Added the expected output for the new Unicode string concatenation test.

Python/compile.c:

Fixed a problem with Unicode string literal concatenation:
u = (u"abc" u"abc") previously dumped core.
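
As a quick illustration (not part of the patch), the following sketch
shows the behaviour the fix enables; the mixed 8-bit/Unicode cases are
taken from the new test:

    u = (u"abc" u"abc")                  # previously dumped core
    assert u == u"abcabc"
    # mixing 8-bit and Unicode literals now yields a Unicode object
    assert ("abc" u"def") == u"abcdef"
    assert ("abc" "def" u"ghi") == u"abcdefghi"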

Lib/codecs.py:

Added more documentation. Clarified some existing comments.
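
For example, the updated codecs.open() docstring now documents an
.encoding attribute on the returned file wrapper. A minimal usage
sketch (not part of the patch; it assumes a Latin-1 encoded file
'sample.txt' exists):

    import codecs
    f = codecs.open('sample.txt', 'rb', encoding='latin-1')
    print f.encoding              # extra attribute documented here
    data = f.read()               # returns the decoded (Unicode) data
    f.close()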

Lib/test/test_unicode.py:

Added a test for Unicode string concatenation.

Misc/unicode.txt:

Updated to version 1.4.
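
Version 1.4 covers mixed-type comparisons and containment tests, the
new '%s' formatting rule and the extra codec methods. A short sketch of
the documented semantics (illustration only; the results shown are what
the proposal specifies):

    assert 'a' in u'abc' and u'a' in 'abc'   # both sides coerced to Unicode
    s = '%s' % u"abc"                        # format string gets coerced,
    assert type(s) == type(u"")              # so the result is Unicode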

-- 
Marc-Andre Lemburg
______________________________________________________________________
Business:                                      http://www.lemburg.com/
Python Pages:                           http://www.lemburg.com/python/
Attached: Unicode-Implementation-2000-04-13.patch

diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x *.orig -x Demo -x CVS -x Doc -x *.orig -x .#* -x distutils -x PC -x PCbuild -x *.py -x ACKS -x *.txt -x README CVS-Python/Lib/test/output/test_unicode Python+Unicode/Lib/test/output/test_unicode
--- CVS-Python/Lib/test/output/test_unicode	Wed Apr  5 22:11:19 2000
+++ Python+Unicode/Lib/test/output/test_unicode	Thu Apr 13 13:40:53 2000
@@ -4,3 +4,4 @@
 Testing Unicode formatting strings... done.
 Testing builtin codecs... done.
 Testing standard mapping codecs... 0-127... 128-255... done.
+Testing Unicode string concatenation... done.
Only in CVS-Python/Lib/test/output: test_zipfile
diff -u -rbP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x *.orig -x Demo -x CVS -x Doc -x *.orig -x .#* -x distutils -x PC -x PCbuild -x *.py -x ACKS -x *.txt -x README CVS-Python/Python/compile.c Python+Unicode/Python/compile.c
--- CVS-Python/Python/compile.c	Tue Apr 11 11:26:15 2000
+++ Python+Unicode/Python/compile.c	Wed Apr 12 18:05:29 2000
@@ -984,11 +984,32 @@
 	REQ(CHILD(n, 0), STRING);
 	if ((v = parsestr(STR(CHILD(n, 0)))) != NULL) {
 		/* String literal concatenation */
-		for (i = 1; i < NCH(n) && v != NULL; i++) {
-			PyString_ConcatAndDel(&v, parsestr(STR(CHILD(n, i))));
+		for (i = 1; i < NCH(n); i++) {
+		    PyObject *s;
+		    s = parsestr(STR(CHILD(n, i)));
+		    if (s == NULL)
+			goto onError;
+		    if (PyString_Check(v) && PyString_Check(s)) {
+			PyString_ConcatAndDel(&v, s);
+			if (v == NULL)
+			    goto onError;
+		    }
+		    else {
+			PyObject *temp;
+			temp = PyUnicode_Concat(v, s);
+			Py_DECREF(s);
+			if (temp == NULL)
+			    goto onError;
+			Py_DECREF(v);
+			v = temp;
+		    }
 		}
 	}
 	return v;
+
+ onError:
+	Py_XDECREF(v);
+	return NULL;
 }
 
 static void
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x *.orig -x Demo -x CVS -x Doc -x *.orig -x .#* -x distutils -x PC -x PCbuild -x *.c -x *.h -x *.in -x output CVS-Python/Lib/codecs.py Python+Unicode/Lib/codecs.py
--- CVS-Python/Lib/codecs.py	Thu Apr 13 11:11:33 2000
+++ Python+Unicode/Lib/codecs.py	Thu Apr 13 13:40:21 2000
@@ -231,10 +231,13 @@
         """ Read one line from the input stream and return the
             decoded data.
 
-            Note: Unlike the .readlines() method, line breaking must
-            be implemented by the underlying stream's .readline()
-            method -- there is currently no support for line breaking
-            using the codec decoder due to lack of line buffering.
+            Note: Unlike the .readlines() method, this method inherits
+            the line breaking knowledge from the underlying stream's
+            .readline() method -- there is currently no support for
+            line breaking using the codec decoder due to lack of line
+            buffering. Subclasses should, however, try to implement
+            this method using their own knowledge of line breaking
+            where possible.
 
             size, if given, is passed as size argument to the stream's
             .readline() method.
@@ -288,6 +291,14 @@
 
 class StreamReaderWriter:
 
+    """ StreamReaderWriter instances allow wrapping streams which
+        work in both read and write modes.
+
+        The design is such that one can use the factory functions
+        returned by the codecs.lookup() function to construct the
+        instance.
+
+    """
     # Optional attributes set by the file wrappers below
     encoding = 'unknown'
 
@@ -346,6 +357,21 @@
 
 class StreamRecoder:
 
+    """ StreamRecoder instances provide a frontend - backend
+        view of encoding data.
+
+        They use the complete set of APIs returned by the
+        codecs.lookup() function to implement their task.
+
+        Data written to the stream is first decoded into an
+        intermediate format (which is dependent on the given codec
+        combination) and then written to the stream using an instance
+        of the provided Writer class.
+
+        In the other direction, data is read from the stream using a
+        Reader instance, re-encoded and then returned to the caller.
+
+    """
     # Optional attributes set by the file wrappers below
     data_encoding = 'unknown'
     file_encoding = 'unknown'
@@ -452,6 +478,11 @@
         buffering has the same meaning as for the builtin open() API.
         It defaults to line buffered.
 
+        The returned wrapped file object provides an extra attribute
+        .encoding which allows querying the encoding in use. This
+        attribute is only available if an encoding was specified as
+        a parameter.
+
     """
     if encoding is not None and \
        'b' not in mode:
@@ -487,6 +518,11 @@
 
         data_encoding and file_encoding are added to the wrapped file
         object as attributes .data_encoding and .file_encoding resp.
+
+        The returned wrapped file object provides two extra attributes
+        .data_encoding and .file_encoding which reflect the given
+        parameters of the same name. The attributes can be used for
+        introspection by Python programs.
 
     """
     if file_encoding is None:
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x *.orig -x Demo -x CVS -x Doc -x *.orig -x .#* -x distutils -x PC -x PCbuild -x *.c -x *.h -x *.in -x output CVS-Python/Lib/test/test_unicode.py Python+Unicode/Lib/test/test_unicode.py
--- CVS-Python/Lib/test/test_unicode.py	Thu Apr 13 11:11:37 2000
+++ Python+Unicode/Lib/test/test_unicode.py	Wed Apr 12 18:38:30 2000
@@ -391,3 +391,11 @@
         print '*** codec for "%s" failed: %s' % (encoding, why)
 
 print 'done.'
+
+print 'Testing Unicode string concatenation...',
+assert (u"abc" u"def") == u"abcdef"
+assert ("abc" u"def") == u"abcdef"
+assert (u"abc" "def") == u"abcdef"
+assert (u"abc" u"def" "ghi") == u"abcdefghi"
+assert ("abc" "def" u"ghi") == u"abcdefghi"
+print 'done.'
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x *.rej -x *.orig -x Demo -x CVS -x Doc -x *.orig -x .#* -x distutils -x PC -x PCbuild -x *.c -x *.h -x *.in -x output CVS-Python/Misc/unicode.txt Python+Unicode/Misc/unicode.txt
--- CVS-Python/Misc/unicode.txt	Tue Apr 11 11:25:53 2000
+++ Python+Unicode/Misc/unicode.txt	Thu Apr 13 13:40:51 2000
@@ -1,5 +1,5 @@
 =============================================================================
- Python Unicode Integration                            Proposal Version: 1.3
+ Python Unicode Integration                            Proposal Version: 1.4
 -----------------------------------------------------------------------------
 
 
@@ -162,6 +162,17 @@
 For the same reason, Unicode objects should return the same hash value
 as their UTF-8 equivalent strings.
 
+When compared using cmp() (or PyObject_Compare()) the implementation
+should mask TypeErrors raised during the conversion to remain in synch
+with the string behavior. All other errors, such as ValueErrors raised
+during coercion of strings to Unicode, should not be masked but passed
+through to the user.
+
+In containment tests ('a' in u'abc' and u'a' in 'abc') both sides
+should be coerced to Unicode before applying the test. Errors occurring
+during coercion (e.g. None in u'abc') should not be masked.
+
+
 Coercion:
 ---------
 
@@ -380,6 +391,13 @@
         data, consumed = self.encode(object,self.errors)
         self.stream.write(data)
         
+    def writelines(self, list):
+
+        """ Writes the concatenated list of strings to the stream
+            using .write().
+        """
+        self.write(''.join(list))
+        
     def reset(self):
 
         """ Flushes and resets the codec buffers used for keeping state.
@@ -463,6 +481,47 @@
             else:
                 return object
 
+    def readline(self, size=None):
+
+        """ Read one line from the input stream and return the
+            decoded data.
+
+            Note: Unlike the .readlines() method, this method inherits
+            the line breaking knowledge from the underlying stream's
+            .readline() method -- there is currently no support for
+            line breaking using the codec decoder due to lack of line
+            buffering. Subclasses should, however, try to implement
+            this method using their own knowledge of line breaking
+            where possible.
+
+            size, if given, is passed as size argument to the stream's
+            .readline() method.
+            
+        """
+        if size is None:
+            line = self.stream.readline()
+        else:
+            line = self.stream.readline(size)
+        return self.decode(line)[0]
+
+    def readlines(self, sizehint=None):
+
+        """ Read all lines available on the input stream
+            and return them as a list of lines.
+
+            Line breaks are implemented using the codec's decoder
+            method and are included in the list entries.
+            
+            sizehint, if given, is passed as size argument to the
+            stream's .read() method.
+
+        """
+        if sizehint is None:
+            data = self.stream.read()
+        else:
+            data = self.stream.read(sizehint)
+        return self.decode(data)[0].splitlines(1)
+
     def reset(self):
 
         """ Resets the codec buffers used for keeping state.
@@ -482,9 +541,6 @@
         """
         return getattr(self.stream,name)
 
-XXX What about .readline(), .readlines() ? These could be implemented
-    using .read() as generic functions instead of requiring their
-    implementation by all codecs. Also see Line Breaks.
 
 Stream codec implementors are free to combine the StreamWriter and
 StreamReader interfaces into one class. Even combining all these with
@@ -692,9 +748,10 @@
 are used as format strings, the following interpretations should be in
 effect:
 
-  '%s':                 '%s' does str(u) for Unicode objects embedded
-                        in Python strings, so the output will be
-                        u.encode(<default encoding>)
+  '%s':                 For Unicode objects this will cause coercion of
+                        the whole format string to Unicode. Note that
+                        you should use a Unicode format string to start
+                        with for performance reasons.
 
 In case the format string is an Unicode object, all parameters are coerced
 to Unicode first and then put together and formatted according to the format
@@ -922,6 +979,9 @@
 	Introducing Unicode to ECMAScript --
 	http://www-4.ibm.com/software/developer/library/internationalization-support.html
 
+IANA Character Set Names:
+	ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets
+
 Encodings:
 
     Overview:
@@ -944,6 +1004,12 @@
 
 History of this Proposal:
 -------------------------
+1.4: Added note about mixed type comparisons and containment tests.
+     Changed the treatment of Unicode objects in format strings (if
+     used with '%s' % u they will now cause the format string to be
+     coerced to Unicode, thus producing a Unicode object on return).
+     Added link to IANA charset names (thanks to Lars Marius Garshol).
+     Added new codec methods .readline(), .readlines() and .writelines().
 1.3: Added new "es" and "es#" parser markers
 1.2: Removed POD about codecs.open()
 1.1: Added note about comparisons and hash values. Added note about
