[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.127,2.128

M.-A. Lemburg lemburg@users.sourceforge.net
Thu, 07 Feb 2002 03:33:51 -0800


Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv8617/Objects

Modified Files:
	unicodeobject.c 
Log Message:
Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception, which caused round-trips to
fail).

Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we had better make sure it works for
all Unicode code points, including isolated surrogates).

Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates which now raise an exception.



Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.127
retrieving revision 2.128
diff -C2 -d -r2.127 -r2.128
*** unicodeobject.c	6 Feb 2002 18:20:19 -0000	2.127
--- unicodeobject.c	7 Feb 2002 11:33:49 -0000	2.128
***************
*** 1066,1075 ****
  	    }
              ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
!             if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
                  errmsg = "illegal encoding";
  		goto utf8Error;
  	    }
  	    else
! 				*p++ = (Py_UNICODE)ch;
              break;
  
--- 1066,1082 ----
  	    }
              ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
!             if (ch < 0x0800) {
! 		/* Note: UTF-8 encodings of surrogates are considered
! 		   legal UTF-8 sequences; 
! 
! 		   XXX For wide builds (UCS-4) we should probably try
! 		       to recombine the surrogates into a single code
! 		       unit.
! 		*/
                  errmsg = "illegal encoding";
  		goto utf8Error;
  	    }
  	    else
! 		*p++ = (Py_UNICODE)ch;
              break;
  
***************
*** 1085,1091 ****
              /* validate and convert to UTF-16 */
              if ((ch < 0x10000)        /* minimum value allowed for 4
!                                        byte encoding */
                  || (ch > 0x10ffff))   /* maximum value allowed for
!                                        UTF-16 */
  	    {
                  errmsg = "illegal encoding";
--- 1092,1098 ----
              /* validate and convert to UTF-16 */
              if ((ch < 0x10000)        /* minimum value allowed for 4
! 					 byte encoding */
                  || (ch > 0x10ffff))   /* maximum value allowed for
! 					 UTF-16 */
  	    {
                  errmsg = "illegal encoding";
***************
*** 1176,1184 ****
      int i = 0;
  
      v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
      if (v == NULL)
          return NULL;
-     if (size == 0)
-         return v;
  
      p = PyString_AS_STRING(v);
--- 1183,1195 ----
      int i = 0;
  
+     /* Short-cut for empty strings */
+     if (size == 0)
+ 	return PyString_FromStringAndSize(NULL, 0);
+ 
+     /* We allocate 4 more bytes to have room for at least one full
+        UTF-8 sequence; saves a few cycles in the loop below */
      v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
      if (v == NULL)
          return NULL;
  
      p = PyString_AS_STRING(v);