[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.97,2.98

Martin v. Löwis loewis@users.sourceforge.net
Tue, 26 Jun 2001 15:22:39 -0700


Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv31084/Objects

Modified Files:
	unicodeobject.c 
Log Message:
Support using UCS-4 as the Py_UNICODE type:
Add configure option --enable-unicode.
Add config.h macros Py_USING_UNICODE, PY_UNICODE_TYPE, Py_UNICODE_SIZE,
                    SIZEOF_WCHAR_T.
Define Py_UCS2.
Encode and decode large UTF-8 characters into single Py_UNICODE values
for wide Unicode types; likewise for UTF-16.
Remove test whether sizeof Py_UNICODE is two.


Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.97
retrieving revision 2.98
diff -C2 -r2.97 -r2.98
*** unicodeobject.c	2001/06/26 17:17:07	2.97
--- unicodeobject.c	2001/06/26 22:22:37	2.98
***************
*** 772,782 ****
                   ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
              /* validate and convert to UTF-16 */
!             if ((ch < 0x10000) ||   /* minimum value allowed for 4
                                         byte encoding */
!                 (ch > 0x10ffff)) {  /* maximum value allowed for
                                         UTF-16 */
                  errmsg = "illegal encoding";
  		goto utf8Error;
  	    }
              /*  compute and append the two surrogates: */
              
--- 772,786 ----
                   ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
              /* validate and convert to UTF-16 */
!             if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
!                 || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
+ 	    {
                  errmsg = "illegal encoding";
  		goto utf8Error;
  	    }
+ #if Py_UNICODE_SIZE == 4
+ 	    *p++ = (Py_UNICODE)ch;
+ #else
              /*  compute and append the two surrogates: */
              
***************
*** 789,792 ****
--- 793,797 ----
              /*  low surrogate = bottom 10 bits added to DC00 */
              *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
+ #endif
              break;
  
***************
*** 879,883 ****
              cbWritten += 2;
          }
!         else {
              /* Check for high surrogate */
              if (0xD800 <= ch && ch <= 0xDBFF) {
--- 884,894 ----
              cbWritten += 2;
          }
!         else if (ch < 0x10000) {
! #if Py_UNICODE_SIZE == 4
! 	    *p++ = 0xe0 | (ch>>12);
!             *p++ = 0x80 | ((ch>>6) & 0x3f);
!             *p++ = 0x80 | (ch & 0x3f);
!             cbWritten += 3;
! #else
              /* Check for high surrogate */
              if (0xD800 <= ch && ch <= 0xDBFF) {
***************
*** 910,914 ****
              *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
              *p++ = (char)(0x80 | (ch & 0x3f));
!         }
      }
      *p = '\0';
--- 921,932 ----
              *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
              *p++ = (char)(0x80 | (ch & 0x3f));
! #endif
!         } else {
!             *p++ = 0xf0 | (ch>>18);
!             *p++ = 0x80 | ((ch>>12) & 0x3f);
!             *p++ = 0x80 | ((ch>>6) & 0x3f);
!             *p++ = 0x80 | (ch & 0x3f);
!             cbWritten += 4;
! 	}
      }
      *p = '\0';
***************
*** 936,940 ****
  
  static
! int utf16_decoding_error(const Py_UNICODE **source,
  			 Py_UNICODE **dest,
  			 const char *errors,
--- 954,958 ----
  
  static
! int utf16_decoding_error(const Py_UCS2 **source,
  			 Py_UNICODE **dest,
  			 const char *errors,
***************
*** 974,983 ****
      PyUnicodeObject *unicode;
      Py_UNICODE *p;
!     const Py_UNICODE *q, *e;
      int bo = 0;
      const char *errmsg = "";
  
      /* size should be an even number */
!     if (size % sizeof(Py_UNICODE) != 0) {
  	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
  	    return NULL;
--- 992,1001 ----
      PyUnicodeObject *unicode;
      Py_UNICODE *p;
!     const Py_UCS2 *q, *e;
      int bo = 0;
      const char *errmsg = "";
  
      /* size should be an even number */
!     if (size % sizeof(Py_UCS2) != 0) {
  	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
  	    return NULL;
***************
*** 996,1001 ****
      /* Unpack UTF-16 encoded data */
      p = unicode->str;
!     q = (Py_UNICODE *)s;
!     e = q + (size / sizeof(Py_UNICODE));
  
      if (byteorder)
--- 1014,1019 ----
      /* Unpack UTF-16 encoded data */
      p = unicode->str;
!     q = (Py_UCS2 *)s;
!     e = q + (size / sizeof(Py_UCS2));
  
      if (byteorder)
***************
*** 1027,1031 ****
      
      while (q < e) {
! 	register Py_UNICODE ch = *q++;
  
  	/* Swap input bytes if needed. (This assumes
--- 1045,1049 ----
      
      while (q < e) {
! 	register Py_UCS2 ch = *q++;
  
  	/* Swap input bytes if needed. (This assumes
***************
*** 1049,1054 ****
  	}
  	if (0xDC00 <= *q && *q <= 0xDFFF) {
! 	    q++;
! 	    if (0xD800 <= *q && *q <= 0xDBFF) {
  		/* This is valid data (a UTF-16 surrogate pair), but
  		   we are not able to store this information since our
--- 1067,1080 ----
  	}
  	if (0xDC00 <= *q && *q <= 0xDFFF) {
! 	    Py_UCS2 ch2 = *q++;
! #ifdef BYTEORDER_IS_LITTLE_ENDIAN
! 	    if (bo == 1)
! 		    ch = (ch >> 8) | (ch << 8);
! #else    
! 	    if (bo == -1)
! 		    ch = (ch >> 8) | (ch << 8);
! #endif
! 	    if (0xD800 <= ch && ch <= 0xDBFF) {
! #if Py_UNICODE_SIZE == 2
  		/* This is valid data (a UTF-16 surrogate pair), but
  		   we are not able to store this information since our
***************
*** 1057,1063 ****
  		errmsg = "code pairs are not supported";
  		goto utf16Error;
! 	    }
! 	    else
  		continue;
  	}
  	errmsg = "illegal encoding";
--- 1083,1097 ----
  		errmsg = "code pairs are not supported";
  		goto utf16Error;
! #else
! 		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
  		continue;
+ #endif
+ 		
+ 	    }
+ 	    else {
+                 errmsg = "illegal UTF-16 surrogate";
+ 		goto utf16Error;
+ 	    }
+ 
  	}
  	errmsg = "illegal encoding";
***************
*** 1091,1105 ****
  {
      PyObject *v;
!     Py_UNICODE *p;
      char *q;
  
!     /* We don't create UTF-16 pairs... */
      v = PyString_FromStringAndSize(NULL, 
! 			sizeof(Py_UNICODE) * (size + (byteorder == 0)));
      if (v == NULL)
          return NULL;
  
      q = PyString_AS_STRING(v);
!     p = (Py_UNICODE *)q;
      if (byteorder == 0)
  	*p++ = 0xFEFF;
--- 1125,1142 ----
  {
      PyObject *v;
!     Py_UCS2 *p;
      char *q;
+     int i, pairs, doswap = 1;
  
!     for (i = pairs = 0; i < size; i++)
! 	if (s[i] >= 0x10000)
! 	    pairs++;
      v = PyString_FromStringAndSize(NULL, 
! 		  sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
      if (v == NULL)
          return NULL;
  
      q = PyString_AS_STRING(v);
!     p = (Py_UCS2 *)q;
      if (byteorder == 0)
  	*p++ = 0xFEFF;
***************
*** 1113,1122 ****
  #endif
  	)
! 	Py_UNICODE_COPY(p, s, size);
!     else
! 	while (size-- > 0) {
! 	    Py_UNICODE ch = *s++;
  	    *p++ = (ch >> 8) | (ch << 8);
  	}
      return v;
  }
--- 1150,1171 ----
  #endif
  	)
! 	doswap = 0;
!     while (size-- > 0) {
! 	Py_UNICODE ch = *s++;
! 	Py_UNICODE ch2 = 0;
! 	if (ch >= 0x10000) {
! 	    ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
! 	    ch  = 0xD800|((ch-0x10000)>>10);
! 	}
! 	if (doswap){
  	    *p++ = (ch >> 8) | (ch << 8);
+ 	    if (ch2)
+ 		*p++ = (ch2 >> 8) | (ch2 << 8);
+ 	}else{
+ 	    *p++ = ch;
+ 	    if(ch2)
+ 		*p++ = ch2;
  	}
+     }
      return v;
  }
***************
*** 1272,1279 ****
                  *p++ = (Py_UNICODE) chr;
              else if (chr <= 0x10ffff) {
!                 /* UCS-4 character.  store as two surrogate characters */
                  chr -= 0x10000L;
                  *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
                  *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
              } else {
                  if (unicodeescape_decoding_error(
--- 1321,1332 ----
                  *p++ = (Py_UNICODE) chr;
              else if (chr <= 0x10ffff) {
!                 /* UCS-4 character. Either store directly, or as surrogate pair. */
! #if Py_UNICODE_SIZE == 4
!                 *p++ = chr;
! #else
                  chr -= 0x10000L;
                  *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
                  *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
+ #endif
              } else {
                  if (unicodeescape_decoding_error(
***************
*** 1384,1387 ****
--- 1437,1453 ----
              *p++ = (char) ch;
          } 
+         /* Map 21-bit characters to '\U00xxxxxx' */
+         else if (ch >= 0x10000) {
+             *p++ = '\\';
+             *p++ = 'U';
+             *p++ = hexdigit[(ch >> 28) & 0xf];
+             *p++ = hexdigit[(ch >> 24) & 0xf];
+             *p++ = hexdigit[(ch >> 20) & 0xf];
+             *p++ = hexdigit[(ch >> 16) & 0xf];
+             *p++ = hexdigit[(ch >> 12) & 0xf];
+             *p++ = hexdigit[(ch >> 8) & 0xf];
+             *p++ = hexdigit[(ch >> 4) & 0xf];
+             *p++ = hexdigit[ch & 15];
+         }
          /* Map 16-bit characters to '\uxxxx' */
          else if (ch >= 256) {
***************
*** 5281,5291 ****
  {
      int i;
- 
-     /* Doublecheck the configuration... */
- #ifndef USE_UCS4_STORAGE
-     if (sizeof(Py_UNICODE) != 2)
-         Py_FatalError("Unicode configuration error: "
- 		      "sizeof(Py_UNICODE) != 2 bytes");
- #endif
  
      /* Init the implementation */
--- 5347,5350 ----