[Python-checkins] python/dist/src/Objects unicodeobject.c,2.145,2.146

Sat, 27 Apr 2002 11:03:29 -0700

Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv30259

Modified Files:
	unicodeobject.c 
Log Message:
SF patch 549375:  Compromise PyUnicode_EncodeUTF8
This implements ideas from Marc-Andre, Martin, Guido and me on Python-Dev.

"Short" Unicode strings are encoded into a "big enough" stack buffer,
then exactly as much string space as they turn out to need is allocated
at the end.  This should have speed benefits akin to Martin's "measure
once, allocate once" strategy, but without needing a distinct measuring
pass.

"Long" Unicode strings allocate as much heap space as they could possibly
need (4 bytes per Unicode character), and do a realloc at the end to return
the untouched excess.  Since the overallocation is likely to be substantial,
this shouldn't burden the platform realloc with unusably small excess
blocks.

Also simplified uses of the PyString_xyz functions.  Also added a release-
build check that 4*size doesn't overflow a C int.  Sooner or later, that's
going to happen.

Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.145
retrieving revision 2.146
diff -C2 -d -r2.145 -r2.146
*** unicodeobject.c	22 Apr 2002 19:00:10 -0000	2.145
--- unicodeobject.c	27 Apr 2002 18:03:26 -0000	2.146
***************
*** 1139,1182 ****
  }

! /* Not used anymore, now that the encoder supports UTF-16
!    surrogates. */
! #if 0
! static
! int utf8_encoding_error(const Py_UNICODE **source,
! 			char **dest,
! 			const char *errors,
! 			const char *details) 
! {
!     if ((errors == NULL) ||
! 	(strcmp(errors,"strict") == 0)) {
! 	PyErr_Format(PyExc_UnicodeError,
! 		     "UTF-8 encoding error: %.400s",
! 		     details);
! 	return -1;
!     }
!     else if (strcmp(errors,"ignore") == 0) {
! 	return 0;
!     }
!     else if (strcmp(errors,"replace") == 0) {
! 	**dest = '?';
! 	(*dest)++;
! 	return 0;
!     }
!     else {
! 	PyErr_Format(PyExc_ValueError,
! 		     "UTF-8 encoding error; "
! 		     "unknown error handling code: %.400s",
! 		     errors);
! 	return -1;
!     }
! }
! #endif
! 
! /* Allocation strategy: we default to Latin-1, then do one resize
!    whenever we hit an order boundary. The assumption is that
!    characters from higher orders usually occur often enough to warrant
!    this.
  */
- 
  PyObject *
  PyUnicode_EncodeUTF8(const Py_UNICODE *s,
--- 1139,1147 ----
  }

! /* Allocation strategy:  if the string is short, convert into a stack buffer
!    and allocate exactly as much space as is needed at the end.  Else allocate
!    the maximum possibly needed (4 result bytes per Unicode character), and
!    return the excess memory at the end.
  */
  PyObject *
  PyUnicode_EncodeUTF8(const Py_UNICODE *s,
***************
*** 1184,1278 ****
  		     const char *errors)
  {
!     PyObject *v;
!     char *p;
!     int len;
!     int i = 0;
!     long overalloc = 2;
!     int nallocated;  /* overalloc * size; PyString_ adds one more for \0 */

!     /* Short-cut for empty strings */
!     if (size == 0)
! 	return PyString_FromStringAndSize(NULL, 0);

!     nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
!     v = PyString_FromStringAndSize(NULL, nallocated);
!     if (v == NULL)
!         return NULL;

!     p = PyString_AS_STRING(v);

!     while (i < size) {
          Py_UCS4 ch = s[i++];

          if (ch < 0x80)
! 	    /* Encode ASCII */
              *p++ = (char) ch;

          else if (ch < 0x0800) {
! 	    /* Encode Latin-1 */
              *p++ = (char)(0xc0 | (ch >> 6));
              *p++ = (char)(0x80 | (ch & 0x3f));
          }
- 
          else {
! 	    /* Encode UCS2 Unicode ordinals */
! 	    if (ch < 0x10000) {
! 
! 		/* Special case: check for high surrogate */
! 		if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
! 		    Py_UCS4 ch2 = s[i];
! 		    /* Check for low surrogate and combine the two to
! 		       form a UCS4 value */
! 		    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                          ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
! 			i++;
! 			goto encodeUCS4;
                      }
! 		    /* Fall through: handles isolated high surrogates */
                  }
- 
- 		if (overalloc < 3) {
- 		    len = Py_SAFE_DOWNCAST(p-PyString_AS_STRING(v), long, int);
-                     assert(len <= nallocated);
- 		    overalloc = 3;
-                     nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
- 		    if (_PyString_Resize(&v, nallocated))
- 			goto onError;
- 		    p = PyString_AS_STRING(v) + len;
- 		}
                  *p++ = (char)(0xe0 | (ch >> 12));
! 		*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
! 		*p++ = (char)(0x80 | (ch & 0x3f));
! 		continue;
! 	    }
! 
! 	    /* Encode UCS4 Unicode ordinals */
! 	encodeUCS4:
! 	    if (overalloc < 4) {
!                 len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
!                 assert(len <= nallocated);
! 		overalloc = 4;
!                 nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
! 		if (_PyString_Resize(&v, nallocated))
! 		    goto onError;
! 		p = PyString_AS_STRING(v) + len;
! 	    }
! 	    *p++ = (char)(0xf0 | (ch >> 18));
! 	    *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
! 	    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
! 	    *p++ = (char)(0x80 | (ch & 0x3f));
! 	}
      }

!     *p = '\0';
!     len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
!     assert(len <= nallocated);
!     if (_PyString_Resize(&v, len))
! 	goto onError;
      return v;

!  onError:
!     Py_DECREF(v);
!     return NULL;
  }

--- 1149,1240 ----
  		     const char *errors)
  {
! #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */

!     int i;              /* index into s of next input character */
!     PyObject *v;        /* result string object */
!     char *p;            /* next free byte in output buffer */
!     int nallocated;     /* number of result bytes allocated */
!     int nneeded;        /* number of result bytes needed */
!     char stackbuf[MAX_SHORT_UNICHARS * 4];

!     assert(s != NULL);
!     assert(size >= 0);

!     if (size <= MAX_SHORT_UNICHARS) {
!         /* Write into the stack buffer; nallocated can't overflow.
!          * At the end, we'll allocate exactly as much heap space as it
!          * turns out we need.
!          */
!         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
!         v = NULL;   /* will allocate after we're done */
!         p = stackbuf;
!     }
!     else {
!         /* Overallocate on the heap, and give the excess back at the end. */
!         nallocated = size * 4;
!         if (nallocated / 4 != size)  /* overflow! */
!             return PyErr_NoMemory();
!         v = PyString_FromStringAndSize(NULL, nallocated);
!         if (v == NULL)
!             return NULL;
!         p = PyString_AS_STRING(v);
!     }

!     for (i = 0; i < size;) {
          Py_UCS4 ch = s[i++];

          if (ch < 0x80)
!             /* Encode ASCII */
              *p++ = (char) ch;

          else if (ch < 0x0800) {
!             /* Encode Latin-1 */
              *p++ = (char)(0xc0 | (ch >> 6));
              *p++ = (char)(0x80 | (ch & 0x3f));
          }
          else {
!             /* Encode UCS2 Unicode ordinals */
!             if (ch < 0x10000) {
!                 /* Special case: check for high surrogate */
!                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
!                     Py_UCS4 ch2 = s[i];
!                     /* Check for low surrogate and combine the two to
!                        form a UCS4 value */
!                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                          ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
!                         i++;
!                         goto encodeUCS4;
                      }
!                     /* Fall through: handles isolated high surrogates */
                  }
                  *p++ = (char)(0xe0 | (ch >> 12));
!                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
!                 *p++ = (char)(0x80 | (ch & 0x3f));
!                 continue;
!     	    }
! encodeUCS4:
!             /* Encode UCS4 Unicode ordinals */
!             *p++ = (char)(0xf0 | (ch >> 18));
!             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
!             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
!             *p++ = (char)(0x80 | (ch & 0x3f));
!         }
      }

!     if (v == NULL) {
!         /* This was stack allocated. */
!         nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
!         assert(nneeded <= nallocated);
!         v = PyString_FromStringAndSize(stackbuf, nneeded);
!     }
!     else {
!     	/* Cut back to size actually needed. */
!         nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
!         assert(nneeded <= nallocated);
!         _PyString_Resize(&v, nneeded);
!     }
      return v;

! #undef MAX_SHORT_UNICHARS
  }