[Python-checkins] python/dist/src/Objects unicodeobject.c,2.124.6.13,2.124.6.14
lemburg@users.sourceforge.net
lemburg@users.sourceforge.net
Tue, 24 Sep 2002 07:06:57 -0700
Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv25610/Objects
Modified Files:
Tag: release22-maint
unicodeobject.c
Log Message:
Backport the UTF-8 codec from 2.3 and add a work-around to let the
UTF-8 decoder accept broken UTF-8 sequences which encode lone
high surrogates (the pre-2.2.2 versions forgot to generate the
UTF-8 prefix \xed for these).
Fixes SF bug #610783: Lone surrogates cause bad .pyc files.
Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.124.6.13
retrieving revision 2.124.6.14
diff -C2 -d -r2.124.6.13 -r2.124.6.14
*** unicodeobject.c 24 Sep 2002 09:29:44 -0000 2.124.6.13
--- unicodeobject.c 24 Sep 2002 14:06:51 -0000 2.124.6.14
***************
*** 1035,1038 ****
--- 1035,1069 ----
case 0:
+ /* Work-around for bug in Python 2.2.0 and 2.2.1: the
+ UTF-8 encoder "forgot" to add the correct \xed prefix
+ for the lone high surrogates 0xd800 - 0xdbff. */
+ if (((unsigned char)s[0] >= 0xa0) &&
+ ((unsigned char)s[0] <= 0xaf)) {
+ n = 2;
+ if (s + n > e) {
+ errmsg = "unexpected end of data";
+ goto utf8Error;
+ }
+ if ((s[0] & 0xc0) != 0x80 ||
+ (s[1] & 0xc0) != 0x80) {
+ errmsg = "invalid data";
+ goto utf8Error;
+ }
+ ch = 0xd000 + ((s[0] & 0x3f) << 6) + (s[1] & 0x3f);
+ if (ch < 0x0800) {
+ /* Note: UTF-8 encodings of surrogates are considered
+ legal UTF-8 sequences;
+
+ XXX For wide builds (UCS-4) we should probably try
+ to recombine the surrogates into a single code
+ unit.
+ */
+ errmsg = "illegal encoding";
+ goto utf8Error;
+ }
+ else
+ *p++ = (Py_UNICODE)ch;
+ break;
+ }
errmsg = "unexpected code byte";
goto utf8Error;
***************
*** 1063,1072 ****
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
! if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
errmsg = "illegal encoding";
goto utf8Error;
}
else
! *p++ = (Py_UNICODE)ch;
break;
--- 1094,1110 ----
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
! if (ch < 0x0800) {
! /* Note: UTF-8 encodings of surrogates are considered
! legal UTF-8 sequences;
!
! XXX For wide builds (UCS-4) we should probably try
! to recombine the surrogates into a single code
! unit.
! */
errmsg = "illegal encoding";
goto utf8Error;
}
else
! *p++ = (Py_UNICODE)ch;
break;
***************
*** 1082,1088 ****
/* validate and convert to UTF-16 */
if ((ch < 0x10000) /* minimum value allowed for 4
! byte encoding */
|| (ch > 0x10ffff)) /* maximum value allowed for
! UTF-16 */
{
errmsg = "illegal encoding";
--- 1120,1126 ----
/* validate and convert to UTF-16 */
if ((ch < 0x10000) /* minimum value allowed for 4
! byte encoding */
|| (ch > 0x10ffff)) /* maximum value allowed for
! UTF-16 */
{
errmsg = "illegal encoding";
***************
*** 1129,1251 ****
}
! /* Not used anymore, now that the encoder supports UTF-16
! surrogates. */
! #if 0
! static
! int utf8_encoding_error(const Py_UNICODE **source,
! char **dest,
! const char *errors,
! const char *details)
{
! if ((errors == NULL) ||
! (strcmp(errors,"strict") == 0)) {
! PyErr_Format(PyExc_UnicodeError,
! "UTF-8 encoding error: %.400s",
! details);
! return -1;
! }
! else if (strcmp(errors,"ignore") == 0) {
! return 0;
! }
! else if (strcmp(errors,"replace") == 0) {
! **dest = '?';
! (*dest)++;
! return 0;
}
else {
! PyErr_Format(PyExc_ValueError,
! "UTF-8 encoding error; "
! "unknown error handling code: %.400s",
! errors);
! return -1;
}
- }
- #endif
-
- PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
- int size,
- const char *errors)
- {
- PyObject *v;
- char *p;
- char *q;
- Py_UCS4 ch2;
- unsigned int cbAllocated = 3 * size;
- int i = 0;
-
- v = PyString_FromStringAndSize(NULL, cbAllocated);
- if (v == NULL)
- return NULL;
- if (size == 0)
- return v;
! p = q = PyString_AS_STRING(v);
! while (i < size) {
Py_UCS4 ch = s[i++];
if (ch < 0x80)
*p++ = (char) ch;
else if (ch < 0x0800) {
! *p++ = 0xc0 | (ch >> 6);
! *p++ = 0x80 | (ch & 0x3f);
}
!
! else if (ch < 0x10000) {
! /* Check for high surrogate */
! if (0xD800 <= ch && ch <= 0xDBFF) {
! if (i != size) {
! ch2 = s[i];
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
!
! if ((Py_uintptr_t)(p - q) >= (cbAllocated - 4)) {
! /* Provide enough room for some more
! surrogates */
! cbAllocated += 4*10;
! if (_PyString_Resize(&v, cbAllocated))
! goto onError;
! p = PyString_AS_STRING(v) + (p - q);
! q = PyString_AS_STRING(v);
! }
!
! /* combine the two values */
! ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
!
! *p++ = (char)((ch >> 18) | 0xf0);
! *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
i++;
}
}
- }
- else
*p++ = (char)(0xe0 | (ch >> 12));
!
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
-
- } else {
- if ((Py_uintptr_t)(p - q) >= (cbAllocated - 4)) {
- /* Provide enough room for some more
- surrogates */
- cbAllocated += 4*10;
- if (_PyString_Resize(&v, cbAllocated))
- goto onError;
- p = PyString_AS_STRING(v) + (p - q);
- q = PyString_AS_STRING(v);
- }
-
- *p++ = 0xf0 | (ch>>18);
- *p++ = 0x80 | ((ch>>12) & 0x3f);
- *p++ = 0x80 | ((ch>>6) & 0x3f);
- *p++ = 0x80 | (ch & 0x3f);
}
}
! *p = '\0';
! if (_PyString_Resize(&v, p - q))
! goto onError;
return v;
! onError:
! Py_XDECREF(v);
! return NULL;
}
--- 1167,1268 ----
}
! /* Allocation strategy: if the string is short, convert into a stack buffer
! and allocate exactly as much space needed at the end. Else allocate the
! maximum possible needed (4 result bytes per Unicode character), and return
! the excess memory at the end.
! */
! PyObject *
! PyUnicode_EncodeUTF8(const Py_UNICODE *s,
! int size,
! const char *errors)
{
! #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
!
! int i; /* index into s of next input byte */
! PyObject *v; /* result string object */
! char *p; /* next free byte in output buffer */
! int nallocated; /* number of result bytes allocated */
! int nneeded; /* number of result bytes needed */
! char stackbuf[MAX_SHORT_UNICHARS * 4];
!
! assert(s != NULL);
! assert(size >= 0);
!
! if (size <= MAX_SHORT_UNICHARS) {
! /* Write into the stack buffer; nallocated can't overflow.
! * At the end, we'll allocate exactly as much heap space as it
! * turns out we need.
! */
! nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
! v = NULL; /* will allocate after we're done */
! p = stackbuf;
}
else {
! /* Overallocate on the heap, and give the excess back at the end. */
! nallocated = size * 4;
! if (nallocated / 4 != size) /* overflow! */
! return PyErr_NoMemory();
! v = PyString_FromStringAndSize(NULL, nallocated);
! if (v == NULL)
! return NULL;
! p = PyString_AS_STRING(v);
}
! for (i = 0; i < size;) {
Py_UCS4 ch = s[i++];
+
if (ch < 0x80)
+ /* Encode ASCII */
*p++ = (char) ch;
else if (ch < 0x0800) {
! /* Encode Latin-1 */
! *p++ = (char)(0xc0 | (ch >> 6));
! *p++ = (char)(0x80 | (ch & 0x3f));
}
! else {
! /* Encode UCS2 Unicode ordinals */
! if (ch < 0x10000) {
! /* Special case: check for high surrogate */
! if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
! Py_UCS4 ch2 = s[i];
! /* Check for low surrogate and combine the two to
! form a UCS4 value */
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
! ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
i++;
+ goto encodeUCS4;
}
+ /* Fall through: handles isolated high surrogates */
}
*p++ = (char)(0xe0 | (ch >> 12));
! *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
! *p++ = (char)(0x80 | (ch & 0x3f));
! continue;
! }
! encodeUCS4:
! /* Encode UCS4 Unicode ordinals */
! *p++ = (char)(0xf0 | (ch >> 18));
! *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
}
}
!
! if (v == NULL) {
! /* This was stack allocated. */
! nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
! assert(nneeded <= nallocated);
! v = PyString_FromStringAndSize(stackbuf, nneeded);
! }
! else {
! /* Cut back to size actually needed. */
! nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
! assert(nneeded <= nallocated);
! _PyString_Resize(&v, nneeded);
! }
return v;
! #undef MAX_SHORT_UNICHARS
}