[Python-checkins] r85738 - in python/branches/issue4388: Modules/python.c Objects/unicodeobject.c

Wed Oct 20 13:55:43 CEST 2010

Author: victor.stinner
Date: Wed Oct 20 13:55:43 2010
New Revision: 85738

Log:
Create _Py_DecodeUTF8_surrogateescape() to decode the command line arguments
from UTF-8 with surrogateescape error handler on Mac OS X


Modified:
   python/branches/issue4388/Modules/python.c
   python/branches/issue4388/Objects/unicodeobject.c

Modified: python/branches/issue4388/Modules/python.c
==============================================================================

--- python/branches/issue4388/Modules/python.c	(original)
+++ python/branches/issue4388/Modules/python.c	Wed Oct 20 13:55:43 2010
@@ -15,6 +15,10 @@
 }
 #else
 
+#ifdef __APPLE__
+extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
+#endif
+
 int
 main(int argc, char **argv)
 {
@@ -41,7 +45,11 @@
     oldloc = strdup(setlocale(LC_ALL, NULL));
     setlocale(LC_ALL, "");
     for (i = 0; i < argc; i++) {
+#ifdef __APPLE__
+        argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i]));
+#else
         argv_copy[i] = _Py_char2wchar(argv[i], NULL);
+#endif
         if (!argv_copy[i])
             return 1;
         argv_copy2[i] = argv_copy[i];

Modified: python/branches/issue4388/Objects/unicodeobject.c
==============================================================================
--- python/branches/issue4388/Objects/unicodeobject.c	(original)
+++ python/branches/issue4388/Objects/unicodeobject.c	Wed Oct 20 13:55:43 2010
@@ -2716,6 +2716,123 @@
 
 #undef ASCII_CHAR_MASK
 
+#ifdef __APPLE__
+
+/* UTF-8 decoder using surrogateescape error handler,
+   used to decode the command line arguments on Mac OS X */
+
+wchar_t*
+_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
+{
+    int n;
+    const char *e;
+    wchar_t *unicode, *p;
+
+    /* Note: size will always be longer than the resulting Unicode
+       character count */
+    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
+    if (!unicode)
+        return NULL;
+    if (size == 0)
+        return unicode;
+
+    /* Unpack UTF-8 encoded data */
+    p = unicode;
+    e = s + size;
+
+    while (s < e) {
+        Py_UCS4 ch = (unsigned char)*s;
+
+        if (ch < 0x80) {
+            *p++ = (wchar_t)ch;
+            s++;
+            continue;
+        }
+
+        n = utf8_code_length[ch];
+        if (s + n > e) {
+            goto surrogateescape;
+        }
+
+        switch (n) {
+        case 0:
+        case 1:
+            goto surrogateescape;
+
+        case 2:
+            if ((s[1] & 0xc0) != 0x80)
+                goto surrogateescape;
+            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            *p++ = (wchar_t)ch;
+            break;
+
+        case 3:
+            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
+               will result in surrogates in range d800-dfff. Surrogates are
+               not valid UTF-8 so they are rejected.
+               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xE0 &&
+                 (unsigned char)s[1] < 0xA0) ||
+                ((unsigned char)s[0] == 0xED &&
+                 (unsigned char)s[1] > 0x9F)) {
+
+                goto surrogateescape;
+            }
+            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
+            *p++ = (Py_UNICODE)ch;
+            break;
+
+        case 4:
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xF0 &&
+                 (unsigned char)s[1] < 0x90) ||
+                ((unsigned char)s[0] == 0xF4 &&
+                 (unsigned char)s[1] > 0x8F)) {
+                goto surrogateescape;
+            }
+            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+
+#if SIZEOF_WCHAR_T == 4
+            *p++ = (wchar_t)ch;
+#else
+            /*  compute and append the two surrogates: */
+
+            /*  translate from 10000..10FFFF to 0..FFFF */
+            ch -= 0x10000;
+
+            /*  high surrogate = top 10 bits added to D800 */
+            *p++ = (wchar_t)(0xD800 + (ch >> 10));
+
+            /*  low surrogate = bottom 10 bits added to DC00 */
+            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
+#endif
+            break;
+        }
+        s += n;
+        continue;
+
+      surrogateescape:
+        *p++ = 0xDC00 + ch;
+        s++;
+    }
+    *p = L'\0';
+    return unicode;
+}
+#endif /* __APPLE__ */
+
 
 /* Allocation strategy:  if the string is short, convert into a stack buffer
    and allocate exactly as much space needed at the end.  Else allocate the