[Python-checkins] cpython: Call directly PyUnicode_DecodeUTF8Stateful() instead of PyUnicode_DecodeUTF8()

victor.stinner python-checkins at python.org
Mon Dec 12 01:25:55 CET 2011


http://hg.python.org/cpython/rev/9538a4e2ec08
changeset:   73935:9538a4e2ec08
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Sun Dec 11 21:53:09 2011 +0100
summary:
  Call PyUnicode_DecodeUTF8Stateful() directly instead of PyUnicode_DecodeUTF8()

 * Remove micro-optimizations from PyUnicode_FromStringAndSize():
   PyUnicode_DecodeUTF8Stateful() already has these optimizations (for size=0
   and for a single ASCII char).
 * Rename utf8_max_char_size_and_char_count() to utf8_scanner(), and remove a
   useless variable

files:
  Objects/unicodeobject.c |  47 ++++++++--------------------
  1 files changed, 14 insertions(+), 33 deletions(-)


diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1717,28 +1717,10 @@
                         "Negative size passed to PyUnicode_FromStringAndSize");
         return NULL;
     }
-
-    /* If the Unicode data is known at construction time, we can apply
-       some optimizations which share commonly used objects.
-       Also, this means the input must be UTF-8, so fall back to the
-       UTF-8 decoder at the end. */
-    if (u != NULL) {
-
-        /* Optimization for empty strings */
-        if (size == 0 && unicode_empty != NULL) {
-            Py_INCREF(unicode_empty);
-            return unicode_empty;
-        }
-
-        /* Single characters are shared when using this constructor.
-           Restrict to ASCII, since the input must be UTF-8. */
-        if (size == 1 && (unsigned char)*u < 128)
-            return get_latin1_char((unsigned char)*u);
-
-        return PyUnicode_DecodeUTF8(u, size, NULL);
-    }
-
-    return (PyObject *)_PyUnicode_New(size);
+    if (u != NULL)
+        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
+    else
+        return (PyObject *)_PyUnicode_New(size);
 }
 
 PyObject *
@@ -1749,15 +1731,16 @@
         PyErr_SetString(PyExc_OverflowError, "input too long");
         return NULL;
     }
-
-    return PyUnicode_FromStringAndSize(u, size);
+    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
 }
 
 PyObject *
 _PyUnicode_FromId(_Py_Identifier *id)
 {
     if (!id->object) {
-        id->object = PyUnicode_FromString(id->string);
+        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
+                                                  strlen(id->string),
+                                                  NULL, NULL);
         if (!id->object)
             return NULL;
         PyUnicode_InternInPlace(&id->object);
@@ -2443,7 +2426,7 @@
             {
                 /* UTF-8 */
                 const char *s = va_arg(count, const char*);
-                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
+                PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
                 if (!str)
                     goto fail;
                 /* since PyUnicode_DecodeUTF8 returns already flexible
@@ -2482,7 +2465,7 @@
                     *callresult++ = NULL;
                 }
                 else {
-                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
+                    str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
                     if (!str_obj)
                         goto fail;
                     if (PyUnicode_READY(str_obj)) {
@@ -2947,7 +2930,7 @@
     if (normalize_encoding(encoding, lower, sizeof(lower))) {
         if ((strcmp(lower, "utf-8") == 0) ||
             (strcmp(lower, "utf8") == 0))
-            return PyUnicode_DecodeUTF8(s, size, errors);
+            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
         else if ((strcmp(lower, "latin-1") == 0) ||
                  (strcmp(lower, "latin1") == 0) ||
                  (strcmp(lower, "iso-8859-1") == 0))
@@ -3260,7 +3243,7 @@
 #ifdef HAVE_MBCS
     return PyUnicode_DecodeMBCS(s, size, NULL);
 #elif defined(__APPLE__)
-    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
+    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
 #else
     PyInterpreterState *interp = PyThreadState_GET()->interp;
     /* Bootstrap check: if the filesystem codec is implemented in Python, we
@@ -4240,11 +4223,9 @@
    PyUnicode_DecodeUTF8Stateful.
    */
 static Py_UCS4
-utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
-                                  Py_ssize_t *unicode_size)
+utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
 {
     Py_ssize_t char_count = 0;
-    const unsigned char *p = (const unsigned char *)s;
     const unsigned char *end = p + string_size;
     const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
 
@@ -4563,7 +4544,7 @@
         return unicode_empty;
     }
 
-    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
+    maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
 
     /* When the string is ASCII only, just use memcpy and return.
        unicode_size may be != size if there is an incomplete UTF-8

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list