[Python-checkins] cpython: Rewrite PyUnicode_EncodeDecimal() to use the new Unicode API

Mon Nov 21 22:51:52 CET 2011

http://hg.python.org/cpython/rev/849e9277906a
changeset:   73671:849e9277906a
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Mon Nov 21 22:52:58 2011 +0100
summary:
  Rewrite PyUnicode_EncodeDecimal() to use the new Unicode API

Add tests for PyUnicode_EncodeDecimal() and
PyUnicode_TransformDecimalToASCII().

files:
  Lib/test/test_unicode.py  |  30 ++++++++
  Modules/_testcapimodule.c |  51 +++++++++++++-
  Objects/unicodeobject.c   |  97 ++++++++++++++------------
  3 files changed, 132 insertions(+), 46 deletions(-)

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1806,6 +1806,36 @@
         s += "4"
         self.assertEqual(s, "3")
 
+    def test_encode_decimal(self):
+        # Digits and whitespace are expected to come out as ASCII bytes;
+        # U+20AC is the unencodable case run through each error handler.
+        from _testcapi import unicode_encodedecimal as encode
+        euro = "123\u20ac"
+        plain = (('123', b'123'),
+                 ('\u0663.\u0661\u0664', b'3.14'),
+                 ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '))
+        for text, wanted in plain:
+            self.assertEqual(encode(text), wanted)
+        with self.assertRaises(UnicodeEncodeError):
+            encode(euro, "strict")
+        handled = (("replace", b'123?'),
+                   ("ignore", b'123'),
+                   ("xmlcharrefreplace", b'123&#8364;'),
+                   ("backslashreplace", b'123\\u20ac'))
+        for handler, wanted in handled:
+            self.assertEqual(encode(euro, handler), wanted)
+
+    def test_transform_decimal(self):
+        # Decimal digits map to ASCII; spaces and U+20AC are left untouched.
+        from _testcapi import unicode_transformdecimaltoascii as to_ascii
+        spaced = "\N{EM SPACE}3.14\N{EN SPACE}"
+        expectations = (('123', '123'),
+                        ('\u0663.\u0661\u0664', '3.14'),
+                        (spaced, spaced),
+                        ('123\u20ac', '123\u20ac'))
+        for text, wanted in expectations:
+            self.assertEqual(to_ascii(text), wanted)
+
 
 class StringModuleTest(unittest.TestCase):
     def test_formatter_parser(self):
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -1500,6 +1500,51 @@
 }
 
 static PyObject *
+unicode_encodedecimal(PyObject *self, PyObject *args)
+{
+    Py_UNICODE *unicode;
+    Py_ssize_t length;
+    char *errors = NULL;
+    PyObject *decimal;
+    Py_ssize_t decimal_length, new_length;
+    int res;
+
+    if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors))
+        return NULL;
+
+    decimal_length = length * 7; /* len('&#8364;') */
+    decimal = PyBytes_FromStringAndSize(NULL, decimal_length);
+    if (decimal == NULL)
+        return NULL;
+
+    res = PyUnicode_EncodeDecimal(unicode, length,
+                                  PyBytes_AS_STRING(decimal),
+                                  errors);
+    if (res < 0) {
+        Py_DECREF(decimal);
+        return NULL;
+    }
+
+    new_length = strlen(PyBytes_AS_STRING(decimal));
+    assert(new_length <= decimal_length);
+    res = _PyBytes_Resize(&decimal, new_length);
+    if (res < 0)
+        return NULL;
+
+    return decimal;
+}
+
+static PyObject *
+unicode_transformdecimaltoascii(PyObject *self, PyObject *args)
+{   /* Test helper: expose PyUnicode_TransformDecimalToASCII() on a str. */
+    Py_UNICODE *unicode;
+    Py_ssize_t length;
+    if (!PyArg_ParseTuple(args, "u#", &unicode, &length)) /* one str only */
+        return NULL;
+    return PyUnicode_TransformDecimalToASCII(unicode, length);
+}
+
+static PyObject *
 getargs_w_star(PyObject *self, PyObject *args)
 {
     Py_buffer buffer;
@@ -2384,8 +2429,10 @@
     {"test_u_code",             (PyCFunction)test_u_code,        METH_NOARGS},
     {"test_Z_code",             (PyCFunction)test_Z_code,        METH_NOARGS},
     {"test_widechar",           (PyCFunction)test_widechar,      METH_NOARGS},
-    {"unicode_aswidechar",      unicode_aswidechar,                 METH_VARARGS},
-    {"unicode_aswidecharstring",unicode_aswidecharstring,           METH_VARARGS},
+    {"unicode_aswidechar",      unicode_aswidechar,              METH_VARARGS},
+    {"unicode_aswidecharstring",unicode_aswidecharstring,        METH_VARARGS},
+    {"unicode_encodedecimal",   unicode_encodedecimal,           METH_VARARGS},
+    {"unicode_transformdecimaltoascii", unicode_transformdecimaltoascii, METH_VARARGS},
 #ifdef WITH_THREAD
     {"_test_thread_state",      test_thread_state,               METH_VARARGS},
     {"_pending_threadfunc",     pending_threadfunc,              METH_VARARGS},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -8829,7 +8829,6 @@
                         char *output,
                         const char *errors)
 {
-    Py_UNICODE *p, *end;
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
     PyObject *unicode;
@@ -8838,47 +8837,50 @@
     /* the following variable is used for caching string comparisons
      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
     int known_errorHandler = -1;
+    Py_ssize_t i, j;
+    enum PyUnicode_Kind kind;
+    void *data;
 
     if (output == NULL) {
         PyErr_BadArgument();
         return -1;
     }
 
-    p = s;
-    end = s + length;
-    while (p < end) {
-        register Py_UNICODE ch = *p;
+    unicode = PyUnicode_FromUnicode(s, length);
+    if (unicode == NULL)
+        return -1;
+
+    if (PyUnicode_READY(unicode) < 0)
+        goto onError;
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    for (i=0; i < length; i++) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
         int decimal;
-        PyObject *repunicode;
-        Py_ssize_t repsize;
-        Py_ssize_t newpos;
-        Py_UNICODE *uni2;
-        Py_UNICODE *collstart;
-        Py_UNICODE *collend;
+        Py_ssize_t startpos, endpos;
 
         if (Py_UNICODE_ISSPACE(ch)) {
             *output++ = ' ';
-            ++p;
             continue;
         }
         decimal = Py_UNICODE_TODECIMAL(ch);
         if (decimal >= 0) {
             *output++ = '0' + decimal;
-            ++p;
             continue;
         }
         if (0 < ch && ch < 256) {
             *output++ = (char)ch;
-            ++p;
             continue;
         }
         /* All other characters are considered unencodable */
-        collstart = p;
-        collend = p+1;
-        while (collend < end) {
-            if ((0 < *collend && *collend < 256) ||
-                !Py_UNICODE_ISSPACE(*collend) ||
-                Py_UNICODE_TODECIMAL(*collend))
+        startpos = i;
+        endpos = i+1;
+        for (; endpos < length; endpos++) {
+            ch = PyUnicode_READ(kind, data, endpos);
+            if ((0 < ch && ch < 256) ||
+                !Py_UNICODE_ISSPACE(ch) ||
+                Py_UNICODE_TODECIMAL(ch))
                 break;
         }
         /* cache callback name lookup
@@ -8897,33 +8899,33 @@
         }
         switch (known_errorHandler) {
         case 1: /* strict */
-            unicode = PyUnicode_FromUnicode(s, length);
-            if (unicode == NULL)
-                goto onError;
-            raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
-            Py_DECREF(unicode);
+            raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
             goto onError;
         case 2: /* replace */
-            for (p = collstart; p < collend; ++p)
+            for (j=startpos; j < endpos; j++)
                 *output++ = '?';
             /* fall through */
         case 3: /* ignore */
-            p = collend;
+            i = endpos;
             break;
         case 4: /* xmlcharrefreplace */
-            /* generate replacement (temporarily (mis)uses p) */
-            for (p = collstart; p < collend; ++p)
-                output += sprintf(output, "&#%d;", (int)*p);
-            p = collend;
+            /* generate replacement */
+            for (j=startpos; j < endpos; j++) {
+                ch = PyUnicode_READ(kind, data, i);
+                output += sprintf(output, "&#%d;", (int)ch);
+                i++;
+            }
             break;
         default:
-            unicode = PyUnicode_FromUnicode(s, length);
-            if (unicode == NULL)
-                goto onError;
+        {
+            PyObject *repunicode;
+            Py_ssize_t repsize, newpos, k;
+            enum PyUnicode_Kind repkind;
+            void *repdata;
+
             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                           encoding, reason, unicode, &exc,
-                                                          collstart-s, collend-s, &newpos);
-            Py_DECREF(unicode);
+                                                          startpos, endpos, &newpos);
             if (repunicode == NULL)
                 goto onError;
             if (!PyUnicode_Check(repunicode)) {
@@ -8932,10 +8934,17 @@
                 Py_DECREF(repunicode);
                 goto onError;
             }
+            if (PyUnicode_READY(repunicode) < 0) {
+                Py_DECREF(repunicode);
+                goto onError;
+            }
+            repkind = PyUnicode_KIND(repunicode);
+            repdata = PyUnicode_DATA(repunicode);
+
             /* generate replacement  */
             repsize = PyUnicode_GET_SIZE(repunicode);
-            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
-                Py_UNICODE ch = *uni2;
+            for (k=0; k<repsize; k++) {
+                ch = PyUnicode_READ(repkind, repdata, k);
                 if (Py_UNICODE_ISSPACE(ch))
                     *output++ = ' ';
                 else {
@@ -8946,29 +8955,29 @@
                         *output++ = (char)ch;
                     else {
                         Py_DECREF(repunicode);
-                        unicode = PyUnicode_FromUnicode(s, length);
-                        if (unicode == NULL)
-                            goto onError;
                         raise_encode_exception(&exc, encoding,
-                                               unicode, collstart-s, collend-s, reason);
-                        Py_DECREF(unicode);
+                                               unicode, startpos, endpos,
+                                               reason);
                         goto onError;
                     }
                 }
             }
-            p = s + newpos;
+            i = newpos;
             Py_DECREF(repunicode);
         }
+        }
     }
     /* 0-terminate the output string */
     *output++ = '\0';
     Py_XDECREF(exc);
     Py_XDECREF(errorHandler);
+    Py_DECREF(unicode);
     return 0;
 
   onError:
     Py_XDECREF(exc);
     Py_XDECREF(errorHandler);
+    Py_DECREF(unicode);
     return -1;
 }
 

-- 
Repository URL: http://hg.python.org/cpython