[Python-checkins] r68385 - sandbox/trunk/io-c/_textio.c

Thu Jan 8 00:31:02 CET 2009

Author: antoine.pitrou
Date: Thu Jan  8 00:31:01 2009
New Revision: 68385

Log:
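Make newline translation much faster: detect the newline kinds seen and
translate them in a single pass over the decoded buffer (in place when the
string is not shared), instead of counting newlines and then calling
str.replace().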



Modified:
   sandbox/trunk/io-c/_textio.c

Modified: sandbox/trunk/io-c/_textio.c
==============================================================================

--- sandbox/trunk/io-c/_textio.c	(original)
+++ sandbox/trunk/io-c/_textio.c	Thu Jan  8 00:31:01 2009
@@ -103,7 +103,7 @@
     char *kwlist[] = {"input", "final", NULL};
     PyObject *input, *output;
     int final = 0;
-    Py_ssize_t cr=0, lf=0, crlf=0;
+    Py_ssize_t output_len;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|i:IncrementalNewlineDecoder",
                                      kwlist, &input, &final))
@@ -126,102 +126,125 @@
         goto error;
     }
 
-    if (self->pendingcr && (final || PyUnicode_GET_SIZE(output) > 0)) {
-        PyObject *modified;
-
-        modified = PyUnicode_FromOrdinal('\r');
+    output_len = PyUnicode_GET_SIZE(output);
+    if (self->pendingcr && (final || output_len > 0)) {
+        Py_UNICODE *out;
+        PyObject *modified = PyUnicode_FromUnicode(NULL, output_len + 1);
         if (modified == NULL)
             goto error;
-
-        PyUnicode_Append(&modified, output);
-        if (modified == NULL)
-            goto error;
-
+        out = PyUnicode_AS_UNICODE(modified);
+        out[0] = '\r';
+        memcpy(out + 1, PyUnicode_AS_UNICODE(output),
+               output_len * sizeof(Py_UNICODE));
         Py_DECREF(output);
         output = modified;
         self->pendingcr = 0;
+        output_len++;
     }
 
     /* retain last \r even when not translating data:
      * then readline() is sure to get \r\n in one pass
      */
     if (!final) {
-        Py_ssize_t output_len = PyUnicode_GET_SIZE(output);
-
         if (output_len > 0 
             && PyUnicode_AS_UNICODE(output)[output_len - 1] == '\r') {
 
-            PyObject *modified = PyUnicode_FromUnicode(
-                PyUnicode_AS_UNICODE(output),
-                output_len - 1);
-            if (modified == NULL)
-                goto error;
-
-            Py_DECREF(output);
-            output = modified;
+            if (Py_REFCNT(output) == 1) {
+                if (PyUnicode_Resize(&output, output_len - 1) < 0)
+                    goto error;
+            }
+            else {
+                PyObject *modified = PyUnicode_FromUnicode(
+                    PyUnicode_AS_UNICODE(output),
+                    output_len - 1);
+                if (modified == NULL)
+                    goto error;
+                Py_DECREF(output);
+                output = modified;
+            }
             self->pendingcr = 1;
         }
-
     }
 
-    /* Record which newlines are read */
+    /* Record which newlines are read and do newline translation if desired,
+       all in one pass. */
     {
-        Py_UNICODE c;
-        Py_ssize_t index, previous;
-        int pendingcr = 0;
-
-        previous = '\0';
-
-        for(index = 0; index < PyUnicode_GET_SIZE(output); index++) {
-            c = PyUnicode_AS_UNICODE(output)[index];
-
-            switch (c) {
-            case '\r':
-                cr++;
-                pendingcr = 1;
-                break;
-
-            case '\n':
-                if (pendingcr) {
-                    cr--;
-                    crlf++;
-                    pendingcr = 0;
+        Py_UNICODE *in_str;
+        Py_ssize_t in, len;
+        int seennl = 0;
+
+        in_str = PyUnicode_AS_UNICODE(output);
+        len = PyUnicode_GET_SIZE(output);
+        if (!self->translate) {
+            for (in = 0; in < len;) {
+                Py_UNICODE c = in_str[in++];
+                if (c >= 0x20)
+                    continue;
+                if (c == '\n')
+                    seennl |= SEEN_LF;
+                else if (c == '\r') {
+                    if (in_str[in] == '\n') {
+                        seennl |= SEEN_CRLF;
+                        in++;
+                    }
+                    else
+                        seennl |= SEEN_CR;
                 }
-                else
-                    lf++;
-                break;
-
-            default:
-                pendingcr = 0;
-                break;
             }
         }
-
-        if (cr)
-            self->seennl |= SEEN_CR;
-        if (lf)
-            self->seennl |= SEEN_LF;
-        if (crlf)
-            self->seennl |= SEEN_CRLF;
-    }
-
-    if (self->translate) {
-        PyObject *modified;
-
-        if (crlf) {
-            modified = PyObject_CallMethod(output, "replace", "ss", "\r\n", "\n");
-            if (modified == NULL)
-                goto error;
-            Py_DECREF(output);
-            output = modified;
-        }
-        if (cr) {
-            modified = PyObject_CallMethod(output, "replace", "ss", "\r", "\n");
-            if (modified == NULL)
-                goto error;
-            Py_DECREF(output);
-            output = modified;
+        else {
+            PyObject *translated = NULL;
+            Py_UNICODE *out_str;
+            Py_ssize_t out;
+            if (Py_REFCNT(output) != 1 || len < 2) {
+                /* We could try to optimize this so that we only do a copy
+                   when there is something to translate. On the other hand,
+                   most decoders should only output non-shared strings, i.e.
+                   translation is done in place. */
+                translated = PyUnicode_FromUnicode(NULL, len);
+                if (translated == NULL)
+                    goto error;
+                memcpy(PyUnicode_AS_UNICODE(translated),
+                       PyUnicode_AS_UNICODE(output),
+                       len * sizeof(Py_UNICODE));
+            }
+            else {
+                translated = output;
+            }
+            out_str = PyUnicode_AS_UNICODE(translated);
+            for (in = 0, out = 0; in < len;) {
+                Py_UNICODE c = in_str[in++];
+                if (c >= 0x20) {
+                    out_str[out++] = c;
+                    continue;
+                }
+                if (c == '\n') {
+                    out_str[out++] = c;
+                    seennl |= SEEN_LF;
+                    continue;
+                }
+                if (c != '\r') {
+                    out_str[out++] = c;
+                    continue;
+                }
+                if (in_str[in] == '\n') {
+                    in++;
+                    seennl |= SEEN_CRLF;
+                }
+                else
+                    seennl |= SEEN_CR;
+                out_str[out++] = '\n';
+            }
+            if (translated != output) {
+                Py_DECREF(output);
+                output = translated;
+            }
+            if (out != in) {
+                if (PyUnicode_Resize(&output, out) < 0)
+                    goto error;
+            }
         }
+        self->seennl |= seennl;
     }
 
     return output;