[Python-checkins] r68385 - sandbox/trunk/io-c/_textio.c
antoine.pitrou
python-checkins at python.org
Thu Jan 8 00:31:02 CET 2009
Author: antoine.pitrou
Date: Thu Jan 8 00:31:01 2009
New Revision: 68385
Log:
Make newline translation much faster.
Modified:
sandbox/trunk/io-c/_textio.c
Modified: sandbox/trunk/io-c/_textio.c
==============================================================================
--- sandbox/trunk/io-c/_textio.c (original)
+++ sandbox/trunk/io-c/_textio.c Thu Jan 8 00:31:01 2009
@@ -103,7 +103,7 @@
char *kwlist[] = {"input", "final", NULL};
PyObject *input, *output;
int final = 0;
- Py_ssize_t cr=0, lf=0, crlf=0;
+ Py_ssize_t output_len;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|i:IncrementalNewlineDecoder",
kwlist, &input, &final))
@@ -126,102 +126,125 @@
goto error;
}
- if (self->pendingcr && (final || PyUnicode_GET_SIZE(output) > 0)) {
- PyObject *modified;
-
- modified = PyUnicode_FromOrdinal('\r');
+ output_len = PyUnicode_GET_SIZE(output);
+ if (self->pendingcr && (final || output_len > 0)) {
+ Py_UNICODE *out;
+ PyObject *modified = PyUnicode_FromUnicode(NULL, output_len + 1);
if (modified == NULL)
goto error;
-
- PyUnicode_Append(&modified, output);
- if (modified == NULL)
- goto error;
-
+ out = PyUnicode_AS_UNICODE(modified);
+ out[0] = '\r';
+ memcpy(out + 1, PyUnicode_AS_UNICODE(output),
+ output_len * sizeof(Py_UNICODE));
Py_DECREF(output);
output = modified;
self->pendingcr = 0;
+ output_len++;
}
/* retain last \r even when not translating data:
* then readline() is sure to get \r\n in one pass
*/
if (!final) {
- Py_ssize_t output_len = PyUnicode_GET_SIZE(output);
-
if (output_len > 0
&& PyUnicode_AS_UNICODE(output)[output_len - 1] == '\r') {
- PyObject *modified = PyUnicode_FromUnicode(
- PyUnicode_AS_UNICODE(output),
- output_len - 1);
- if (modified == NULL)
- goto error;
-
- Py_DECREF(output);
- output = modified;
+ if (Py_REFCNT(output) == 1) {
+ if (PyUnicode_Resize(&output, output_len - 1) < 0)
+ goto error;
+ }
+ else {
+ PyObject *modified = PyUnicode_FromUnicode(
+ PyUnicode_AS_UNICODE(output),
+ output_len - 1);
+ if (modified == NULL)
+ goto error;
+ Py_DECREF(output);
+ output = modified;
+ }
self->pendingcr = 1;
}
-
}
- /* Record which newlines are read */
+ /* Record which newlines are read and do newline translation if desired,
+ all in one pass. */
{
- Py_UNICODE c;
- Py_ssize_t index, previous;
- int pendingcr = 0;
-
- previous = '\0';
-
- for(index = 0; index < PyUnicode_GET_SIZE(output); index++) {
- c = PyUnicode_AS_UNICODE(output)[index];
-
- switch (c) {
- case '\r':
- cr++;
- pendingcr = 1;
- break;
-
- case '\n':
- if (pendingcr) {
- cr--;
- crlf++;
- pendingcr = 0;
+ Py_UNICODE *in_str;
+ Py_ssize_t in, len;
+ int seennl = 0;
+
+ in_str = PyUnicode_AS_UNICODE(output);
+ len = PyUnicode_GET_SIZE(output);
+ if (!self->translate) {
+ for (in = 0; in < len;) {
+ Py_UNICODE c = in_str[in++];
+ if (c >= 0x20)
+ continue;
+ if (c == '\n')
+ seennl |= SEEN_LF;
+ else if (c == '\r') {
+ if (in_str[in] == '\n') {
+ seennl |= SEEN_CRLF;
+ in++;
+ }
+ else
+ seennl |= SEEN_CR;
}
- else
- lf++;
- break;
-
- default:
- pendingcr = 0;
- break;
}
}
-
- if (cr)
- self->seennl |= SEEN_CR;
- if (lf)
- self->seennl |= SEEN_LF;
- if (crlf)
- self->seennl |= SEEN_CRLF;
- }
-
- if (self->translate) {
- PyObject *modified;
-
- if (crlf) {
- modified = PyObject_CallMethod(output, "replace", "ss", "\r\n", "\n");
- if (modified == NULL)
- goto error;
- Py_DECREF(output);
- output = modified;
- }
- if (cr) {
- modified = PyObject_CallMethod(output, "replace", "ss", "\r", "\n");
- if (modified == NULL)
- goto error;
- Py_DECREF(output);
- output = modified;
+ else {
+ PyObject *translated = NULL;
+ Py_UNICODE *out_str;
+ Py_ssize_t out;
+ if (Py_REFCNT(output) != 1 || len < 2) {
+ /* We could try to optimize this so that we only do a copy
+ when there is something to translate. On the other hand,
+ most decoders should only output non-shared strings, i.e.
+ translation is done in place. */
+ translated = PyUnicode_FromUnicode(NULL, len);
+ if (translated == NULL)
+ goto error;
+ memcpy(PyUnicode_AS_UNICODE(translated),
+ PyUnicode_AS_UNICODE(output),
+ len * sizeof(Py_UNICODE));
+ }
+ else {
+ translated = output;
+ }
+ out_str = PyUnicode_AS_UNICODE(translated);
+ for (in = 0, out = 0; in < len;) {
+ Py_UNICODE c = in_str[in++];
+ if (c >= 0x20) {
+ out_str[out++] = c;
+ continue;
+ }
+ if (c == '\n') {
+ out_str[out++] = c;
+ seennl |= SEEN_LF;
+ continue;
+ }
+ if (c != '\r') {
+ out_str[out++] = c;
+ continue;
+ }
+ if (in_str[in] == '\n') {
+ in++;
+ seennl |= SEEN_CRLF;
+ }
+ else
+ seennl |= SEEN_CR;
+ out_str[out++] = '\n';
+ }
+ if (translated != output) {
+ Py_DECREF(output);
+ output = translated;
+ }
+ if (out != in) {
+ if (PyUnicode_Resize(&output, out) < 0)
+ goto error;
+ }
}
+ self->seennl |= seennl;
}
return output;
More information about the Python-checkins
mailing list