[Python-checkins] r67833 - in sandbox/trunk/io-c: _fileio.c _iomodule.h _textio.c io.c io.py

amaury.forgeotdarc python-checkins at python.org
Thu Dec 18 00:17:08 CET 2008


Author: amaury.forgeotdarc
Date: Thu Dec 18 00:17:07 2008
New Revision: 67833

Log:
io-c: the IncrementalNewlineDecoder starts working.


Modified:
   sandbox/trunk/io-c/_fileio.c
   sandbox/trunk/io-c/_iomodule.h
   sandbox/trunk/io-c/_textio.c
   sandbox/trunk/io-c/io.c
   sandbox/trunk/io-c/io.py

Modified: sandbox/trunk/io-c/_fileio.c
==============================================================================
--- sandbox/trunk/io-c/_fileio.c	(original)
+++ sandbox/trunk/io-c/_fileio.c	Thu Dec 18 00:17:07 2008
@@ -311,7 +311,7 @@
 			goto error;
 	}
 
-        if (PyObject_SetAttrString(self, "name", nameobj) < 0)
+	if (PyObject_SetAttrString((PyObject *)self, "name", nameobj) < 0)
 		goto error;
 
 	goto done;
@@ -426,6 +426,31 @@
 	return PyLong_FromSsize_t(n);
 }
 
+static size_t
+new_buffersize(PyFileIOObject *self, size_t currentsize)
+{
+#ifdef HAVE_FSTAT
+	off_t pos, end;
+	struct stat st;
+	if (fstat(self->fd, &st) == 0) {
+		end = st.st_size;
+		pos = lseek(self->fd, 0L, SEEK_CUR);
+		/* Room for the rest of the file; add 1 so if the file were to grow we'd notice. */
+		if (end >= pos && pos >= 0)
+			return currentsize + end - pos + 1;
+	}
+#endif /* HAVE_FSTAT */
+	if (currentsize > SMALLCHUNK) {
+		/* No size estimate was returned above: keep doubling until
+		   we reach BIGCHUNK; then keep adding BIGCHUNK. */
+		if (currentsize <= BIGCHUNK)
+			return currentsize + currentsize;
+		else
+			return currentsize + BIGCHUNK;
+	}
+	return currentsize + SMALLCHUNK;	/* at or below SMALLCHUNK: grow by SMALLCHUNK */
+}
+
 static PyObject *
 fileio_readall(PyFileIOObject *self)
 {
@@ -438,17 +463,7 @@
 		return NULL;
 
 	while (1) {
-		Py_ssize_t newsize = (total < SMALLCHUNK) ? SMALLCHUNK : total;
-
-		/* Keep doubling until we reach BIGCHUNK;
-		   then keep adding BIGCHUNK. */
-		if (newsize <= BIGCHUNK) {
-			newsize += newsize;
-		}
-		else {
-			/* NOTE: overflow impossible due to limits on BUFSIZ */
-			newsize += BIGCHUNK;
-		}
+		Py_ssize_t newsize = new_buffersize(self, total);
 
 		if (PyBytes_GET_SIZE(result) < newsize) {
 			if (_PyBytes_Resize(&result, newsize) < 0) {

Modified: sandbox/trunk/io-c/_iomodule.h
==============================================================================
--- sandbox/trunk/io-c/_iomodule.h	(original)
+++ sandbox/trunk/io-c/_iomodule.h	Thu Dec 18 00:17:07 2008
@@ -12,6 +12,7 @@
 extern PyTypeObject PyBufferedRWPair_Type;
 extern PyTypeObject PyBufferedRandom_Type;
 extern PyTypeObject PyTextIOWrapper_Type;
+extern PyTypeObject PyIncrementalNewlineDecoder_Type;
 
 extern PyObject* _PyIOBase_checkReadable(PyObject *self, PyObject *unused);
 extern PyObject* _PyIOBase_checkWritable(PyObject *self, PyObject *unused);

Modified: sandbox/trunk/io-c/_textio.c
==============================================================================
--- sandbox/trunk/io-c/_textio.c	(original)
+++ sandbox/trunk/io-c/_textio.c	Thu Dec 18 00:17:07 2008
@@ -43,6 +43,245 @@
 /* XXX properties: encoding, newlines */
 
 
+/* IncrementalNewlineDecoder */
+
+PyDoc_STRVAR(IncrementalNewlineDecoder_doc,
+    "Codec used when reading a file in universal newlines mode.  It wraps\n"
+    "another incremental decoder, translating \\r\\n and \\r into \\n.  It also\n"
+    "records the types of newlines encountered.  When used with\n"
+    "translate=False, it ensures that the newline sequence is returned in\n"
+    "one piece.\n"
+    );
+
+typedef struct {
+    PyObject_HEAD
+    PyObject *decoder;          /* wrapped decoder; __init__ takes a reference */
+    PyObject *errors;           /* errors argument; the string "strict" when omitted */
+    unsigned int pendingcr:1;   /* decode() held back a trailing \r */
+    unsigned int translate:1;   /* decode() rewrites \r\n and \r to \n */
+    unsigned int seennl:3;      /* bitwise OR of SEEN_CR, SEEN_LF, SEEN_CRLF */
+} PyNewLineDecoderObject;
+
+static int
+IncrementalNewlineDecoder_init(PyNewLineDecoderObject *self, 
+                               PyObject *args, PyObject *kwds)
+{
+    PyObject *decoder;
+    int translate;
+    PyObject *errors = NULL;
+    char *kwlist[] = {"decoder", "translate", "errors", NULL};
+    /* __init__(decoder, translate, errors="strict"); returns 0, or -1 with an exception set. */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "Oi|O:IncrementalNewlineDecoder",
+                                     kwlist, &decoder, &translate, &errors))
+        return -1;
+    /* NOTE(review): a second __init__ call overwrites both fields without releasing the old references. */
+    self->decoder = decoder;
+    Py_INCREF(decoder);
+
+    if (errors == NULL) {
+        self->errors = PyUnicode_FromString("strict");
+        if (self->errors == NULL)
+            return -1;
+    }
+    else {
+        Py_INCREF(errors);
+        self->errors = errors;
+    }
+
+    self->translate = (translate != 0);  /* the field is one bit wide: keep any nonzero value "true" */
+    self->seennl = 0;
+    self->pendingcr = 0;
+
+    return 0;
+}
+
+#define SEEN_CR   1
+#define SEEN_LF   2
+#define SEEN_CRLF 4 
+
+static PyObject *
+IncrementalNewlineDecoder_decode(PyNewLineDecoderObject *self, 
+                                 PyObject *args, PyObject *kwds)
+{
+    char *kwlist[] = {"input", "final", NULL};
+    PyObject *input, *output;
+    int final = 0;
+    Py_ssize_t cr=0, lf=0, crlf=0;  /* newline counts for this call's output */
+    /* decode(input, final=False): returns the decoded string, or NULL with an exception set. */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|i:IncrementalNewlineDecoder",
+                                     kwlist, &input, &final))
+        return NULL;
+
+    if (self->decoder == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                        "IncrementalNewlineDecoder.__init__ not called");
+        return NULL;
+    }
+
+    /* decode input; a \r held back by a previous pass is prepended below */
+    output = PyObject_CallMethod(self->decoder, "decode", "Oi", input, final);
+    if (output == NULL)
+        return NULL;
+
+    if (!PyUnicode_Check(output)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "decoder should return a string result");
+        goto error;
+    }
+
+    if (self->pendingcr && (final || PyUnicode_GET_SIZE(output) > 0)) {
+        PyObject *modified;
+
+        modified = PyUnicode_FromOrdinal('\r');
+        if (modified == NULL)
+            goto error;
+
+        PyUnicode_Append(&modified, output);
+        if (modified == NULL)  /* PyUnicode_Append reports failure by setting it to NULL */
+            goto error;
+
+        Py_DECREF(output);
+        output = modified;
+        self->pendingcr = 0;
+    }
+
+    /* retain last \r even when not translating data:
+     * then readline() is sure to get \r\n in one pass
+     */
+    if (!final) {
+        Py_ssize_t output_len = PyUnicode_GET_SIZE(output);
+
+        if (output_len > 0 
+            && PyUnicode_AS_UNICODE(output)[output_len - 1] == '\r') {
+
+            PyObject *modified = PyUnicode_FromUnicode(
+                PyUnicode_AS_UNICODE(output),
+                output_len - 1);
+            if (modified == NULL)
+                goto error;
+
+            Py_DECREF(output);
+            output = modified;
+            self->pendingcr = 1;
+        }
+
+    }
+
+    /* Record which newlines are read */
+    {
+        Py_UNICODE c;
+        Py_ssize_t index, previous;
+        int pendingcr = 0;  /* local to this scan; distinct from self->pendingcr */
+
+        previous = '\0';  /* NOTE(review): assigned here but never read in this function */
+
+        for(index = 0; index < PyUnicode_GET_SIZE(output); index++) {
+            c = PyUnicode_AS_UNICODE(output)[index];
+
+            switch (c) {
+            case '\r':
+                cr++;
+                pendingcr = 1;
+                break;
+
+            case '\n':
+                if (pendingcr) {
+                    cr--;  /* the preceding \r is part of this \r\n, not a lone \r */
+                    crlf++;
+                    pendingcr = 0;
+                }
+                else
+                    lf++;
+                break;
+
+            default:
+                pendingcr = 0;
+                break;
+            }
+        }
+        
+        if (cr)
+            self->seennl |= SEEN_CR;
+        if (lf)
+            self->seennl |= SEEN_LF;
+        if (crlf)
+            self->seennl |= SEEN_CRLF;
+    }
+
+    if (self->translate) {
+        PyObject *modified;
+
+        if (crlf) {  /* replace \r\n first so the lone-\r pass below cannot split a pair */
+            modified = PyObject_CallMethod(output, "replace", "ss", "\r\n", "\n");
+            if (modified == NULL)
+                goto error;
+            Py_DECREF(output);
+            output = modified;
+        }
+        if (cr) {
+            modified = PyObject_CallMethod(output, "replace", "ss", "\r", "\n");
+            if (modified == NULL)
+                goto error;
+            Py_DECREF(output);
+            output = modified;
+        }
+    }
+
+    return output;
+
+  error:  /* every jump here still owns a reference to output */
+    Py_DECREF(output);
+    return NULL;
+}
+
+static PyMethodDef IncrementalNewlineDecoder_methods[] = {
+    {"decode", (PyCFunction)IncrementalNewlineDecoder_decode, METH_VARARGS|METH_KEYWORDS},
+    {0}
+};
+
+PyTypeObject PyIncrementalNewlineDecoder_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    "IncrementalNewlineDecoder", /*tp_name*/
+    sizeof(PyNewLineDecoderObject), /*tp_basicsize*/
+    0,                          /*tp_itemsize*/
+    0,                          /*tp_dealloc*/
+    0,                          /*tp_print*/
+    0,                          /*tp_getattr*/
+    0,                          /*tp_setattr*/
+    0,                          /*tp_compare */
+    0,                          /*tp_repr*/
+    0,                          /*tp_as_number*/
+    0,                          /*tp_as_sequence*/
+    0,                          /*tp_as_mapping*/
+    0,                          /*tp_hash */
+    0,                          /*tp_call*/
+    0,                          /*tp_str*/
+    0,                          /*tp_getattro*/
+    0,                          /*tp_setattro*/
+    0,                          /*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,  /*tp_flags*/
+    IncrementalNewlineDecoder_doc,          /* tp_doc */
+    0,                          /* tp_traverse */
+    0,                          /* tp_clear */
+    0,                          /* tp_richcompare */
+    0,                          /*tp_weaklistoffset*/
+    0,                          /* tp_iter */
+    0,                          /* tp_iternext */
+    IncrementalNewlineDecoder_methods, /* tp_methods */
+    0,                          /* tp_members */
+    0,                          /* tp_getset */
+    0,                          /* tp_base */
+    0,                          /* tp_dict */
+    0,                          /* tp_descr_get */
+    0,                          /* tp_descr_set */
+    0,                          /*tp_dictoffset*/
+    (initproc)IncrementalNewlineDecoder_init, /* tp_init */
+    0,                          /* tp_alloc */
+    PyType_GenericNew,          /* tp_new */
+};
+
+
+
 /* TextIOWrapper */
 
 PyDoc_STRVAR(TextIOWrapper_doc,
@@ -165,7 +404,7 @@
     if (errors == NULL)
 	errors = "strict";
 
-    self->chunk_size = 8192;
+    self->chunk_size = 2048;
     self->readuniversal = (newline == NULL || newline[0] == '\0');
     self->line_buffering = line_buffering;
     self->readtranslate = (newline == NULL);
@@ -196,9 +435,13 @@
 	    goto error;
 
         if (self->readuniversal) {
-            /* XXX
-             * decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
-             */
+            PyObject *incrementalDecoder = PyObject_CallFunction(
+                (PyObject *)&PyIncrementalNewlineDecoder_Type,
+                "Oi", self->decoder, (int)self->readtranslate);
+            if (incrementalDecoder == NULL)
+                goto error;
+            Py_CLEAR(self->decoder);
+            self->decoder = incrementalDecoder;
         }
     }
 
@@ -649,8 +892,17 @@
     /* Rewind decoded_chars to just after the line ending we found. */
     TextIOWrapper_rewind_decoded_chars(
         self, PyUnicode_GET_SIZE(line) - endpos);
-    if (PyUnicode_Resize(&line, endpos) < 0)
-        goto error;
+
+    if (PyUnicode_GET_SIZE(line) != endpos) {
+	PyObject *resized = PyUnicode_FromUnicode(
+	    PyUnicode_AS_UNICODE(line), endpos);
+
+	if (resized == NULL)
+	    goto error;
+
+	Py_DECREF(line);
+	line = resized;
+    }
     return line;
 
   error:

Modified: sandbox/trunk/io-c/io.c
==============================================================================
--- sandbox/trunk/io-c/io.c	(original)
+++ sandbox/trunk/io-c/io.c	Thu Dec 18 00:17:07 2008
@@ -593,6 +593,12 @@
     Py_INCREF(&PyTextIOWrapper_Type);
     PyModule_AddObject(m, "TextIOWrapper", (PyObject *) &PyTextIOWrapper_Type);
 
+    /* IncrementalNewlineDecoder */
+    if (PyType_Ready(&PyIncrementalNewlineDecoder_Type) < 0)
+        goto fail;
+    Py_INCREF(&PyIncrementalNewlineDecoder_Type);
+    PyModule_AddObject(m, "IncrementalNewlineDecoder", (PyObject *) &PyIncrementalNewlineDecoder_Type);
+
     return m;
 
   fail:

Modified: sandbox/trunk/io-c/io.py
==============================================================================
--- sandbox/trunk/io-c/io.py	(original)
+++ sandbox/trunk/io-c/io.py	Thu Dec 18 00:17:07 2008
@@ -1296,25 +1296,23 @@
     """
     def __init__(self, decoder, translate, errors='strict'):
         codecs.IncrementalDecoder.__init__(self, errors=errors)
-        self.buffer = b''
         self.translate = translate
         self.decoder = decoder
         self.seennl = 0
+        self.pendingcr = False
 
     def decode(self, input, final=False):
         # decode input (with the eventual \r from a previous pass)
-        if self.buffer:
-            input = self.buffer + input
-
         output = self.decoder.decode(input, final=final)
+        if self.pendingcr and (output or final):
+            output = "\r" + output
+            self.pendingcr = False
 
         # retain last \r even when not translating data:
         # then readline() is sure to get \r\n in one pass
         if output.endswith("\r") and not final:
             output = output[:-1]
-            self.buffer = b'\r'
-        else:
-            self.buffer = b''
+            self.pendingcr = True
 
         # Record which newlines are read
         crlf = output.count('\r\n')
@@ -1333,20 +1331,19 @@
 
     def getstate(self):
         buf, flag = self.decoder.getstate()
-        return buf + self.buffer, flag
+        flag <<= 1
+        if self.pendingcr:
+            flag |= 1
+        return buf, flag
 
     def setstate(self, state):
         buf, flag = state
-        if buf.endswith(b'\r'):
-            self.buffer = b'\r'
-            buf = buf[:-1]
-        else:
-            self.buffer = b''
-        self.decoder.setstate((buf, flag))
+        self.pendingcr = bool(flag & 1)
+        self.decoder.setstate((buf, flag >> 1))
 
     def reset(self):
         self.seennl = 0
-        self.buffer = b''
+        self.pendingcr = False
         self.decoder.reset()
 
     _LF = 1


More information about the Python-checkins mailing list