[Python-checkins] r88607 - in python/branches/py3k: Lib/_pyio.py Misc/NEWS Modules/_io/textio.c

Fri Feb 25 21:27:33 CET 2011

Author: antoine.pitrou
Date: Fri Feb 25 21:27:33 2011
New Revision: 88607

Log:
Issue #11114: Fix catastrophic performance of tell() on text files (up
to 1000x faster in some cases).  It is still one to two order of magnitudes
slower than binary tell().



Modified:
   python/branches/py3k/Lib/_pyio.py
   python/branches/py3k/Misc/NEWS
   python/branches/py3k/Modules/_io/textio.c

Modified: python/branches/py3k/Lib/_pyio.py
==============================================================================

--- python/branches/py3k/Lib/_pyio.py	(original)
+++ python/branches/py3k/Lib/_pyio.py	Fri Feb 25 21:27:33 2011
@@ -1488,6 +1488,7 @@
         self._decoded_chars_used = 0  # offset into _decoded_chars for read()
         self._snapshot = None  # info for reconstructing decoder state
         self._seekable = self._telling = self.buffer.seekable()
+        self._b2cratio = 0.0
 
         if self._seekable and self.writable():
             position = self.buffer.tell()
@@ -1655,7 +1656,12 @@
         # Read a chunk, decode it, and put the result in self._decoded_chars.
         input_chunk = self.buffer.read1(self._CHUNK_SIZE)
         eof = not input_chunk
-        self._set_decoded_chars(self._decoder.decode(input_chunk, eof))
+        decoded_chars = self._decoder.decode(input_chunk, eof)
+        self._set_decoded_chars(decoded_chars)
+        if decoded_chars:
+            self._b2cratio = len(input_chunk) / len(self._decoded_chars)
+        else:
+            self._b2cratio = 0.0
 
         if self._telling:
             # At the snapshot point, len(dec_buffer) bytes before the read,
@@ -1709,20 +1715,56 @@
         # forward until it gives us enough decoded characters.
         saved_state = decoder.getstate()
         try:
+            # Fast search for an acceptable start point, close to our
+            # current pos.
+            # Rationale: calling decoder.decode() has a large overhead
+            # regardless of chunk size; we want the number of such calls to
+            # be O(1) in most situations (common decoders, non-crazy input).
+            # Actually, it will be exactly 1 for fixed-size codecs (all
+            # 8-bit codecs, also UTF-16 and UTF-32).
+            skip_bytes = int(self._b2cratio * chars_to_skip)
+            skip_back = 1
+            assert skip_bytes <= len(next_input)
+            while skip_bytes > 0:
+                decoder.setstate((b'', dec_flags))
+                # Decode up to temptative start point
+                n = len(decoder.decode(next_input[:skip_bytes]))
+                if n <= chars_to_skip:
+                    b, d = decoder.getstate()
+                    if not b:
+                        # Before pos and no bytes buffered in decoder => OK
+                        dec_flags = d
+                        chars_to_skip -= n
+                        break
+                    # Skip back by buffered amount and reset heuristic
+                    skip_bytes -= len(b)
+                    skip_back = 1
+                else:
+                    # We're too far ahead, skip back a bit
+                    skip_bytes -= skip_back
+                    skip_back = skip_back * 2
+            else:
+                skip_bytes = 0
+                decoder.setstate((b'', dec_flags))
+
             # Note our initial start point.
-            decoder.setstate((b'', dec_flags))
-            start_pos = position
-            start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0
-            need_eof = 0
+            start_pos = position + skip_bytes
+            start_flags = dec_flags
+            if chars_to_skip == 0:
+                # We haven't moved from the start point.
+                return self._pack_cookie(start_pos, start_flags)
 
             # Feed the decoder one byte at a time.  As we go, note the
             # nearest "safe start point" before the current location
             # (a point where the decoder has nothing buffered, so seek()
             # can safely start from there and advance to this location).
-            next_byte = bytearray(1)
-            for next_byte[0] in next_input:
+            bytes_fed = 0
+            need_eof = 0
+            # Chars decoded since `start_pos`
+            chars_decoded = 0
+            for i in range(skip_bytes, len(next_input)):
                 bytes_fed += 1
-                chars_decoded += len(decoder.decode(next_byte))
+                chars_decoded += len(decoder.decode(next_input[i:i+1]))
                 dec_buffer, dec_flags = decoder.getstate()
                 if not dec_buffer and chars_decoded <= chars_to_skip:
                     # Decoder buffer is empty, so this is a safe start point.

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Fri Feb 25 21:27:33 2011
@@ -35,6 +35,10 @@
 Library
 -------
 
+- Issue #11114: Fix catastrophic performance of tell() on text files (up
+  to 1000x faster in some cases).  It is still one to two order of magnitudes
+  slower than binary tell().
+
 - Issue 10882: Add os.sendfile function.
 
 - Issue #10868: Allow usage of the register method of an ABC as a class

Modified: python/branches/py3k/Modules/_io/textio.c
==============================================================================
--- python/branches/py3k/Modules/_io/textio.c	(original)
+++ python/branches/py3k/Modules/_io/textio.c	Fri Feb 25 21:27:33 2011
@@ -678,12 +678,16 @@
     PyObject *pending_bytes;       /* list of bytes objects waiting to be
                                       written, or NULL */
     Py_ssize_t pending_bytes_count;
-    PyObject *snapshot;
+
     /* snapshot is either None, or a tuple (dec_flags, next_input) where
      * dec_flags is the second (integer) item of the decoder state and
      * next_input is the chunk of input bytes that comes next after the
      * snapshot point.  We use this to reconstruct decoder states in tell().
      */
+    PyObject *snapshot;
+    /* Bytes-to-characters ratio for the current chunk. Serves as input for
+       the heuristic in tell(). */
+    double b2cratio;
 
     /* Cache raw object if it's a FileIO object */
     PyObject *raw;
@@ -850,6 +854,7 @@
     self->decoded_chars_used = 0;
     self->pending_bytes_count = 0;
     self->encodefunc = NULL;
+    self->b2cratio = 0.0;
 
     if (encoding == NULL) {
         /* Try os.device_encoding(fileno) */
@@ -1390,6 +1395,7 @@
     PyObject *dec_flags = NULL;
     PyObject *input_chunk = NULL;
     PyObject *decoded_chars, *chunk_size;
+    Py_ssize_t nbytes, nchars;
     int eof;
 
     /* The return value is True unless EOF was reached.  The decoded string is
@@ -1435,7 +1441,8 @@
         goto fail;
     assert(PyBytes_Check(input_chunk));
 
-    eof = (PyBytes_Size(input_chunk) == 0);
+    nbytes = PyBytes_Size(input_chunk);
+    eof = (nbytes == 0);
 
     if (Py_TYPE(self->decoder) == &PyIncrementalNewlineDecoder_Type) {
         decoded_chars = _PyIncrementalNewlineDecoder_decode(
@@ -1450,7 +1457,12 @@
     if (decoded_chars == NULL)
         goto fail;
     textiowrapper_set_decoded_chars(self, decoded_chars);
-    if (PyUnicode_GET_SIZE(decoded_chars) > 0)
+    nchars = PyUnicode_GET_SIZE(decoded_chars);
+    if (nchars > 0)
+        self->b2cratio = (double) nbytes / nchars;
+    else
+        self->b2cratio = 0.0;
+    if (nchars > 0)
         eof = 0;
 
     if (self->telling) {
@@ -2139,8 +2151,12 @@
     cookie_type cookie = {0,0,0,0,0};
     PyObject *next_input;
     Py_ssize_t chars_to_skip, chars_decoded;
+    Py_ssize_t skip_bytes, skip_back;
     PyObject *saved_state = NULL;
     char *input, *input_end;
+    char *dec_buffer;
+    Py_ssize_t dec_buffer_len;
+    int dec_flags;
 
     CHECK_INITIALIZED(self);
     CHECK_CLOSED(self);
@@ -2176,6 +2192,7 @@
 #else
     cookie.start_pos = PyLong_AsLong(posobj);
 #endif
+    Py_DECREF(posobj);
     if (PyErr_Occurred())
         goto fail;
 
@@ -2190,57 +2207,99 @@
     /* How many decoded characters have been used up since the snapshot? */
     if (self->decoded_chars_used == 0)  {
         /* We haven't moved from the snapshot point. */
-        Py_DECREF(posobj);
         return textiowrapper_build_cookie(&cookie);
     }
 
     chars_to_skip = self->decoded_chars_used;
 
-    /* Starting from the snapshot position, we will walk the decoder
-     * forward until it gives us enough decoded characters.
-     */
+    /* Decoder state will be restored at the end */
     saved_state = PyObject_CallMethodObjArgs(self->decoder,
                                              _PyIO_str_getstate, NULL);
     if (saved_state == NULL)
         goto fail;
 
-    /* Note our initial start point. */
-    if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
-        goto fail;
+#define DECODER_GETSTATE() do { \
+        PyObject *_state = PyObject_CallMethodObjArgs(self->decoder, \
+            _PyIO_str_getstate, NULL); \
+        if (_state == NULL) \
+            goto fail; \
+        if (!PyArg_Parse(_state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) { \
+            Py_DECREF(_state); \
+            goto fail; \
+        } \
+        Py_DECREF(_state); \
+    } while (0)
+
+    /* TODO: replace assert with exception */
+#define DECODER_DECODE(start, len, res) do { \
+        PyObject *_decoded = PyObject_CallMethod( \
+            self->decoder, "decode", "y#", start, len); \
+        if (_decoded == NULL) \
+            goto fail; \
+        assert (PyUnicode_Check(_decoded)); \
+        res = PyUnicode_GET_SIZE(_decoded); \
+        Py_DECREF(_decoded); \
+    } while (0)
+
+    /* Fast search for an acceptable start point, close to our
+       current pos */
+    skip_bytes = (Py_ssize_t) (self->b2cratio * chars_to_skip);
+    skip_back = 1;
+    assert(skip_back <= PyBytes_GET_SIZE(next_input));
+    input = PyBytes_AS_STRING(next_input);
+    while (skip_bytes > 0) {
+        /* Decode up to temptative start point */
+        if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
+            goto fail;
+        DECODER_DECODE(input, skip_bytes, chars_decoded);
+        if (chars_decoded <= chars_to_skip) {
+            DECODER_GETSTATE();
+            if (dec_buffer_len == 0) {
+                /* Before pos and no bytes buffered in decoder => OK */
+                cookie.dec_flags = dec_flags;
+                chars_to_skip -= chars_decoded;
+                break;
+            }
+            /* Skip back by buffered amount and reset heuristic */
+            skip_bytes -= dec_buffer_len;
+            skip_back = 1;
+        }
+        else {
+            /* We're too far ahead, skip back a bit */
+            skip_bytes -= skip_back;
+            skip_back *= 2;
+        }
+    }
+    if (skip_bytes <= 0) {
+        skip_bytes = 0;
+        if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
+            goto fail;
+    }
 
-    /* Feed the decoder one byte at a time.  As we go, note the
-     * nearest "safe start point" before the current location
-     * (a point where the decoder has nothing buffered, so seek()
+    /* Note our initial start point. */
+    cookie.start_pos += skip_bytes;
+    cookie.chars_to_skip = chars_to_skip;
+    if (chars_to_skip == 0)
+        goto finally;
+
+    /* We should be close to the desired position.  Now feed the decoder one
+     * byte at a time until we reach the `chars_to_skip` target.
+     * As we go, note the nearest "safe start point" before the current
+     * location (a point where the decoder has nothing buffered, so seek()
      * can safely start from there and advance to this location).
      */
     chars_decoded = 0;
     input = PyBytes_AS_STRING(next_input);
     input_end = input + PyBytes_GET_SIZE(next_input);
+    input += skip_bytes;
     while (input < input_end) {
-        PyObject *state;
-        char *dec_buffer;
-        Py_ssize_t dec_buffer_len;
-        int dec_flags;
-
-        PyObject *decoded = PyObject_CallMethod(
-            self->decoder, "decode", "y#", input, 1);
-        if (decoded == NULL)
-            goto fail;
-        assert (PyUnicode_Check(decoded));
-        chars_decoded += PyUnicode_GET_SIZE(decoded);
-        Py_DECREF(decoded);
+        Py_ssize_t n;
 
+        DECODER_DECODE(input, 1, n);
+        /* We got n chars for 1 byte */
+        chars_decoded += n;
         cookie.bytes_to_feed += 1;
-
-        state = PyObject_CallMethodObjArgs(self->decoder,
-                                           _PyIO_str_getstate, NULL);
-        if (state == NULL)
-            goto fail;
-        if (!PyArg_Parse(state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) {
-            Py_DECREF(state);
-            goto fail;
-        }
-        Py_DECREF(state);
+        DECODER_GETSTATE();
 
         if (dec_buffer_len == 0 && chars_decoded <= chars_to_skip) {
             /* Decoder buffer is empty, so this is a safe start point. */
@@ -2272,8 +2331,7 @@
         }
     }
 
-    /* finally */
-    Py_XDECREF(posobj);
+finally:
     res = PyObject_CallMethod(self->decoder, "setstate", "(O)", saved_state);
     Py_DECREF(saved_state);
     if (res == NULL)
@@ -2284,8 +2342,7 @@
     cookie.chars_to_skip = Py_SAFE_DOWNCAST(chars_to_skip, Py_ssize_t, int);
     return textiowrapper_build_cookie(&cookie);
 
-  fail:
-    Py_XDECREF(posobj);
+fail:
     if (saved_state) {
         PyObject *type, *value, *traceback;
         PyErr_Fetch(&type, &value, &traceback);