[Python-checkins] r70208 - in python/branches/py3k: Lib/test/test_io.py Modules/_textio.c

antoine.pitrou python-checkins at python.org
Sat Mar 7 00:40:56 CET 2009


Author: antoine.pitrou
Date: Sat Mar  7 00:40:56 2009
New Revision: 70208

Log:
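Issue #5433: Fix the excessive newline detection optimization in
IncrementalNewlineDecoder. The bytewise memchr() scan could mistake a 0x0A
byte that is part of another character (e.g. U+0A00) for "\n" and wrongly
record SEEN_LF; a memchr() hit is now confirmed by a character-by-character
scan before SEEN_LF is set.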



Modified:
   python/branches/py3k/Lib/test/test_io.py
   python/branches/py3k/Modules/_textio.c

Modified: python/branches/py3k/Lib/test/test_io.py
==============================================================================
--- python/branches/py3k/Lib/test/test_io.py	(original)
+++ python/branches/py3k/Lib/test/test_io.py	Sat Mar  7 00:40:56 2009
@@ -1915,6 +1915,19 @@
         decoder = self.IncrementalNewlineDecoder(decoder, translate=True)
         self.check_newline_decoding_utf8(decoder)
 
+    def test_newline_bytes(self):
+        # Issue 5433: Excessive optimization in IncrementalNewlineDecoder
+        def _check(dec):
+            self.assertEquals(dec.newlines, None)
+            self.assertEquals(dec.decode("\u0D00"), "\u0D00")
+            self.assertEquals(dec.newlines, None)
+            self.assertEquals(dec.decode("\u0A00"), "\u0A00")
+            self.assertEquals(dec.newlines, None)
+        dec = self.IncrementalNewlineDecoder(None, translate=False)
+        _check(dec)
+        dec = self.IncrementalNewlineDecoder(None, translate=True)
+        _check(dec)
+
 class CIncrementalNewlineDecoderTest(IncrementalNewlineDecoderTest):
     pass
 

Modified: python/branches/py3k/Modules/_textio.c
==============================================================================
--- python/branches/py3k/Modules/_textio.c	(original)
+++ python/branches/py3k/Modules/_textio.c	Sat Mar  7 00:40:56 2009
@@ -305,22 +305,40 @@
            for the \r *byte* with the libc's optimized memchr.
            */
         if (seennl == SEEN_LF || seennl == 0) {
-            int has_cr, has_lf;
-            has_lf = (seennl == SEEN_LF) ||
-                    (memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL);
-            has_cr = (memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
-            if (has_lf && !has_cr) {
-                only_lf = 1;
-                seennl = SEEN_LF;
-            }
+            only_lf = !(memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
         }
 
-        if (!self->translate) {
+        if (only_lf) {
+            /* If not already seen, quick scan for a possible "\n" character.
+               (there's nothing else to be done, even when in translation mode)
+            */
+            if (seennl == 0 &&
+                memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL) {
+                Py_UNICODE *s, *end;
+                s = in_str;
+                end = in_str + len;
+                for (;;) {
+                    Py_UNICODE c;
+                    /* Fast loop for non-control characters */
+                    while (*s > '\n')
+                        s++;
+                    c = *s++;
+                    if (c == '\n') {
+                        seennl |= SEEN_LF;
+                        break;
+                    }
+                    if (s > end)
+                        break;
+                }
+            }
+            /* Finished: we have scanned for newlines, and none of them
+               need translating */
+        }
+        else if (!self->translate) {
             Py_UNICODE *s, *end;
+            /* We have already seen all newline types, no need to scan again */
             if (seennl == SEEN_ALL)
                 goto endscan;
-            if (only_lf)
-                goto endscan;
             s = in_str;
             end = in_str + len;
             for (;;) {
@@ -347,7 +365,7 @@
         endscan:
             ;
         }
-        else if (!only_lf) {
+        else {
             PyObject *translated = NULL;
             Py_UNICODE *out_str;
             Py_UNICODE *in, *out, *end;


More information about the Python-checkins mailing list