[Python-checkins] cpython (2.7): Issue #15068: Got rid of excessive buffering in the fileinput module.

serhiy.storchaka python-checkins at python.org
Tue Mar 8 11:37:18 EST 2016


https://hg.python.org/cpython/rev/5fbd16326353
changeset:   100459:5fbd16326353
branch:      2.7
parent:      100453:ff2368d5c1d3
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Tue Mar 08 18:35:45 2016 +0200
summary:
  Issue #15068: Got rid of excessive buffering in the fileinput module.
The bufsize parameter is no longer used.

files:
  Doc/library/fileinput.rst  |    6 +
  Lib/fileinput.py           |  150 +++++++++++-------------
  Lib/test/test_fileinput.py |   72 +++++++++++-
  Misc/NEWS                  |    3 +
  4 files changed, 147 insertions(+), 84 deletions(-)


diff --git a/Doc/library/fileinput.rst b/Doc/library/fileinput.rst
--- a/Doc/library/fileinput.rst
+++ b/Doc/library/fileinput.rst
@@ -60,6 +60,9 @@
    .. versionchanged:: 2.5
       Added the *mode* and *openhook* parameters.
 
+   .. versionchanged:: 2.7.12
+      The *bufsize* parameter is no longer used.
+
 The following functions use the global state created by :func:`fileinput.input`;
 if there is no active state, :exc:`RuntimeError` is raised.
 
@@ -143,6 +146,9 @@
    .. versionchanged:: 2.5
       Added the *mode* and *openhook* parameters.
 
+   .. versionchanged:: 2.7.12
+      The *bufsize* parameter is no longer used.
+
 **Optional in-place filtering:** if the keyword argument ``inplace=1`` is passed
 to :func:`fileinput.input` or to the :class:`FileInput` constructor, the file is
 moved to a backup file and standard output is directed to the input file (if a
diff --git a/Lib/fileinput.py b/Lib/fileinput.py
--- a/Lib/fileinput.py
+++ b/Lib/fileinput.py
@@ -64,13 +64,6 @@
 disabled when standard input is read.  XXX The current implementation
 does not work for MS-DOS 8+3 filesystems.
 
-Performance: this module is unfortunately one of the slower ways of
-processing large numbers of input lines.  Nevertheless, a significant
-speed-up has been obtained by using readlines(bufsize) instead of
-readline().  A new keyword argument, bufsize=N, is present on the
-input() function and the FileInput() class to override the default
-buffer size.
-
 XXX Possible additions:
 
 - optional getopt argument processing
@@ -86,6 +79,7 @@
 
 _state = None
 
+# No longer used
 DEFAULT_BUFSIZE = 8*1024
 
 def input(files=None, inplace=0, backup="", bufsize=0,
@@ -207,17 +201,15 @@
         self._files = files
         self._inplace = inplace
         self._backup = backup
-        self._bufsize = bufsize or DEFAULT_BUFSIZE
         self._savestdout = None
         self._output = None
         self._filename = None
-        self._lineno = 0
+        self._startlineno = 0
         self._filelineno = 0
         self._file = None
+        self._readline = self._start_readline
         self._isstdin = False
         self._backupfilename = None
-        self._buffer = []
-        self._bufindex = 0
         # restrict mode argument to reading modes
         if mode not in ('r', 'rU', 'U', 'rb'):
             raise ValueError("FileInput opening mode must be one of "
@@ -242,22 +234,18 @@
         return self
 
     def next(self):
-        try:
-            line = self._buffer[self._bufindex]
-        except IndexError:
-            pass
-        else:
-            self._bufindex += 1
-            self._lineno += 1
+        line = self._readline()
+        if line:
             self._filelineno += 1
             return line
-        line = self.readline()
-        if not line:
+        if not self._file:
             raise StopIteration
-        return line
+        self.nextfile()
+        # Recursive call
+        return self.next()
 
     def __getitem__(self, i):
-        if i != self._lineno:
+        if i != self.lineno():
             raise RuntimeError, "accessing lines out of order"
         try:
             return self.next()
@@ -277,7 +265,8 @@
                 output.close()
         finally:
             file = self._file
-            self._file = 0
+            self._file = None
+            self._readline = self._start_readline
             try:
                 if file and not self._isstdin:
                     file.close()
@@ -289,75 +278,72 @@
                     except OSError: pass
 
                 self._isstdin = False
-                self._buffer = []
-                self._bufindex = 0
 
     def readline(self):
-        try:
-            line = self._buffer[self._bufindex]
-        except IndexError:
-            pass
+        while 1:
+            line = self._readline()
+            if line:
+                self._filelineno += 1
+                return line
+            if not self._file:
+                return line
+            self.nextfile()
+            # repeat with next file
+
+    def _start_readline(self):
+        if not self._files:
+            return ""
+        self._filename = self._files[0]
+        self._files = self._files[1:]
+        self._startlineno = self.lineno()
+        self._filelineno = 0
+        self._file = None
+        self._isstdin = False
+        self._backupfilename = 0
+        if self._filename == '-':
+            self._filename = '<stdin>'
+            self._file = sys.stdin
+            self._isstdin = True
         else:
-            self._bufindex += 1
-            self._lineno += 1
-            self._filelineno += 1
-            return line
-        if not self._file:
-            if not self._files:
-                return ""
-            self._filename = self._files[0]
-            self._files = self._files[1:]
-            self._filelineno = 0
-            self._file = None
-            self._isstdin = False
-            self._backupfilename = 0
-            if self._filename == '-':
-                self._filename = '<stdin>'
-                self._file = sys.stdin
-                self._isstdin = True
+            if self._inplace:
+                self._backupfilename = (
+                    self._filename + (self._backup or os.extsep+"bak"))
+                try: os.unlink(self._backupfilename)
+                except os.error: pass
+                # The next few lines may raise IOError
+                os.rename(self._filename, self._backupfilename)
+                self._file = open(self._backupfilename, self._mode)
+                try:
+                    perm = os.fstat(self._file.fileno()).st_mode
+                except OSError:
+                    self._output = open(self._filename, "w")
+                else:
+                    fd = os.open(self._filename,
+                                    os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
+                                    perm)
+                    self._output = os.fdopen(fd, "w")
+                    try:
+                        if hasattr(os, 'chmod'):
+                            os.chmod(self._filename, perm)
+                    except OSError:
+                        pass
+                self._savestdout = sys.stdout
+                sys.stdout = self._output
             else:
-                if self._inplace:
-                    self._backupfilename = (
-                        self._filename + (self._backup or os.extsep+"bak"))
-                    try: os.unlink(self._backupfilename)
-                    except os.error: pass
-                    # The next few lines may raise IOError
-                    os.rename(self._filename, self._backupfilename)
-                    self._file = open(self._backupfilename, self._mode)
-                    try:
-                        perm = os.fstat(self._file.fileno()).st_mode
-                    except OSError:
-                        self._output = open(self._filename, "w")
-                    else:
-                        fd = os.open(self._filename,
-                                     os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
-                                     perm)
-                        self._output = os.fdopen(fd, "w")
-                        try:
-                            if hasattr(os, 'chmod'):
-                                os.chmod(self._filename, perm)
-                        except OSError:
-                            pass
-                    self._savestdout = sys.stdout
-                    sys.stdout = self._output
+                # This may raise IOError
+                if self._openhook:
+                    self._file = self._openhook(self._filename, self._mode)
                 else:
-                    # This may raise IOError
-                    if self._openhook:
-                        self._file = self._openhook(self._filename, self._mode)
-                    else:
-                        self._file = open(self._filename, self._mode)
-        self._buffer = self._file.readlines(self._bufsize)
-        self._bufindex = 0
-        if not self._buffer:
-            self.nextfile()
-        # Recursive call
-        return self.readline()
+                    self._file = open(self._filename, self._mode)
+
+        self._readline = self._file.readline
+        return self._readline()
 
     def filename(self):
         return self._filename
 
     def lineno(self):
-        return self._lineno
+        return self._startlineno + self._filelineno
 
     def filelineno(self):
         return self._filelineno
diff --git a/Lib/test/test_fileinput.py b/Lib/test/test_fileinput.py
--- a/Lib/test/test_fileinput.py
+++ b/Lib/test/test_fileinput.py
@@ -5,7 +5,7 @@
 
 import unittest
 from test.test_support import verbose, TESTFN, run_unittest
-from test.test_support import unlink as safe_unlink
+from test.test_support import unlink as safe_unlink, check_warnings
 import sys, re
 from StringIO import StringIO
 from fileinput import FileInput, hook_encoded
@@ -28,6 +28,42 @@
     for name in names:
         safe_unlink(name)
 
+class LineReader:
+
+    def __init__(self):
+        self._linesread = []
+
+    @property
+    def linesread(self):
+        try:
+            return self._linesread[:]
+        finally:
+            self._linesread = []
+
+    def openhook(self, filename, mode):
+        self.it = iter(filename.splitlines(True))
+        return self
+
+    def readline(self, size=None):
+        line = next(self.it, '')
+        self._linesread.append(line)
+        return line
+
+    def readlines(self, hint=-1):
+        lines = []
+        size = 0
+        while True:
+            line = self.readline()
+            if not line:
+                return lines
+            lines.append(line)
+            size += len(line)
+            if size >= hint:
+                return lines
+
+    def close(self):
+        pass
+
 class BufferSizesTests(unittest.TestCase):
     def test_buffer_sizes(self):
         # First, run the tests with default and teeny buffer size.
@@ -228,7 +264,7 @@
             f.write('\x80')
         self.addCleanup(safe_unlink, TESTFN)
 
-        fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'), bufsize=8)
+        fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'))
         # The most likely failure is a UnicodeDecodeError due to the entire
         # file being read when it shouldn't have been.
         self.assertEqual(fi.readline(), u'A\n')
@@ -239,6 +275,38 @@
             list(fi)
         fi.close()
 
+    def test_readline_buffering(self):
+        src = LineReader()
+        fi = FileInput(files=['line1\nline2', 'line3\n'], openhook=src.openhook)
+        self.assertEqual(src.linesread, [])
+        self.assertEqual(fi.readline(), 'line1\n')
+        self.assertEqual(src.linesread, ['line1\n'])
+        self.assertEqual(fi.readline(), 'line2')
+        self.assertEqual(src.linesread, ['line2'])
+        self.assertEqual(fi.readline(), 'line3\n')
+        self.assertEqual(src.linesread, ['', 'line3\n'])
+        self.assertEqual(fi.readline(), '')
+        self.assertEqual(src.linesread, [''])
+        self.assertEqual(fi.readline(), '')
+        self.assertEqual(src.linesread, [])
+        fi.close()
+
+    def test_iteration_buffering(self):
+        src = LineReader()
+        fi = FileInput(files=['line1\nline2', 'line3\n'], openhook=src.openhook)
+        self.assertEqual(src.linesread, [])
+        self.assertEqual(next(fi), 'line1\n')
+        self.assertEqual(src.linesread, ['line1\n'])
+        self.assertEqual(next(fi), 'line2')
+        self.assertEqual(src.linesread, ['line2'])
+        self.assertEqual(next(fi), 'line3\n')
+        self.assertEqual(src.linesread, ['', 'line3\n'])
+        self.assertRaises(StopIteration, next, fi)
+        self.assertEqual(src.linesread, [''])
+        self.assertRaises(StopIteration, next, fi)
+        self.assertEqual(src.linesread, [])
+        fi.close()
+
 class Test_hook_encoded(unittest.TestCase):
     """Unit tests for fileinput.hook_encoded()"""
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -58,6 +58,9 @@
 Library
 -------
 
+- Issue #15068: Got rid of excessive buffering in the fileinput module.
+  The bufsize parameter is no longer used.
+
 - Issue #2202: Fix UnboundLocalError in
   AbstractDigestAuthHandler.get_algorithm_impls.  Initial patch by Mathieu Dupuy.
 

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list