[pypy-svn] pypy default: More optimizations to csv.Reader

amauryfa commits-noreply at bitbucket.org
Tue Mar 1 15:35:21 CET 2011


Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r42366:ed70fe1739c6
Date: 2011-03-01 14:46 +0100
http://bitbucket.org/pypy/pypy/changeset/ed70fe1739c6/

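Log:	More optimizations to csv.Reader: check for NUL bytes once per line
	instead of per character; pass (line, pos) to _parse_process_char() and
	have it return the next position, so that a run of ordinary characters
	in an unquoted field can be added as a single slice; replace the '\0'
	end-of-line sentinel with a dedicated _parse_eol() method; make the
	field-size check in _parse_add_char() account for multi-character
	chunks.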

diff --git a/lib_pypy/_csv.py b/lib_pypy/_csv.py
--- a/lib_pypy/_csv.py
+++ b/lib_pypy/_csv.py
@@ -234,11 +234,12 @@
 
             self.line_num += 1
 
-            for c in line:
-                if c == '\0':
-                    raise Error("line contains NULL byte")
-                self._parse_process_char(c)
-            self._parse_process_char('\0')
+            if '\0' in line:
+                raise Error("line contains NULL byte")
+            pos = 0
+            while pos < len(line):
+                pos = self._parse_process_char(line, pos)
+            self._parse_eol()
 
             if self.state == self.START_RECORD:
                 break
@@ -247,46 +248,46 @@
         self.fields = []
         return fields
             
-    def _parse_process_char(self, c):
+    def _parse_process_char(self, line, pos):
+        c = line[pos]
         if self.state == self.IN_FIELD:
             # in unquoted field
-            if c in '\n\r\0':
-                # end of line - return [fields]
-                self._parse_save_field()
-                if c == '\0':
-                    self.state = self.START_RECORD
+            pos2 = pos
+            while True:
+                if c in '\n\r':
+                    # end of line - return [fields]
+                    self._parse_save_field()
+                    self.state = self.EAT_CRNL
+                elif c == self.dialect.escapechar:
+                    # possible escaped character
+                    self.state = self.ESCAPED_CHAR
+                elif c == self.dialect.delimiter:
+                    # save field - wait for new field
+                    self._parse_save_field()
+                    self.state = self.START_FIELD
                 else:
-                    self.state = self.EAT_CRNL
-            elif c == self.dialect.escapechar:
-                # possible escaped character
-                self.state = self.ESCAPED_CHAR
-            elif c == self.dialect.delimiter:
-                # save field - wait for new field
-                self._parse_save_field()
-                self.state = self.START_FIELD
-            else:
-                # normal character - save in field
-                self._parse_add_char(c)
+                    # normal character - save in field
+                    pos2 += 1
+                    c = line[pos2]
+                    continue
+                break
+            if pos2 > pos:
+                self._parse_add_char(line[pos:pos2])
+                pos = pos2
                 
         elif self.state == self.START_RECORD:
-            if c == '\0':
-                # empty line - return []
-                pass
-            elif c in '\n\r':
+            if c in '\n\r':
                 self.state = self.EAT_CRNL
             else:
                 self.state = self.START_FIELD
                 # restart process
-                self._parse_process_char(c)
+                self._parse_process_char(line, pos)
 
         elif self.state == self.START_FIELD:
-            if c in '\n\r\0':
+            if c in '\n\r':
                 # save empty field - return [fields]
                 self._parse_save_field()
-                if c == '\0':
-                    self.state = self.START_RECORD
-                else:
-                    self.state = self.EAT_CRNL
+                self.state = self.EAT_CRNL
             elif (c == self.dialect.quotechar
                   and self.dialect.quoting != QUOTE_NONE):
                 # start quoted field
@@ -308,15 +309,11 @@
                 self.state = self.IN_FIELD
         
         elif self.state == self.ESCAPED_CHAR:
-            if c == '\0':
-                c = '\n'
             self._parse_add_char(c)
             self.state = self.IN_FIELD
         
         elif self.state == self.IN_QUOTED_FIELD:
-            if c == '\0':
-                pass
-            elif c == self.dialect.escapechar:
+            if c == self.dialect.escapechar:
                 # possible escape character
                 self.state = self.ESCAPE_IN_QUOTED_FIELD
             elif (c == self.dialect.quotechar
@@ -332,8 +329,6 @@
                 self._parse_add_char(c)
                 
         elif self.state == self.ESCAPE_IN_QUOTED_FIELD:
-            if c == '\0':
-                c = '\n'
             self._parse_add_char(c)
             self.state = self.IN_QUOTED_FIELD
                 
@@ -348,13 +343,10 @@
                 # save field - wait for new field
                 self._parse_save_field()
                 self.state = self.START_FIELD
-            elif c in '\r\n\0':
+            elif c in '\r\n':
                 # end of line - return [fields]
                 self._parse_save_field()
-                if c == '\0':
-                    self.state = self.START_RECORD
-                else:
-                    self.state = self.EAT_CRNL
+                self.state = self.EAT_CRNL
             elif not self.dialect.strict:
                 self._parse_add_char(c)
                 self.state = self.IN_FIELD
@@ -365,8 +357,6 @@
         elif self.state == self.EAT_CRNL:
             if c in '\r\n':
                 pass
-            elif c == '\0':
-                self.state = self.START_RECORD
             else:
                 raise Error("new-line character seen in unquoted field - "
                             "do you need to open the file "
@@ -375,6 +365,38 @@
         else:
             raise RuntimeError("unknown state: %r" % (self.state,))
 
+        return pos + 1
+
+    def _parse_eol(self):
+        if self.state == self.EAT_CRNL:
+            self.state = self.START_RECORD
+        elif self.state == self.START_RECORD:
+            # empty line - return []
+            pass
+        elif self.state == self.IN_FIELD:
+            # in unquoted field
+            # end of line - return [fields]
+            self._parse_save_field()
+            self.state = self.START_RECORD
+        elif self.state == self.START_FIELD:
+            # save empty field - return [fields]
+            self._parse_save_field()
+            self.state = self.START_RECORD
+        elif self.state == self.ESCAPED_CHAR:
+            self._parse_add_char('\n')
+            self.state = self.IN_FIELD
+        elif self.state == self.IN_QUOTED_FIELD:
+            pass
+        elif self.state == self.ESCAPE_IN_QUOTED_FIELD:
+            self._parse_add_char('\n')
+            self.state = self.IN_QUOTED_FIELD
+        elif self.state == self.QUOTE_IN_QUOTED_FIELD:
+            # end of line - return [fields]
+            self._parse_save_field()
+            self.state = self.START_RECORD
+        else:
+            raise RuntimeError("unknown state: %r" % (self.state,))
+
     def _parse_save_field(self):
         field, self.field = self.field, ''
         if self.numeric_field:
@@ -383,7 +405,7 @@
         self.fields.append(field)
 
     def _parse_add_char(self, c):
-        if len(self.field) >= _field_limit:
+        if len(self.field) + len(c) > _field_limit:
             raise Error("field larget than field limit (%d)" % (_field_limit))
         self.field += c
         


More information about the Pypy-commit mailing list