[pypy-svn] pypy default: More optimizations to csv.Reader
amauryfa
commits-noreply at bitbucket.org
Tue Mar 1 15:35:21 CET 2011
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch:
Changeset: r42366:ed70fe1739c6
Date: 2011-03-01 14:46 +0100
http://bitbucket.org/pypy/pypy/changeset/ed70fe1739c6/
Log: More optimizations to csv.Reader
diff --git a/lib_pypy/_csv.py b/lib_pypy/_csv.py
--- a/lib_pypy/_csv.py
+++ b/lib_pypy/_csv.py
@@ -234,11 +234,12 @@
self.line_num += 1
- for c in line:
- if c == '\0':
- raise Error("line contains NULL byte")
- self._parse_process_char(c)
- self._parse_process_char('\0')
+ if '\0' in line:
+ raise Error("line contains NULL byte")
+ pos = 0
+ while pos < len(line):
+ pos = self._parse_process_char(line, pos)
+ self._parse_eol()
if self.state == self.START_RECORD:
break
@@ -247,46 +248,46 @@
self.fields = []
return fields
- def _parse_process_char(self, c):
+ def _parse_process_char(self, line, pos):
+ c = line[pos]
if self.state == self.IN_FIELD:
# in unquoted field
- if c in '\n\r\0':
- # end of line - return [fields]
- self._parse_save_field()
- if c == '\0':
- self.state = self.START_RECORD
+ pos2 = pos
+ while True:
+ if c in '\n\r':
+ # end of line - return [fields]
+ self._parse_save_field()
+ self.state = self.EAT_CRNL
+ elif c == self.dialect.escapechar:
+ # possible escaped character
+ self.state = self.ESCAPED_CHAR
+ elif c == self.dialect.delimiter:
+ # save field - wait for new field
+ self._parse_save_field()
+ self.state = self.START_FIELD
else:
- self.state = self.EAT_CRNL
- elif c == self.dialect.escapechar:
- # possible escaped character
- self.state = self.ESCAPED_CHAR
- elif c == self.dialect.delimiter:
- # save field - wait for new field
- self._parse_save_field()
- self.state = self.START_FIELD
- else:
- # normal character - save in field
- self._parse_add_char(c)
+ # normal character - save in field
+ pos2 += 1
+ c = line[pos2]
+ continue
+ break
+ if pos2 > pos:
+ self._parse_add_char(line[pos:pos2])
+ pos = pos2
elif self.state == self.START_RECORD:
- if c == '\0':
- # empty line - return []
- pass
- elif c in '\n\r':
+ if c in '\n\r':
self.state = self.EAT_CRNL
else:
self.state = self.START_FIELD
# restart process
- self._parse_process_char(c)
+ self._parse_process_char(line, pos)
elif self.state == self.START_FIELD:
- if c in '\n\r\0':
+ if c in '\n\r':
# save empty field - return [fields]
self._parse_save_field()
- if c == '\0':
- self.state = self.START_RECORD
- else:
- self.state = self.EAT_CRNL
+ self.state = self.EAT_CRNL
elif (c == self.dialect.quotechar
and self.dialect.quoting != QUOTE_NONE):
# start quoted field
@@ -308,15 +309,11 @@
self.state = self.IN_FIELD
elif self.state == self.ESCAPED_CHAR:
- if c == '\0':
- c = '\n'
self._parse_add_char(c)
self.state = self.IN_FIELD
elif self.state == self.IN_QUOTED_FIELD:
- if c == '\0':
- pass
- elif c == self.dialect.escapechar:
+ if c == self.dialect.escapechar:
# possible escape character
self.state = self.ESCAPE_IN_QUOTED_FIELD
elif (c == self.dialect.quotechar
@@ -332,8 +329,6 @@
self._parse_add_char(c)
elif self.state == self.ESCAPE_IN_QUOTED_FIELD:
- if c == '\0':
- c = '\n'
self._parse_add_char(c)
self.state = self.IN_QUOTED_FIELD
@@ -348,13 +343,10 @@
# save field - wait for new field
self._parse_save_field()
self.state = self.START_FIELD
- elif c in '\r\n\0':
+ elif c in '\r\n':
# end of line - return [fields]
self._parse_save_field()
- if c == '\0':
- self.state = self.START_RECORD
- else:
- self.state = self.EAT_CRNL
+ self.state = self.EAT_CRNL
elif not self.dialect.strict:
self._parse_add_char(c)
self.state = self.IN_FIELD
@@ -365,8 +357,6 @@
elif self.state == self.EAT_CRNL:
if c in '\r\n':
pass
- elif c == '\0':
- self.state = self.START_RECORD
else:
raise Error("new-line character seen in unquoted field - "
"do you need to open the file "
@@ -375,6 +365,38 @@
else:
raise RuntimeError("unknown state: %r" % (self.state,))
+ return pos + 1
+
+ def _parse_eol(self):
+ if self.state == self.EAT_CRNL:
+ self.state = self.START_RECORD
+ elif self.state == self.START_RECORD:
+ # empty line - return []
+ pass
+ elif self.state == self.IN_FIELD:
+ # in unquoted field
+ # end of line - return [fields]
+ self._parse_save_field()
+ self.state = self.START_RECORD
+ elif self.state == self.START_FIELD:
+ # save empty field - return [fields]
+ self._parse_save_field()
+ self.state = self.START_RECORD
+ elif self.state == self.ESCAPED_CHAR:
+ self._parse_add_char('\n')
+ self.state = self.IN_FIELD
+ elif self.state == self.IN_QUOTED_FIELD:
+ pass
+ elif self.state == self.ESCAPE_IN_QUOTED_FIELD:
+ self._parse_add_char('\n')
+ self.state = self.IN_QUOTED_FIELD
+ elif self.state == self.QUOTE_IN_QUOTED_FIELD:
+ # end of line - return [fields]
+ self._parse_save_field()
+ self.state = self.START_RECORD
+ else:
+ raise RuntimeError("unknown state: %r" % (self.state,))
+
def _parse_save_field(self):
field, self.field = self.field, ''
if self.numeric_field:
@@ -383,7 +405,7 @@
self.fields.append(field)
def _parse_add_char(self, c):
- if len(self.field) >= _field_limit:
+ if len(self.field) + len(c) > _field_limit:
raise Error("field larget than field limit (%d)" % (_field_limit))
self.field += c
More information about the Pypy-commit
mailing list