[Python-checkins] python/nondist/sandbox/csv/util sniffer.py,1.6,1.7
cliffwells18@users.sourceforge.net
cliffwells18@users.sourceforge.net
Tue, 18 Mar 2003 16:29:14 -0800
Update of /cvsroot/python/python/nondist/sandbox/csv/util
In directory sc8-pr-cvs1:/tmp/cvs-serv11486
Modified Files:
sniffer.py
Log Message:
Made hasHeaders() a method of the Sniffer class. Changed Sniffer.sniff() to
return a dialect class rather than a dialect instance, to be compatible with
csv.register_dialect().
Index: sniffer.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/csv/util/sniffer.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** sniffer.py 15 Mar 2003 01:15:03 -0000 1.6
--- sniffer.py 19 Mar 2003 00:29:12 -0000 1.7
***************
*** 27,30 ****
--- 27,33 ----
Takes a file-like object and returns a dialect (or None)
"""
+
+ self.fileobj = fileobj
+
data = fileobj.read(self.sample)
***************
*** 37,41 ****
lineterminator = '\r\n'
quoting = csv.QUOTE_MINIMAL
! escapechar = ''
doublequote = False
Dialect.delimiter = delimiter
--- 40,44 ----
lineterminator = '\r\n'
quoting = csv.QUOTE_MINIMAL
! # escapechar = ''
doublequote = False
Dialect.delimiter = delimiter
***************
*** 43,49 ****
Dialect.skipinitialspace = skipinitialspace
! return Dialect()
def _guessQuoteAndDelimiter(self, data):
"""
--- 46,61 ----
Dialect.skipinitialspace = skipinitialspace
! self.dialect = Dialect
! return self.dialect
+ def hasHeaders(self):
+ return self._hasHeaders(self.fileobj, self.dialect)
+
+
+ def register_dialect(self, name = 'sniffed'):
+ csv.register_dialect(name, self.dialect)
+
+
def _guessQuoteAndDelimiter(self, data):
"""
***************
*** 200,276 ****
! # ------------------------------------------------------------------------------
! def hasHeaders(fileObj, dialect):
! # Algorithm: creates a dictionary of types of data in each column. If any column
! # is of a single type (say, integers), *except* for the first row, then the first
! # row is presumed to be labels. If the type can't be determined, it is assumed to
! # be a string in which case the length of the string is the determining factor: if
! # all of the rows except for the first are the same length, it's a header.
! # Finally, a 'vote' is taken at the end for each column, adding or subtracting from
! # the likelihood of the first row being a header.
! def seval(item):
! """
! Strips parens from item prior to calling eval in an attempt to make it safer
! """
! return eval(item.replace('(', '').replace(')', ''))
! reader = csv.reader(fileObj,
! delimiter = dialect.delimiter,
! quotechar = dialect.quotechar,
! skipinitialspace = dialect.skipinitialspace)
! header = reader.next() # assume first row is header
! columns = len(header)
! columnTypes = {}
! for i in range(columns): columnTypes[i] = None
! checked = 0
! for row in reader:
! if checked > 20: # arbitrary number of rows to check, to keep it sane
! break
! checked += 1
!
! if len(row) != columns:
! continue # skip rows that have irregular number of columns
!
! for col in columnTypes.keys():
! try:
try:
! # is it a built-in type (besides string)?
! thisType = type(seval(row[col]))
! except OverflowError:
! # a long int?
! thisType = type(seval(row[col] + 'L'))
! thisType = type(0) # treat long ints as int
! except:
! # fallback to length of string
! thisType = len(row[col])
! if thisType != columnTypes[col]:
! if columnTypes[col] is None: # add new column type
! columnTypes[col] = thisType
! else: # type is inconsistent, remove column from consideration
! del columnTypes[col]
!
! # finally, compare results against first row and "vote" on whether it's a header
! hasHeader = 0
! for col, colType in columnTypes.items():
! if type(colType) == type(0): # it's a length
! if len(header[col]) != colType:
! hasHeader += 1
! else:
! hasHeader -= 1
! else: # attempt typecast
! try:
! eval("%s(%s)" % (colType.__name__, header[col]))
! except:
! hasHeader += 1
! else:
! hasHeader -= 1
- return hasHeader > 0
-
--- 212,289 ----
! def _hasHeaders(self, fileobj, dialect):
! # Creates a dictionary of types of data in each column. If any column
! # is of a single type (say, integers), *except* for the first row, then the first
! # row is presumed to be labels. If the type can't be determined, it is assumed to
! # be a string in which case the length of the string is the determining factor: if
! # all of the rows except for the first are the same length, it's a header.
! # Finally, a 'vote' is taken at the end for each column, adding or subtracting from
! # the likelihood of the first row being a header.
! def seval(item):
! """
! Strips parens from item prior to calling eval in an attempt to make it safer
! """
! return eval(item.replace('(', '').replace(')', ''))
! fileobj.seek(0) # rewind the fileobj - this might not work for some file-like objects...
! reader = csv.reader(fileobj,
! delimiter = dialect.delimiter,
! quotechar = dialect.quotechar,
! skipinitialspace = dialect.skipinitialspace)
! header = reader.next() # assume first row is header
! columns = len(header)
! columnTypes = {}
! for i in range(columns): columnTypes[i] = None
!
! checked = 0
! for row in reader:
! if checked > 20: # arbitrary number of rows to check, to keep it sane
! break
! checked += 1
!
! if len(row) != columns:
! continue # skip rows that have irregular number of columns
!
! for col in columnTypes.keys():
try:
! try:
! # is it a built-in type (besides string)?
! thisType = type(seval(row[col]))
! except OverflowError:
! # a long int?
! thisType = type(seval(row[col] + 'L'))
! thisType = type(0) # treat long ints as int
! except:
! # fallback to length of string
! thisType = len(row[col])
! if thisType != columnTypes[col]:
! if columnTypes[col] is None: # add new column type
! columnTypes[col] = thisType
! else: # type is inconsistent, remove column from consideration
! del columnTypes[col]
!
! # finally, compare results against first row and "vote" on whether it's a header
! hasHeader = 0
! for col, colType in columnTypes.items():
! if type(colType) == type(0): # it's a length
! if len(header[col]) != colType:
! hasHeader += 1
! else:
! hasHeader -= 1
! else: # attempt typecast
! try:
! eval("%s(%s)" % (colType.__name__, header[col]))
! except:
! hasHeader += 1
! else:
! hasHeader -= 1
!
! return hasHeader > 0