[Python-checkins] python/nondist/sandbox/csv/util sniffer.py,1.6,1.7

Tue, 18 Mar 2003 16:29:14 -0800

Update of /cvsroot/python/python/nondist/sandbox/csv/util
In directory sc8-pr-cvs1:/tmp/cvs-serv11486

Modified Files:
	sniffer.py 
Log Message:
Made hasHeaders() a method of the Sniffer class.  Changed Sniffer.sniff() to
return a Dialect class rather than a Dialect instance, to be compatible with
csv.register_dialect().

Index: sniffer.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/csv/util/sniffer.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** sniffer.py	15 Mar 2003 01:15:03 -0000	1.6
--- sniffer.py	19 Mar 2003 00:29:12 -0000	1.7
***************
*** 27,30 ****
--- 27,33 ----
          Takes a file-like object and returns a dialect (or None)
          """
+         
+         self.fileobj = fileobj
+         
          data = fileobj.read(self.sample)

***************
*** 37,41 ****
              lineterminator = '\r\n'
              quoting = csv.QUOTE_MINIMAL
!             escapechar = ''
              doublequote = False
          Dialect.delimiter = delimiter
--- 40,44 ----
              lineterminator = '\r\n'
              quoting = csv.QUOTE_MINIMAL
!             # escapechar = ''
              doublequote = False
          Dialect.delimiter = delimiter
***************
*** 43,49 ****
          Dialect.skipinitialspace = skipinitialspace

!         return Dialect()

      def _guessQuoteAndDelimiter(self, data):
          """
--- 46,61 ----
          Dialect.skipinitialspace = skipinitialspace

!         self.dialect = Dialect
!         return self.dialect

+     def hasHeaders(self):
+         return self._hasHeaders(self.fileobj, self.dialect)
+     
+ 
+     def register_dialect(self, name = 'sniffed'):
+         csv.register_dialect(name, self.dialect)
+     
+ 
      def _guessQuoteAndDelimiter(self, data):
          """
***************
*** 200,276 ****

! # ------------------------------------------------------------------------------
! def hasHeaders(fileObj, dialect):
!     # Algorithm: creates a dictionary of types of data in each column. If any column
!     # is of a single type (say, integers), *except* for the first row, then the first
!     # row is presumed to be labels. If the type can't be determined, it is assumed to
!     # be a string in which case the length of the string is the determining factor: if
!     # all of the rows except for the first are the same length, it's a header.
!     # Finally, a 'vote' is taken at the end for each column, adding or subtracting from
!     # the likelihood of the first row being a header. 

!     def seval(item):
!         """
!         Strips parens from item prior to calling eval in an attempt to make it safer
!         """
!         return eval(item.replace('(', '').replace(')', ''))

!     reader = csv.reader(fileObj,
!                         delimiter = dialect.delimiter,
!                         quotechar = dialect.quotechar,
!                         skipinitialspace = dialect.skipinitialspace)

!     header = reader.next() # assume first row is header

!     columns = len(header)
!     columnTypes = {}
!     for i in range(columns): columnTypes[i] = None

!     checked = 0
!     for row in reader:
!         if checked > 20: # arbitrary number of rows to check, to keep it sane
!             break
!         checked += 1
!         
!         if len(row) != columns:
!             continue # skip rows that have irregular number of columns
!         
!         for col in columnTypes.keys():
!             try:
                  try:
!                     # is it a built-in type (besides string)?
!                     thisType = type(seval(row[col]))
!                 except OverflowError:
!                     # a long int?
!                     thisType = type(seval(row[col] + 'L'))
!                     thisType = type(0) # treat long ints as int
!             except:
!                 # fallback to length of string
!                 thisType = len(row[col])

!             if thisType != columnTypes[col]:
!                 if columnTypes[col] is None: # add new column type
!                     columnTypes[col] = thisType
!                 else: # type is inconsistent, remove column from consideration
!                     del columnTypes[col]
!                     
!     # finally, compare results against first row and "vote" on whether it's a header
!     hasHeader = 0
!     for col, colType in columnTypes.items():
!         if type(colType) == type(0): # it's a length
!             if len(header[col]) != colType:
!                 hasHeader += 1
!             else:
!                 hasHeader -= 1
!         else: # attempt typecast
!             try:
!                 eval("%s(%s)" % (colType.__name__, header[col]))
!             except:
!                 hasHeader += 1
!             else:
!                 hasHeader -= 1

-     return hasHeader > 0

-         
--- 212,289 ----

!     def _hasHeaders(self, fileobj, dialect):
!         # Creates a dictionary of types of data in each column. If any column
!         # is of a single type (say, integers), *except* for the first row, then the first
!         # row is presumed to be labels. If the type can't be determined, it is assumed to
!         # be a string in which case the length of the string is the determining factor: if
!         # all of the rows except for the first are the same length, it's a header.
!         # Finally, a 'vote' is taken at the end for each column, adding or subtracting from
!         # the likelihood of the first row being a header. 

!         def seval(item):
!             """
!             Strips parens from item prior to calling eval in an attempt to make it safer
!             """
!             return eval(item.replace('(', '').replace(')', ''))

!         fileobj.seek(0) # rewind the fileobj - this might not work for some file-like objects...

!         reader = csv.reader(fileobj,
!                             delimiter = dialect.delimiter,
!                             quotechar = dialect.quotechar,
!                             skipinitialspace = dialect.skipinitialspace)

!         header = reader.next() # assume first row is header

!         columns = len(header)
!         columnTypes = {}
!         for i in range(columns): columnTypes[i] = None
! 
!         checked = 0
!         for row in reader:
!             if checked > 20: # arbitrary number of rows to check, to keep it sane
!                 break
!             checked += 1
! 
!             if len(row) != columns:
!                 continue # skip rows that have irregular number of columns
! 
!             for col in columnTypes.keys():
                  try:
!                     try:
!                         # is it a built-in type (besides string)?
!                         thisType = type(seval(row[col]))
!                     except OverflowError:
!                         # a long int?
!                         thisType = type(seval(row[col] + 'L'))
!                         thisType = type(0) # treat long ints as int
!                 except:
!                     # fallback to length of string
!                     thisType = len(row[col])

!                 if thisType != columnTypes[col]:
!                     if columnTypes[col] is None: # add new column type
!                         columnTypes[col] = thisType
!                     else: # type is inconsistent, remove column from consideration
!                         del columnTypes[col]
! 
!         # finally, compare results against first row and "vote" on whether it's a header
!         hasHeader = 0
!         for col, colType in columnTypes.items():
!             if type(colType) == type(0): # it's a length
!                 if len(header[col]) != colType:
!                     hasHeader += 1
!                 else:
!                     hasHeader -= 1
!             else: # attempt typecast
!                 try:
!                     eval("%s(%s)" % (colType.__name__, header[col]))
!                 except:
!                     hasHeader += 1
!                 else:
!                     hasHeader -= 1
! 
!         return hasHeader > 0