Re: [Numpy-discussion] More loadtxt() changes

25 Nov 2008

Pierre GM wrote:
> ...
> Sounds like a plan. Wouldn't mind getting more feedback from fellow
> users before we get too deep, however...

Ok, I've attached, as a first cut, a diff against SVN HEAD that does (I 
think) what I'm looking for.  It passes all of the old tests and passes 
my own quick test.  A more rigorous test suite will follow, but I want 
this out the door before I need to leave for the day.

What this changeset essentially does is add support for automatic dtype
detection, along with supplying or reading field names for flexible
(structured) dtypes.  It leverages StringConverter heavily, with a few
tweaks so that the old behavior is kept.  This is by no means a final version.

Probably the biggest change from what I mentioned earlier is that,
instead of dtype='auto', I've used dtype=None to trigger the detection
code, since the comparison dtype == 'auto' causes problems.

I welcome any and all suggestions here, both on the code and on the 
original idea of adding these capabilities to loadtxt().

Ryan

-- 
Ryan May
Graduate Research Assistant
School of Meteorology
University of Oklahoma

Index: lib/io.py
===================================================================

--- lib/io.py	(revision 6099)
+++ lib/io.py	(working copy)
@@ -233,29 +233,138 @@
     for name in todel:
         os.remove(name)
 
-# Adapted from matplotlib
+def _string_like(obj):  # True when `obj + ''` works, i.e. obj acts like a str
+    try: obj + ''
+    except (TypeError, ValueError): return False
+    return True
 
-def _getconv(dtype):
-    typ = dtype.type
-    if issubclass(typ, np.bool_):
-        return lambda x: bool(int(x))
-    if issubclass(typ, np.integer):
-        return lambda x: int(float(x))
-    elif issubclass(typ, np.floating):
-        return float
-    elif issubclass(typ, np.complex):
-        return complex
+def str2bool(value):
+    """
+    Convert 'True'/'False' (any case) or an integer string to a bool.
+
+    Raises
+    ------
+    ValueError
+        If the string is neither 'True'/'False' nor a valid integer.
+    """
+    value = value.upper()
+    if value == 'TRUE':
+        return True
+    elif value == 'FALSE':
+        return False
     else:
-        return str
+        return bool(int(value))  # int() first so that '0' gives False
 
+class StringConverter(object):
+    """
+    Factory class for functions transforming a string into another object
+    (bool, int, float, complex or str).
 
-def _string_like(obj):
-    try: obj + ''
-    except (TypeError, ValueError): return 0
-    return 1
+    After initialization, an instance can be called to transform a string
+    into another object. If the string is recognized as representing a missing
+    value, a default value is returned.
 
+    Parameters
+    ----------
+    dtype : dtype, optional
+        Input data type, used to define a basic function and a default value
+        for missing data. For example, when `dtype` is float, the :attr:`func`
+        attribute is set to ``float`` and the default value to `np.nan`.
+    missing_values : sequence, optional
+        Sequence of strings indicating a missing value.
+
+    Attributes
+    ----------
+    func : function
+        Function used for the conversion
+    default : var
+        Default value to return when the input corresponds to a missing value.
+    mapper : sequence of tuples
+        Sequence of tuples (function, default value) to evaluate in order.
+
+    """
+    from numpy.core import nan # To avoid circular import
+    mapper = [(str2bool, None),
+              (lambda x: int(float(x)), -1),
+              (float, nan),
+              (complex, nan+0j),
+              (str, '???')]
+
+    def __init__(self, dtype=None, missing_values=None):
+        if dtype is None:
+            self.func = str2bool
+            self.default = None
+            self._status = 0  # index of str2bool in `mapper`
+        else:
+            dtype = np.dtype(dtype).type
+            self.func,self.default,self._status = self._get_from_dtype(dtype)
+
+        # Store the missing-value strings ('' is always added to a given list).
+        if missing_values is None:
+            self.missing_values = []
+        else:
+            self.missing_values = set(list(missing_values) + [''])
+
+    def __call__(self, value):
+        if value in self.missing_values:
+            return self.default
+        return self.func(value)
+
+    def upgrade(self, value):
+        """
+        Find the first converter that accepts `value`, advancing through
+        :attr:`mapper` from the position stored in :attr:`_status`.
+        Raises ValueError when the converter is locked (see `update`) and
+        the current function rejects `value`.
+        """
+        try:
+            self.__call__(value)
+        except ValueError:
+            _statusmax = len(self.mapper)
+            if self._status == _statusmax:
+                raise ValueError("Could not find a valid conversion function")
+            elif self._status < _statusmax - 1:
+                self._status += 1
+            (self.func, self.default) = self.mapper[self._status]
+            self.upgrade(value)
+
+    def _get_from_dtype(self, dtype):
+        """
+        Returns the (func, default, status) tuple matching a given dtype.
+        """
+        dtype = np.dtype(dtype).type
+        if issubclass(dtype, np.bool_):
+            return (str2bool, 0, 0)
+        elif issubclass(dtype, np.integer):
+            return (lambda x: int(float(x)), -1, 1)
+        elif issubclass(dtype, np.floating):
+            return (float, np.nan, 2)
+        elif issubclass(dtype, np.complex):
+            return (complex, np.nan + 0j, 3)
+        else:
+            return (str, '???', -1)  # status -1: mapper[-1] is the str entry
+
+    def update(self, func, default=None, locked=False):
+        """
+        Sets the :attr:`func` and :attr:`default` attributes directly.
+
+        Parameters
+        ----------
+        func : function
+            Conversion function.
+        default : var, optional
+            Default value to return when a missing value is encountered.
+        locked : bool, optional
+            Whether this should lock in the function so that no upgrading is
+            possible.
+        """
+        self.func = func
+        self.default = default
+        if locked:
+            self._status = len(self.mapper)
+
 def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None,
-            skiprows=0, usecols=None, unpack=False):
+            skiprows=0, usecols=None, unpack=False, names=None):
     """
     Load data from a text file.
 
@@ -333,11 +442,10 @@
             fh = gzip.open(fname)
         else:
             fh = file(fname)
-    elif hasattr(fname, 'seek'):
+    elif hasattr(fname, 'readline'):
         fh = fname
     else:
         raise ValueError('fname must be a string or file handle')
-    X = []
 
     def flatten_dtype(dt):
         """Unpack a structured data-type."""
@@ -359,10 +467,6 @@
         else:
             return []
 
-    # Make sure we're dealing with a proper dtype
-    dtype = np.dtype(dtype)
-    defconv = _getconv(dtype)
-
     # Skip the first `skiprows` lines
     for i in xrange(skiprows):
         fh.readline()
@@ -377,37 +481,76 @@
         first_vals = split_line(first_line)
     N = len(usecols or first_vals)
 
-    dtype_types = flatten_dtype(dtype)
-    if len(dtype_types) > 1:
-        # We're dealing with a structured array, each field of
-        # the dtype matches a column
-        converters = [_getconv(dt) for dt in dtype_types]
+    # If names is True, read the field names from the first line
+    if names is True:
+        names = first_vals
+        first_line = ''
+
+    # Make sure we're dealing with a proper dtype
+    if dtype is None:
+        converters = [StringConverter() for i in xrange(N)]
     else:
-        # All fields have the same dtype
-        converters = [defconv for i in xrange(N)]
+        dtype = np.dtype(dtype)
+        dtype_types = flatten_dtype(dtype)
+        if len(dtype_types) > 1:
+            # We're dealing with a structured array, each field of
+            # the dtype matches a column
+            converters = [StringConverter(dt) for dt in dtype_types]
+            names = list(dtype.names)
+        else:
+            # All fields have the same dtype
+            converters = [StringConverter(dtype) for i in xrange(N)]
 
+    # If usecols contains a list of names, convert them to column indices
+    if usecols and _string_like(usecols[0]):
+        usecols = [names.index(_) for _ in usecols]
+
     # By preference, use the converters specified by the user
     for i, conv in (user_converters or {}).iteritems():
+        # If the converter is specified by column name, convert it to an index
+        if _string_like(i):
+            i = names.index(i)
         if usecols:
             try:
                 i = usecols.index(i)
             except ValueError:
                 # Unused converter specified
                 continue
-        converters[i] = conv
+        converters[i].update(conv, None)
 
     # Parse each line, including the first
+    rows = []
     for i, line in enumerate(itertools.chain([first_line], fh)):
         vals = split_line(line)
         if len(vals) == 0:
             continue
 
         if usecols:
-            vals = [vals[i] for i in usecols]
+            vals = [vals[_] for _ in usecols]
 
-        # Convert each value according to its column and store
-        X.append(tuple([conv(val) for (conv, val) in zip(converters, vals)]))
+        if dtype is None:
+            for converter, item in zip(converters, vals):
+                if item.strip():
+                    converter.upgrade(item)
 
+        # Store the values
+        rows.append(tuple(vals))
+
+    # Convert each value according to its column and store
+    for i,vals in enumerate(rows):
+        rows[i] = tuple([conv(val) for (conv, val) in zip(converters, vals)])
+
+    # Construct the final dtype if it was auto-detected
+    if dtype is None:
+        dtype_types = [np.array(val).dtype for val in rows[0]]
+        if not names and all([dtype_types[0] == dt for dt in dtype_types]):
+            # Homogeneous columns: keep one type so a plain array is built
+            dtype_types = dtype_types[:1]
+            dtype = dtype_types[0]
+        else:
+            names = names or ['column_%d' % i for i in xrange(N)]
+            dtype = zip(names, dtype_types)
+
     if len(dtype_types) > 1:
         # We're dealing with a structured array, with a dtype such as
         # [('x', int), ('y', [('s', int), ('t', float)])]
@@ -416,16 +559,16 @@
         # [('x', int), ('s', int), ('t', float)]
         #
         # Then, view the array using the specified dtype.
-        X = np.array(X, dtype=np.dtype([('', t) for t in dtype_types]))
-        X = X.view(dtype)
+        rows = np.array(rows, dtype=np.dtype([('', t) for t in dtype_types]))
+        rows = rows.view(dtype)
     else:
-        X = np.array(X, dtype)
+        rows = np.array(rows, dtype)
 
-    X = np.squeeze(X)
+    rows = np.squeeze(rows)
     if unpack:
-        return X.T
+        return rows.T
     else:
-        return X
+        return rows
 
 
 def savetxt(fname, X, fmt='%.18e',delimiter=' '):