Pierre GM wrote:
> Sounds like a plan. Wouldn't mind getting more feedback from fellow users before we get too deep, however...
Ok, I've attached, as a first cut, a diff against SVN HEAD that does (I think) what I'm looking for. It passes all of the old tests and passes my own quick test. A more rigorous test suite will follow, but I want this out the door before I need to leave for the day. What this changeset essentially does is just add support for automatic dtypes along with supplying/reading names for flexible dtypes. It leverages StringConverter heavily, using a few tweaks so that old behavior is kept. This is by no means a final version. Probably the biggest change from what I mentioned earlier is that instead of dtype='auto', I've used dtype=None to signal the detection code, since dtype=='auto' causes problems. I welcome any and all suggestions here, both on the code and on the original idea of adding these capabilities to loadtxt(). Ryan -- Ryan May Graduate Research Assistant School of Meteorology University of Oklahoma Index: lib/io.py =================================================================== --- lib/io.py (revision 6099) +++ lib/io.py (working copy) @@ -233,29 +233,138 @@ for name in todel: os.remove(name) -# Adapted from matplotlib +def _string_like(obj): + try: obj + '' + except (TypeError, ValueError): return False + return True -def _getconv(dtype): - typ = dtype.type - if issubclass(typ, np.bool_): - return lambda x: bool(int(x)) - if issubclass(typ, np.integer): - return lambda x: int(float(x)) - elif issubclass(typ, np.floating): - return float - elif issubclass(typ, np.complex): - return complex +def str2bool(value): + """ + Tries to transform a string supposed to represent a boolean to a boolean. + + Raises + ------ + ValueError + If the string is not 'True' or 'False' (case independent) + """ + value = value.upper() + if value == 'TRUE': + return True + elif value == 'FALSE': + return False else: - return str + return int(bool(value)) +class StringConverter(object): + """ + Factory class for function transforming a string into another object (int, + float). 
-def _string_like(obj): - try: obj + '' - except (TypeError, ValueError): return 0 - return 1 + After initialization, an instance can be called to transform a string + into another object. If the string is recognized as representing a missing + value, a default value is returned. + Parameters + ---------- + dtype : dtype, optional + Input data type, used to define a basic function and a default value + for missing data. For example, when `dtype` is float, the :attr:`func` + attribute is set to ``float`` and the default value to `np.nan`. + missing_values : sequence, optional + Sequence of strings indicating a missing value. + + Attributes + ---------- + func : function + Function used for the conversion + default : var + Default value to return when the input corresponds to a missing value. + mapper : sequence of tuples + Sequence of tuples (function, default value) to evaluate in order. + + """ + from numpy.core import nan # To avoid circular import + mapper = [(str2bool, None), + (lambda x: int(float(x)), -1), + (float, nan), + (complex, nan+0j), + (str, '???')] + + def __init__(self, dtype=None, missing_values=None): + if dtype is None: + self.func = str2bool + self.default = None + self._status = 0 + else: + dtype = np.dtype(dtype).type + self.func,self.default,self._status = self._get_from_dtype(dtype) + + # Store the list of strings corresponding to missing values. + if missing_values is None: + self.missing_values = [] + else: + self.missing_values = set(list(missing_values) + ['']) + + def __call__(self, value): + if value in self.missing_values: + return self.default + return self.func(value) + + def upgrade(self, value): + """ + Tries to find the best converter for `value`, by testing different + converters in order. + The order in which the converters are tested is read from the + :attr:`_status` attribute of the instance. 
+ """ + try: + self.__call__(value) + except ValueError: + _statusmax = len(self.mapper) + if self._status == _statusmax: + raise ValueError("Could not find a valid conversion function") + elif self._status < _statusmax - 1: + self._status += 1 + (self.func, self.default) = self.mapper[self._status] + self.upgrade(value) + + def _get_from_dtype(self, dtype): + """ + Sets the :attr:`func` and :attr:`default` attributes for a given dtype. + """ + dtype = np.dtype(dtype).type + if issubclass(dtype, np.bool_): + return (str2bool, 0, 0) + elif issubclass(dtype, np.integer): + return (lambda x: int(float(x)), -1, 1) + elif issubclass(dtype, np.floating): + return (float, np.nan, 2) + elif issubclass(dtype, np.complex): + return (complex, np.nan + 0j, 3) + else: + return (str, '???', -1) + + def update(self, func, default=None, locked=False): + """ + Sets the :attr:`func` and :attr:`default` attributes directly. + + Parameters + ---------- + func : function + Conversion function. + default : var, optional + Default value to return when a missing value is encountered. + locked : bool, optional + Whether this should lock in the function so that no upgrading is + possible. + """ + self.func = func + self.default = default + if locked: + self._status = len(self.mapper) + def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, - skiprows=0, usecols=None, unpack=False): + skiprows=0, usecols=None, unpack=False, names=None): """ Load data from a text file. 
@@ -333,11 +442,10 @@ fh = gzip.open(fname) else: fh = file(fname) - elif hasattr(fname, 'seek'): + elif hasattr(fname, 'readline'): fh = fname else: raise ValueError('fname must be a string or file handle') - X = [] def flatten_dtype(dt): """Unpack a structured data-type.""" @@ -359,10 +467,6 @@ else: return [] - # Make sure we're dealing with a proper dtype - dtype = np.dtype(dtype) - defconv = _getconv(dtype) - # Skip the first `skiprows` lines for i in xrange(skiprows): fh.readline() @@ -377,37 +481,76 @@ first_vals = split_line(first_line) N = len(usecols or first_vals) - dtype_types = flatten_dtype(dtype) - if len(dtype_types) > 1: - # We're dealing with a structured array, each field of - # the dtype matches a column - converters = [_getconv(dt) for dt in dtype_types] + # If names is True, read the field names from the first line + if names == True: + names = first_vals + first_line = '' + + # Make sure we're dealing with a proper dtype + if dtype is None: + converters = [StringConverter() for i in xrange(N)] else: - # All fields have the same dtype - converters = [defconv for i in xrange(N)] + dtype = np.dtype(dtype) + dtype_types = flatten_dtype(dtype) + if len(dtype_types) > 1: + # We're dealing with a structured array, each field of + # the dtype matches a column + converters = [StringConverter(dt) for dt in dtype_types] + names = list(dtype.names) + else: + # All fields have the same dtype + converters = [StringConverter(dtype) for i in xrange(N)] + # If usecols contains a list of names, convert them to column indices + if usecols and _string_like(usecols[0]): + usecols = [names.index(_) for _ in usecols] + # By preference, use the converters specified by the user for i, conv in (user_converters or {}).iteritems(): + # If the converter is specified by column number, convert it to an index + if _string_like(i): + i = names.index(i) if usecols: try: i = usecols.index(i) except ValueError: # Unused converter specified continue - converters[i] = conv + 
converters[i].update(conv, None) # Parse each line, including the first + rows = [] for i, line in enumerate(itertools.chain([first_line], fh)): vals = split_line(line) if len(vals) == 0: continue if usecols: - vals = [vals[i] for i in usecols] + vals = [vals[_] for _ in usecols] - # Convert each value according to its column and store - X.append(tuple([conv(val) for (conv, val) in zip(converters, vals)])) + if dtype is None: + for converter, item in zip(converters, row): + if len(item.strip()): + converter.upgrade(item) + # Store the values + rows.append(tuple(vals)) + + # Convert each value according to its column and store + for i,vals in enumerate(rows): + rows[i] = tuple([conv(val) for (conv, val) in zip(converters, vals)]) + + #Construct final dtype if necessary + if dtype is None: + dtype_types = [np.array(val).dtype for val in rows[0]] + uniform_dtype = all([dtype_types[0] == dt for dt in dtype_types]) + if uniform_dtype and not names: + dtype = dtype_types[0] + else: + if not names: + names = ['column_%d' for i in xrange(N)] + dtype = zip(names, dtype_types) + if len(dtype_types) > 1: # We're dealing with a structured array, with a dtype such as # [('x', int), ('y', [('s', int), ('t', float)])] @@ -416,16 +559,16 @@ # [('x', int), ('s', int), ('t', float)] # # Then, view the array using the specified dtype. - X = np.array(X, dtype=np.dtype([('', t) for t in dtype_types])) - X = X.view(dtype) + rows = np.array(rows, dtype=np.dtype([('', t) for t in dtype_types])) + rows = rows.view(dtype) else: - X = np.array(X, dtype) + rows = np.array(rows, dtype) - X = np.squeeze(X) + rows = np.squeeze(rows) if unpack: - return X.T + return rows.T else: - return X + return rows def savetxt(fname, X, fmt='%.18e',delimiter=' '):