[Numpy-discussion] convert csv file into recarray without pre-specifying dtypes and variable names

Timothy Hochberg tim.hochberg at ieee.org
Sun Jul 8 23:25:11 EDT 2007


On 7/8/07, Vincent Nijs <v-nijs at kellogg.northwestern.edu> wrote:
>
> Thanks for looking into this Torgil! I agree that this is a much more
> complicated setup. I'll check if there is anything I can do on the data
> end.
> Otherwise I'll go with Timothy's suggestion and read in numbers as floats
> and convert to int later as needed.


Here is a strategy that should allow auto detection without too much in the
way of inefficiency. The basic idea is to convert till you run into a
problem, store that data away, and continue the conversion with a new dtype.
At the end you assemble all the chunks of data you've accumulated into one
large array. It should be reasonably efficient in terms of both memory and
speed.

The implementation is a little rough, but it should get the idea across.

-- 
.  __
.   |-\
.
.  tim.hochberg at ieee.org

========================================================================

def find_formats(items, last):
    """
    Detect a (dtype, converter) pair for every field of one split row.

    Parameters
    ----------
    items : sequence of field strings taken from a single row.
    last : None, or the list returned by a previous call for the same
        columns, i.e. one ``(dtype, converter)`` pair per field.

    Returns
    -------
    A list with one ``(dtype, converter)`` tuple per entry of ``items``.

    Notes
    -----
    A column whose previous converter was ``float`` stays a float column
    even when the current field looks like an int.  Both the dtype and the
    converter are carried over from ``last`` so the pair stays consistent.
    """
    formats = []
    for i, x in enumerate(items):
        # string_to_dt_cvt is defined outside this block; it is unpacked
        # here as a (dtype, converter) pair.
        dt, cvt = string_to_dt_cvt(x)
        if last is not None:
            # Entries of `last` use the same (dtype, converter) layout that
            # this function appends below.
            last_dt, last_cvt = last[i]
            if last_cvt is float and cvt is int:
                # Do not narrow a float column back to int: reuse the
                # previous dtype together with the previous converter.
                dt, cvt = last_dt, last_cvt
        formats.append((dt, cvt))
    return formats

class LoadInfo(object):
    """
    Mutable bookkeeping for a chunked load.

    Attributes
    ----------
    row0 : the raw line handed to the constructor; data_iterator reads it as
        the first row and overwrites it with the row that failed to convert.
    lastcols : starts as None.
    done : starts as False; data_iterator sets it to True once its input is
        exhausted.
    """

    def __init__(self, row0):
        # Raw (unsplit) line that the next chunk starts from.
        self.row0 = row0
        # No column formats from an earlier chunk are known yet.
        self.lastcols = None
        # Nothing has been read to the end yet.
        self.done = False

def data_iterator(lines, converters, delim, info):
    """
    Yield converted rows until a row fails to convert or input runs out.

    The first record is built from ``info.row0``; the following ones from
    ``lines``.  Each row is split on ``delim`` and field ``k`` is passed to
    ``converters[k]``; surplus fields or converters are dropped by ``zip``.

    Parameters
    ----------
    lines : iterable of raw row strings (consumed as far as iteration gets).
    converters : sequence of one-argument callables, one per column.
    delim : separator passed to ``str.split``.
    info : object with ``row0`` and ``done`` attributes.

    Yields
    ------
    One tuple of converted values per row.

    Notes
    -----
    When the generator finishes on its own, one of two things happened:

    * a row from ``lines`` could not be converted: that raw row is stored in
      ``info.row0`` and the generator stops without yielding it;
    * ``lines`` was exhausted: ``info.done`` is set to True.

    A failure while converting ``info.row0`` itself is not caught.  Errors
    raised by iterating ``lines`` and non-``Exception`` signals such as
    ``KeyboardInterrupt`` or ``GeneratorExit`` propagate and leave ``info``
    untouched.
    """
    def convert(row):
        return tuple(f(x) for f, x in zip(converters, row.split(delim)))

    yield convert(info.row0)
    for row in lines:
        # Guard only the conversion, so that closing the generator at the
        # yield below or an I/O error from `lines` is not mistaken for a
        # row that needs a new dtype.
        try:
            record = convert(row)
        except Exception:
            # Remember the offending row so the caller can re-detect the
            # formats from it and start a new chunk.
            info.row0 = row
            return
        yield record
    info.done = True

def load2(fname, delim=',', has_varnm=True, prn_report=True):
    """
    Load a delimited text file into one array, detecting column types.

    The first data row determines the initial column formats.  Rows are
    converted until one fails; the rows read so far are kept as a chunk, the
    formats are detected again from the failing row, and reading continues.
    At the end all chunks are copied into a single array that uses the dtype
    of the last chunk.

    Parameters
    ----------
    fname : path of the file to read.
    delim : field separator passed to ``str.split``.
    has_varnm : if True, the first line holds the column names; otherwise
        the columns are named ``col1``, ``col2``, ...
    prn_report : if True, print a short summary of the loaded data.

    Returns
    -------
    The array built from the chunks (a single chunk is returned as is).

    Notes
    -----
    Written for Python 2: the file is opened in ``'rb'`` mode and read with
    ``file.next()``.  ``info.lastcols`` is never updated in this function,
    so ``find_formats`` always receives ``None`` as its second argument.
    """
    f = open(fname, 'rb')
    try:
        if has_varnm:
            varnames = [i.strip() for i in f.next().split(delim)]
        else:
            varnames = None

        info = LoadInfo(f.next())
        chunks = []

        # Each pass reads one chunk; data_iterator sets info.done once the
        # file is exhausted, or stores the failing row in info.row0.
        while not info.done:
            row0 = info.row0.split(delim)
            formats = find_formats(row0, info.lastcols)
            if varnames is None:
                # No header line: generate col1, col2, ... once.
                varnames = ['col%s' % str(i+1) for i, _ in enumerate(formats)]
            descr = []
            conversion_functions = []
            for name, (dtype, cvt_fn) in zip(varnames, formats):
                descr.append((name, dtype))
                conversion_functions.append(cvt_fn)

            chunks.append(N.fromiter(
                data_iterator(f, conversion_functions, delim, info), descr))
    finally:
        # Release the file handle even if detection or conversion raised.
        f.close()

    if len(chunks) > 1:
        # Copy every chunk into one array typed like the last chunk.
        n = sum(len(x) for x in chunks)
        data = N.zeros([n], chunks[-1].dtype)
        offset = 0
        for x in chunks:
            delta = len(x)
            data[offset:offset+delta] = x
            offset += delta
    else:
        [data] = chunks

    # load report
    if prn_report:
        print("##########################################\n")
        print("Loaded file: %s\n" % fname)
        print("Nr obs: %s\n" % data.shape[0])
        print("Variables and datatypes:\n")
        for i in data.dtype.descr:
            print("Varname: %s, Type: %s, Sample: %s" % (
                i[0], i[1], str(data[i[0]][0:3])))
        # Closing banner is printed once, after all variables are listed.
        print("\n##########################################\n")

    return data
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/numpy-discussion/attachments/20070708/53dd8bc4/attachment.html>


More information about the NumPy-Discussion mailing list