# CSV performance

Scott David Daniels Scott.Daniels at Acm.Org
Mon Apr 27 20:20:50 CEST 2009

```
psaffrey at googlemail.com wrote:
> Thanks for your replies. Many apologies for not including the right
Here is another way to try (untested):

import numpy
import time

# Maps each chromosome name to a one-character code, so that a whole
# column of names can be stored as a single compact string.
chrommap = {
    'chrY': 'y', 'chrX': 'x',
    'chr13': 'c', 'chr12': 'b', 'chr11': 'a', 'chr10': '0',
    'chr17': 'g', 'chr16': 'f', 'chr15': 'e', 'chr14': 'd',
    'chr19': 'i', 'chr18': 'h',
    'chrM': 'm',
    'chr22': 'l', 'chr20': 'j', 'chr21': 'k',
    'chr7': '7', 'chr6': '6', 'chr5': '5', 'chr4': '4',
    'chr3': '3', 'chr2': '2', 'chr1': '1', 'chr9': '9', 'chr8': '8',
}

def _flush_chunk(lx, cx, px, letter_blocks, coord_blocks, point_blocks):
    """Pack the pending per-line lists into compact blocks, then clear them."""
    letter_blocks.append(''.join(lx))
    # numpy parses the numeric strings in bulk here, instead of one
    # int()/float() call per line in the read loop.
    coord_blocks.append(numpy.array(cx, dtype=int))
    point_blocks.append(numpy.array(px, dtype=float))
    # Clear in place so the caller's list objects are reused.
    del lx[:], cx[:], px[:]


def consume_file(file_name, chunks):
    """Read a three-column, whitespace-separated file into compact columns.

    Each non-blank line must hold exactly three fields: a chromosome name
    (a key of ``chrommap``), a coordinate and a point value.  Lines are
    buffered in Python lists and converted to numpy arrays every
    ``chunks`` lines, which bounds the size of the temporary lists.

    Parameters:
        file_name: path of the text file to read.
        chunks: number of lines to buffer before converting a block.

    Returns:
        A tuple ``(letters, coords, points)`` where ``letters`` is a string
        with one ``chrommap`` code per line, ``coords`` is an int array and
        ``points`` is a float array.  An empty file yields an empty string
        and two empty arrays.

    Raises:
        KeyError: if a chromosome name is not in ``chrommap``.
        ValueError: if a non-blank line does not have exactly three fields,
            or a coordinate/point cannot be parsed as a number.
    """
    lx, cx, px = [], [], []
    letter_blocks, coord_blocks, point_blocks = [], [], []

    with open(file_name) as fh:
        for line in fh:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            chrom, coord, point = fields
            lx.append(chrommap[chrom])
            cx.append(coord)
            px.append(point)
            if len(cx) >= chunks:
                _flush_chunk(lx, cx, px,
                             letter_blocks, coord_blocks, point_blocks)

    if lx:
        # Final partial chunk; converted with the same dtypes as full chunks.
        _flush_chunk(lx, cx, px, letter_blocks, coord_blocks, point_blocks)

    if not letter_blocks:
        # numpy.concatenate() rejects an empty sequence of arrays.
        return '', numpy.array([], dtype=int), numpy.array([], dtype=float)

    return (''.join(letter_blocks),
            numpy.concatenate(coord_blocks),
            numpy.concatenate(point_blocks))

# Benchmark consume_file() over a range of chunk sizes.
# 128 is listed twice on purpose so the first pass absorbs initial read
# issues (presumably a cold file cache; the original note does not say).

# time.clock() was removed in Python 3.8, so use perf_counter() where it
# exists and fall back to clock() on older interpreters.
# NOTE: perf_counter() is wall-clock time; clock() was CPU time on Unix.
_timer = getattr(time, "perf_counter", None) or time.clock

for CHUNKS in 128, 128, 256, 1024, 4096, 16384:
    t0 = _timer()
    letters, coords, points = consume_file("largefile.txt", CHUNKS)
    t1 = _timer()
    # "%.2f" (not "%f.2") gives the elapsed seconds to two decimals.
    print("finished %s in %s chunks: %.2f" % (
        len(letters), CHUNKS, t1 - t0))

--Scott David Daniels
Scott.Daniels at Acm.Org

```