Not sure why this is filling my sys memory
Vincent Davis
vincent at vincentdavis.net
Sat Feb 20 20:07:59 EST 2010
> Code is below. The files are about 5 MB and 230,000 rows each. I have 43
> of them, and by the time I get to reading in the 35th my system gets so
> slow that it is nearly unusable. I am on a Mac and Activity Monitor shows
> that Python is using 2.99 GB of memory (of 4 GB) (Python 2.6, 64-bit).
> getsizeof() returns 6424 bytes for alldata, so I am not sure what is
> happening (see the note on getsizeof just below).
> Any ideas?
> Thanks
>
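A note on that getsizeof() figure: sys.getsizeof() counts only the dict object
itself (its hash table), not the lists and row strings it refers to, so the
6424 bytes is far from the real footprint. A rough way to see this
(deep_sizeof below is a hypothetical helper, not part of the standard library):

import sys

def deep_sizeof(obj, seen=None):
    # Recursively add the sizes of contained objects, which plain
    # sys.getsizeof() does not do for containers like dicts and lists.
    if seen is None:
        seen = set()
    if id(obj) in seen:
        return 0
    seen.add(id(obj))
    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        for key, value in obj.iteritems():
            size += deep_sizeof(key, seen) + deep_sizeof(value, seen)
    elif isinstance(obj, (list, tuple, set, frozenset)):
        for item in obj:
            size += deep_sizeof(item, seen)
    return size

sample = {'rows': [['a', 'b', 'c'] for i in range(100000)]}
print sys.getsizeof(sample)   # just the dict object: a few hundred bytes
print deep_sizeof(sample)     # the dict plus everything it references
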
import csv, os, glob
import sys

def read_data_file(filename):
    # Split one tab-delimited file into its data, mask, outlier and modified
    # sections, using the [MASKS]/[OUTLIERS]/[MODIFIED] marker rows.
    reader = csv.reader(open(filename, "U"), delimiter='\t')
    data = []
    mask = []
    outliers = []
    modified = []
    data_append = data.append
    mask_append = mask.append
    outliers_append = outliers.append
    modified_append = modified.append
    maskcount = 0
    outliercount = 0
    modifiedcount = 0
    for row in reader:
        if '[MASKS]' in row:
            maskcount += 1
        if '[OUTLIERS]' in row:
            outliercount += 1
        if '[MODIFIED]' in row:
            modifiedcount += 1
        if not any((maskcount, outliercount, modifiedcount, not row)):
            data_append(row)
        elif not any((outliercount, modifiedcount, not row)):
            mask_append(row)
        elif not any((modifiedcount, not row)):
            outliers_append(row)
        else:
            if row:
                modified_append(row)
    # Drop the leading rows (headers/markers) from each section.
    data = data[1:]
    mask = mask[3:]
    outliers = outliers[3:]
    modified = modified[3:]
    return [data, mask, outliers, modified]

def ImportDataFrom(folder):
    # Read every *.txt file in folder into one dict keyed by
    # <filename>_data, <filename>_mask, <filename>_outliers, <filename>_modified.
    print 'Importing files from: ', folder
    alldata = dict()
    infolder = glob.glob(os.path.join(folder, '*.txt'))
    numfiles = len(infolder)
    print 'Importing ' + str(numfiles) + ' files from: ', folder
    for infile in infolder:
        fname = os.path.split(infile)[1]
        print 'Loading into memory: ' + fname
        filedata = dict(zip([fname + '_data', fname + '_mask',
                             fname + '_outliers', fname + '_modified'],
                            read_data_file(infile)))
        print fname + ' has ' + str(len(filedata[fname + '_data'])) + ' rows of data'
        print fname + ' has ' + str(len(filedata[fname + '_mask'])) + ' rows of masked data'
        print fname + ' has ' + str(len(filedata[fname + '_outliers'])) + ' rows of outliers'
        print fname + ' has ' + str(len(filedata[fname + '_modified'])) + ' modified rows of data'
        print str(sys.getsizeof(filedata)) + ' bytes of memory used for ' + fname
        print ' '
        alldata.update(filedata)
        print str(len(alldata) / 4) + ' files of ' + str(numfiles) + ' using ' \
              + str(sys.getsizeof(alldata)) + ' bytes of memory'
    return alldata

ImportDataFrom("/Users/vmd/Dropbox/dna/data/rawdata")
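If the full set of 43 files simply does not fit in 4 GB as plain Python lists
of strings, one alternative (just a sketch, assuming the downstream analysis
can work on one file at a time) is to generate each file's sections lazily
instead of accumulating everything in alldata:

def iter_data_files(folder):
    # Yield (filename, [data, mask, outliers, modified]) one file at a
    # time, so only the current file's rows are held in memory.
    for infile in glob.glob(os.path.join(folder, '*.txt')):
        fname = os.path.split(infile)[1]
        yield fname, read_data_file(infile)

for fname, (data, mask, outliers, modified) in iter_data_files("/Users/vmd/Dropbox/dna/data/rawdata"):
    print fname + ' has ' + str(len(data)) + ' rows of data'
    # process this file's rows here; they can be garbage collected
    # before the next file is read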