Hello,
I have a problem with numarray, and especially with the function numarray.all.
I want to compare two files. To do this, I read the files with a function,
readcol (in my module readcol2), which can return them as a list or in
numarray format (string or numerical).
I then do a comparison on each line of the files.
If I use the array format and the numarray.all function, it takes
forever to do the comparison for 2 big files. If I use Python list
objects, it's very fast. I think there is a problem, or at least some
room for improvement. If I understand the goal of numarray correctly, it
was written to speed up some parts of Python, but here it slows things down a lot.
A very simple sample showing the effect is at the bottom of this mail.
Thanks for numarray; I hope I'm not bothering you. My comments are meant
to improve numarray more than anything else. I have been able to find the
problem, so now I can avoid it.
H.
def readcol(fname, comments='%', columns=None, delimiter=None, dep=0,
            arraytype='list'):
    """
    Load regular ASCII data from a file and return it as a list or an array.

    The data must be regular: every row must yield the same number of
    values, otherwise ValueError is raised.

    Input:
    - fname     : the name of the file to read, or an already-open file
                  handle (any object with a ``read`` attribute).  A handle
                  passed in by the caller is left open; a file opened here
                  is always closed.

    Optional input:
    - comments  : string that starts a comment; everything from it to the
                  end of the line is ignored.  The default is the matlab
                  character '%'.  An empty string or None disables comment
                  stripping.
    - columns   : list or tuple of the columns to keep, numbered from 1,
                  in the order they should appear in each row.  None keeps
                  every column.
    - delimiter : string that separates the columns (None means any run of
                  whitespace).
    - dep       : number of leading lines to skip (useful to avoid
                  description lines).
    - arraytype : kind of result wanted: 'numeric' for a numarray numeric
                  array (values converted with float), 'numstring' for a
                  numarray character array, anything else (default 'list')
                  for a plain list of rows of stripped strings.

    Returns:
      A list of rows in list mode.  In the two array modes, a numarray
      array; a result with a single row or a single column is flattened to
      one dimension.

    Raises:
      ValueError if the rows do not all have the same number of values (or
      if a value cannot be converted to float in 'numeric' mode).
      IndexError if ``columns`` names a column that a row does not have.

    matfile data is not currently supported, but see
    Nigel Wade's matfile ftp://ion.le.ac.uk/matfile/matfile.tar.gz

    Example usage:
      x,y = transpose(readcol('test.dat'))  # data in two columns
      X = readcol('test.dat')               # a matrix of data
      x = readcol('test.dat')               # a single column of data
      x = readcol('test.dat', '#')          # use '#' as the comment marker

    Initial function from pylab (J.Hunter), changed for specific needs.
    """
    # Honour the documented "filename or file handle" contract; only close
    # what was opened here.
    if hasattr(fname, 'read'):
        fh = fname
        close_fh = False
    else:
        fh = open(fname)
        close_fh = True

    numeric = (arraytype == 'numeric')
    X = []
    numCols = None
    nline = 0
    try:
        for line in fh:
            nline += 1
            if dep is not None and nline <= dep:
                continue
            # split() rather than slicing at find(): find() returns -1 when
            # there is no comment marker, and slicing at -1 silently drops
            # the last character of the line.
            if comments:
                line = line.split(comments, 1)[0]
            line = line.strip()
            if not line:
                continue
            fields = line.split(delimiter)
            if columns is not None:
                # Column numbers are 1-based.
                fields = [fields[i - 1] for i in columns]
            if numeric:
                row = [float(val) for val in fields]
            else:
                row = [val.strip() for val in fields]
            # Remember the width of the first row so that later rows can
            # actually be checked against it.
            if numCols is None:
                numCols = len(row)
            elif len(row) != numCols:
                raise ValueError(
                    'All rows must have the same number of columns')
            X.append(row)
    finally:
        if close_fh:
            fh.close()

    if arraytype in ('numeric', 'numstring'):
        # numarray is imported only when an array result is requested, so
        # list mode works without it.
        if numeric:
            from numarray import array as make_array
        else:
            import numarray.strings  # pb if numeric+pylab
            make_array = numarray.strings.array
        X = make_array(X)
        # An empty file does not give a 2-D array; only unpack the shape
        # when there are really two dimensions.
        if len(X.shape) == 2:
            r, c = X.shape
            if r == 1 or c == 1:
                X.shape = (max(r, c),)
    return X
-------------------------------------------
files_test_creation.py
-------------------------------------------
# Create the two identical 10000-line test files used by the samples below.
# Line i (counting from 0) holds the three space-separated integers
# "i i+1 i+2".
for fname in ('test1.dat', 'test2.dat'):
    # open() is the supported way to create a file object; try/finally
    # makes sure the file is closed even if a write fails.
    fout = open(fname, 'w')
    try:
        for i in range(10000):
            fout.write('%d %d %d\n' % (i, i + 1, i + 2))
    finally:
        fout.close()
-------------------------------------------
numarray_pb_sample.py
-------------------------------------------
import numarray
import readcol2  # module that holds readcol(); it was used without being imported

# Read both test files as numarray character arrays, skipping the first
# line (dep=1) and keeping columns 1-3.
data1 = readcol2.readcol('test1.dat', columns=[1, 2, 3], comments='#',
                         delimiter=' ', dep=1, arraytype='numstring')
data2 = readcol2.readcol('test2.dat', columns=[1, 2, 3], comments='#',
                         delimiter=' ', dep=1, arraytype='numstring')
# or in non-string array form (same result)
## data1 = readcol2.readcol('test1.dat', columns=[1, 2, 3], comments='#',
##                          delimiter=' ', dep=1, arraytype='numeric')
## data2 = readcol2.readcol('test2.dat', columns=[1, 2, 3], comments='#',
##                          delimiter=' ', dep=1, arraytype='numeric')

# Compare every row of data1 with every row of data2 and print the index
# pairs of identical rows.  This is the slow case being reported: each
# inner iteration slices two rows, builds a comparison array and reduces it
# with numarray.all.
for a_i in range(data1.shape[0]):
    for b_i in range(data2.shape[0]):
        if numarray.all(data1[a_i, :] == data2[b_i, :]):
            print("%d %d" % (a_i, b_i))
-------------------------------------------
python_list_sample.py
-------------------------------------------
import readcol2  # module that holds readcol(); it was used without being imported

# Read both test files as plain lists of rows (each row a list of strings),
# skipping the first line (dep=1) and keeping columns 1-3.
data1 = readcol2.readcol('test1.dat', columns=[1, 2, 3], comments='#',
                         delimiter=' ', dep=1, arraytype='list')
data2 = readcol2.readcol('test2.dat', columns=[1, 2, 3], comments='#',
                         delimiter=' ', dep=1, arraytype='list')

# Same all-pairs row comparison as the numarray sample, but on Python
# lists: print the index pairs of identical rows.  The nested loop is kept
# on purpose so that the two samples do the same amount of comparing.
for a_i, row_a in enumerate(data1):
    for b_i, row_b in enumerate(data2):
        if row_a == row_b:
            print("%d %d" % (a_i, b_i))