Hi,
I think there are a problem with numarray (not sure).
I'm trying to correlate two differents file to find the same object in
both. To do this I wrote some ugly software and I'm using the
readcol2.py to read the file in a numarray, numarray string or list format.
The cross_name.py is doing the cross correlation when I'm using the
numarray string format. I'm using three parameters at differents columns
and I compare all of them with something like:
numarray.all(a[i,:] == b[j,:])
I saw that my script is very very slow or to be more precise became to
be slow. It's seems ok at the beginning but little by little is slow
down by a huge amount. I let it turn all the week end and it found ~40
000 objects (both files are ~200000 lines...) in common in two days.
I change the software to use the list in python and in some minutes
I'have ~20 000 objects found in common. So I think there are a big
problem probably: 1) in my script, perhaps 2) in numarray or 3) in both.
I hope to have explain the problem clearly ...
N.
ps: I print an output for the script cross_name.py to visually see the
slow down and that appeard to became slow around the 700 objects in
common but it's gradully decline.
pps: I join the different file I used. The cross_name.py is the function
with the problem.
-------------------------------------
#readcol2.py
-------------------------------------
def
readcol(fname,comments='%',columns=None,delimiter=None,dep=0,arraytype='list'):
"""
Load ASCII data from fname into an array and return the array.
The data must be regular, same number of values in every row
fname can be a filename or a file handle.
Input:
- Fname : the name of the file to read
Optionnal input:
- comments : a string to indicate the charactor to delimit the
domments.
the default is the matlab character '%'.
- columns : list or tuple ho contains the columns to use.
- delimiter : a string to delimit the columns
- dep : an integer to indicate from which line you want to begin
to use the file (useful to avoid the descriptions lines)
- arraytype : a string to indicate which kind of array you want ot
have: numeric array (numeric) or character array
(numstring) or list (list). By default it's the
list mode used
matfile data is not currently supported, but see
Nigel Wade's matfile ftp://ion.le.ac.uk/matfile/matfile.tar.gz
Example usage:
x,y = transpose(readcol('test.dat')) # data in two columns
X = readcol('test.dat') # a matrix of data
x = readcol('test.dat') # a single column of data
x = readcol('test.dat,'#') # the character use like a comment
delimiter is '#'
initial function from pylab, improve by myself for my need
"""
from numarray import array,transpose
fh = file(fname)
X = []
numCols = None
nline = 0
if columns is None:
for line in fh:
nline += 1
if dep is not None and nline <= dep: continue
line = line[:line.find(comments)].strip()
if not len(line): continue
if arraytype=='numeric':
row = [float(val) for val in line.split(delimiter)]
else:
row = [val.strip() for val in line.split(delimiter)]
thisLen = len(row)
if numCols is not None and thisLen != numCols:
raise ValueError('All rows must have the same number of
columns')
X.append(row)
else:
for line in fh:
nline +=1
if dep is not None and nline <= dep: continue
line = line[:line.find(comments)].strip()
if not len(line): continue
row = line.split(delimiter)
if arraytype=='numeric':
row = [float(row[i-1]) for i in columns]
elif arraytype=='numstring':
row = [row[i-1].strip() for i in columns]
else:
row = [row[i-1].strip() for i in columns]
thisLen = len(row)
if numCols is not None and thisLen != numCols:
raise ValueError('All rows must have the same number of
columns')
X.append(row)
if arraytype=='numeric':
X = array(X)
r,c = X.shape
if r==1 or c==1:
X.shape = max([r,c]),
elif arraytype == 'numstring':
import numarray.strings # pb si numeric+pylab
X = numarray.strings.array(X)
r,c = X.shape
if r==1 or c==1:
X.shape = max([r,c]),
return X
----------------------------------------------------------------
#cross_name.py
----------------------------------------------------------------
#/usr/bin/env python
'''
Software to cross correlate two files. To use it you had to file a
params file
who contains the information of the file you want to correlate.
The information must have the format:
namefile = list of column ; delimiter
example:
file1 = 1,2,3 ;
file2 = 20,19,21 ; ,
no delimiter = blanck
'''
# there are a big problem of efficiency. The software is far to long
with big file like SDSS.
# I had to find where is the problem
import sys
import numarray
import string
#read the params file
params = {}
for line in file(sys.argv[1],'rU'):
line = line.strip() # delete the end of line (\n on unix)
if not len(line): continue # is line empty do nothing and pass to
the next line
if line.startswith('#'): continue # test if the line is a comments
(# is the character to signal it)
tup = line.split('=',1) # split the line, the delimiter is the
sign =
columns = [int(i) for i in
tup[1].strip().split(';')[0].strip().split(',')] # creat a list who
contains
# the columns we want to use
delimiter = tup[1].strip().split(';')[1].strip() # check the
delimiter of the data file (generally space or coma)
if not len(delimiter): delimiter = None
params[tup[0].strip()] = { 'columns' : columns, 'delimiter' :
delimiter}
# Read the data files (only the columns ask in the params file)
debut_data = 1
data = []
for namefile in params.iterkeys():
import readcol2 #import the function to read the files
#data.append(readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=1,arraytype='character'))
params[namefile]['data'] =
readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=debut_data,arraytype='character')
# Read another times the data files to have all the lines!
# Question: like it's a dictionnary are we sure that the file are in the
same order... Check it!!!!!!!!!
if len(params.keys()) == 2:
namefile,data,delimiter = [],[],[]
for keys in params.iterkeys():
namefile.append(keys)
data.append(params[keys]['data'])
delim = params[keys]['delimiter']
if delim != None:
delimiter.append(params[keys]['delimiter'])
else:
delimiter.append(' ')
#res_a = []
#res_b = []
f1_ini = file(namefile[0]).readlines()[debut_data:]
f2_ini = file(namefile[1]).readlines()[debut_data:]
#f1_ini = [line for line in file(namefile[0])][debut_data:]
#f2_ini = [line for line in file(namefile[1])][debut_data:]
f1=open('cross'+namefile[0],'w')
f2=open('cross'+namefile[1],'w')
f3=open('pastecross'+namefile[0]+namefile[1],'w')
b_i = 0
for a_i in range(data[0].shape[0]):
for b_i in range(b_i,data[1].shape[0]):
if numarray.all(data[0][a_i,:] == data[1][b_i,:]):
f1.write(f1_ini[a_i])
f2.write(f2_ini[b_i])
f3.write(f1_ini[a_i].strip()+delimiter[0]+'
'+string.replace(f2_ini[b_i],delimiter[1],delimiter[0]))
del f2_ini[b_i]
break
#res_a.append(a_i)
#res_b.append(b_i)
f1.close()
f2.close()
f3.close()
else:
print "too much file: only two allowed for the moment"
#save the results in 3 files: 2 with the common objects from each file.
# one with a paste of the lines of the 2 initial files.
-----------------------------------------------------------------------
#cross_name2.py
---------------------------------------------------------------------
#/usr/bin/env python
'''
Software to cross correlate two files. To use it you had to file a
params file
who contains the information of the file you want to correlate.
The information must have the format:
namefile = list of column ; delimiter
example:
file1 = 1,2,3 ;
file2 = 20,19,21 ; ,
no delimiter = blanck
'''
# there are a big problem of efficiency. The software is far to long
with big file like SDSS.
# I had to find where is the problem
import sys
import numarray
import string
#read the params file
params = {}
for line in file(sys.argv[1],'rU'):
line = line.strip() # delete the end of line (\n on unix)
if not len(line): continue # is line empty do nothing and pass to
the next line
if line.startswith('#'): continue # test if the line is a comments
(# is the character to signal it)
tup = line.split('=',1) # split the line, the delimiter is the
sign =
columns = [int(i) for i in
tup[1].strip().split(';')[0].strip().split(',')] # creat a list who
contains
# the columns we want to use
delimiter = tup[1].strip().split(';')[1].strip() # check the
delimiter of the data file (generally space or coma)
if not len(delimiter): delimiter = None
params[tup[0].strip()] = { 'columns' : columns, 'delimiter' :
delimiter}
# Read the data files (only the columns ask in the params file)
debut_data = 1
data = []
for namefile in params.iterkeys():
import readcol2 #import the function to read the files
#data.append(readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=1,arraytype='character'))
params[namefile]['data'] =
readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=debut_data,arraytype='list')
# Read another times the data files to have all the lines!
# Question: like it's a dictionnary are we sure that the file are in the
same order... Check it!!!!!!!!!
if len(params.keys()) == 2:
namefile,data,delimiter = [],[],[]
for keys in params.iterkeys():
namefile.append(keys)
data.append(params[keys]['data'])
delim = params[keys]['delimiter']
if delim != None:
delimiter.append(params[keys]['delimiter'])
else:
delimiter.append(' ')
#res_a = []
#res_b = []
f1_ini = file(namefile[0]).readlines()[debut_data:]
f2_ini = file(namefile[1]).readlines()[debut_data:]
#f1_ini = [line for line in file(namefile[0])][debut_data:]
#f2_ini = [line for line in file(namefile[1])][debut_data:]
f1=open('cross'+namefile[0],'w')
f2=open('cross'+namefile[1],'w')
f3=open('pastecross'+namefile[0]+namefile[1],'w')
# i=0
# for a_i in range(len(data[0])):
# #print data[0][a_i,:]
# for b_i in range(len(data[1])):
# if data[0][a_i] == data[1][b_i]:
# print data[0][a_i],data[1][b_i]
# i+=1
# print i
# break
b_i=0
for a_i in range(len(data[0])):
for b_i in range(b_i,len(data[1])):
if data[0][a_i] == data[1][b_i]:
f1.write(f1_ini[a_i])
f2.write(f2_ini[b_i])
f3.write(f1_ini[a_i].strip()+delimiter[0]+'
'+string.replace(f2_ini[b_i],delimiter[1],delimiter[0]))
del f2_ini[b_i]
break
#res_a.append(a_i)
#res_b.append(b_i)
f1.close()
f2.close()
f3.close()
else:
print "too much file: only two allowed for the moment"
#save the results in 3 files: 2 with the common objects from each file.
# one with a paste of the lines of the 2 initial files.