Hi, I think there are a problem with numarray (not sure). I'm trying to correlate two differents file to find the same object in both. To do this I wrote some ugly software and I'm using the readcol2.py to read the file in a numarray, numarray string or list format. The cross_name.py is doing the cross correlation when I'm using the numarray string format. I'm using three parameters at differents columns and I compare all of them with something like: numarray.all(a[i,:] == b[j,:]) I saw that my script is very very slow or to be more precise became to be slow. It's seems ok at the beginning but little by little is slow down by a huge amount. I let it turn all the week end and it found ~40 000 objects (both files are ~200000 lines...) in common in two days. I change the software to use the list in python and in some minutes I'have ~20 000 objects found in common. So I think there are a big problem probably: 1) in my script, perhaps 2) in numarray or 3) in both. I hope to have explain the problem clearly ... N. ps: I print an output for the script cross_name.py to visually see the slow down and that appeard to became slow around the 700 objects in common but it's gradully decline. def readcol(fname,comments='%',columns=None,delimiter=None,dep=0,arraytype='list'): """ Load ASCII data from fname into an array and return the array. The data must be regular, same number of values in every row fname can be a filename or a file handle. Input: - Fname : the name of the file to read Optionnal input: - comments : a string to indicate the charactor to delimit the domments. the default is the matlab character '%'. - columns : list or tuple ho contains the columns to use. - delimiter : a string to delimit the columns - dep : an integer to indicate from which line you want to begin to use the file (useful to avoid the descriptions lines) - arraytype : a string to indicate which kind of array you want ot have: numeric array (numeric) or character array (numstring) or list (list). By default it's the list mode used matfile data is not currently supported, but see Nigel Wade's matfile ftp://ion.le.ac.uk/matfile/matfile.tar.gz Example usage: x,y = transpose(readcol('test.dat')) # data in two columns X = readcol('test.dat') # a matrix of data x = readcol('test.dat') # a single column of data x = readcol('test.dat,'#') # the character use like a comment delimiter is '#' """ from numarray import array,transpose fh = file(fname) X = [] numCols = None nline = 0 if columns is None: for line in fh: nline += 1 if dep is not None and nline <= dep: continue line = line[:line.find(comments)].strip() if not len(line): continue if arraytype=='numeric': row = [float(val) for val in line.split(delimiter)] else: row = [val.strip() for val in line.split(delimiter)] thisLen = len(row) if numCols is not None and thisLen != numCols: raise ValueError('All rows must have the same number of columns') X.append(row) else: for line in fh: nline +=1 if dep is not None and nline <= dep: continue line = line[:line.find(comments)].strip() if not len(line): continue row = line.split(delimiter) if arraytype=='numeric': row = [float(row[i-1]) for i in columns] elif arraytype=='numstring': row = [row[i-1].strip() for i in columns] else: row = [row[i-1].strip() for i in columns] thisLen = len(row) if numCols is not None and thisLen != numCols: raise ValueError('All rows must have the same number of columns') X.append(row) if arraytype=='numeric': X = array(X) r,c = X.shape if r==1 or c==1: X.shape = max([r,c]), elif arraytype == 'numstring': import numarray.strings # pb si numeric+pylab X = numarray.strings.array(X) r,c = X.shape if r==1 or c==1: X.shape = max([r,c]), return X #/usr/bin/env python ''' Software to cross correlate two files. To use it you had to file a params file who contains the information of the file you want to correlate. The information must have the format: namefile = list of column ; delimiter example: file1 = 1,2,3 ; file2 = 20,19,21 ; , no delimiter = blanck ''' # there are a big problem of efficiency. The software is far to long with big file like SDSS. # I had to find where is the problem import sys import numarray import string #read the params file params = {} for line in file(sys.argv[1],'rU'): line = line.strip() # delete the end of line (\n on unix) if not len(line): continue # is line empty do nothing and pass to the next line if line.startswith('#'): continue # test if the line is a comments (# is the character to signal it) tup = line.split('=',1) # split the line, the delimiter is the sign = columns = [int(i) for i in tup[1].strip().split(';')[0].strip().split(',')] # creat a list who contains # the columns we want to use delimiter = tup[1].strip().split(';')[1].strip() # check the delimiter of the data file (generally space or coma) if not len(delimiter): delimiter = None params[tup[0].strip()] = { 'columns' : columns, 'delimiter' : delimiter} # Read the data files (only the columns ask in the params file) debut_data = 1 data = [] for namefile in params.iterkeys(): import readcol2 #import the function to read the files #data.append(readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=1,arraytype='character')) params[namefile]['data'] = readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=debut_data,arraytype='character') # Read another times the data files to have all the lines! # Question: like it's a dictionnary are we sure that the file are in the same order... Check it!!!!!!!!! if len(params.keys()) == 2: namefile,data,delimiter = [],[],[] for keys in params.iterkeys(): namefile.append(keys) data.append(params[keys]['data']) delim = params[keys]['delimiter'] if delim != None: delimiter.append(params[keys]['delimiter']) else: delimiter.append(' ') #res_a = [] #res_b = [] f1_ini = file(namefile[0]).readlines()[debut_data:] f2_ini = file(namefile[1]).readlines()[debut_data:] #f1_ini = [line for line in file(namefile[0])][debut_data:] #f2_ini = [line for line in file(namefile[1])][debut_data:] f1=open('cross'+namefile[0],'w') f2=open('cross'+namefile[1],'w') f3=open('pastecross'+namefile[0]+namefile[1],'w') for a_i in range(data[0].shape[0]): for b_i in range(data[1].shape[0]): if numarray.all(data[0][a_i,:] == data[1][b_i,:]): f1.write(f1_ini[a_i]) f2.write(f2_ini[b_i]) f3.write(f1_ini[a_i].strip()+delimiter[0]+' '+string.replace(f2_ini[b_i],delimiter[1],delimiter[1])) del f2_ini[b_i] break #res_a.append(a_i) #res_b.append(b_i) f1.close() f2.close() f3.close() else: print "too much file: only two allowed for the moment" #save the results in 3 files: 2 with the common objects from each file. # one with a paste of the lines of the 2 initial files. #/usr/bin/env python ''' Software to cross correlate two files. To use it you had to file a params file who contains the information of the file you want to correlate. The information must have the format: namefile = list of column ; delimiter example: file1 = 1,2,3 ; file2 = 20,19,21 ; , no delimiter = blanck ''' # there are a big problem of efficiency. The software is far to long with big file like SDSS. # I had to find where is the problem import sys import numarray import string #read the params file params = {} for line in file(sys.argv[1],'rU'): line = line.strip() # delete the end of line (\n on unix) if not len(line): continue # is line empty do nothing and pass to the next line if line.startswith('#'): continue # test if the line is a comments (# is the character to signal it) tup = line.split('=',1) # split the line, the delimiter is the sign = columns = [int(i) for i in tup[1].strip().split(';')[0].strip().split(',')] # creat a list who contains # the columns we want to use delimiter = tup[1].strip().split(';')[1].strip() # check the delimiter of the data file (generally space or coma) if not len(delimiter): delimiter = None params[tup[0].strip()] = { 'columns' : columns, 'delimiter' : delimiter} # Read the data files (only the columns ask in the params file) debut_data = 1 data = [] for namefile in params.iterkeys(): import readcol2 #import the function to read the files #data.append(readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=1,arraytype='character')) params[namefile]['data'] = readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=debut_data,arraytype='list') # Read another times the data files to have all the lines! # Question: like it's a dictionnary are we sure that the file are in the same order... Check it!!!!!!!!! if len(params.keys()) == 2: namefile,data,delimiter = [],[],[] for keys in params.iterkeys(): namefile.append(keys) data.append(params[keys]['data']) delim = params[keys]['delimiter'] if delim != None: delimiter.append(params[keys]['delimiter']) else: delimiter.append(' ') #res_a = [] #res_b = [] f1_ini = file(namefile[0]).readlines()[debut_data:] f2_ini = file(namefile[1]).readlines()[debut_data:] #f1_ini = [line for line in file(namefile[0])][debut_data:] #f2_ini = [line for line in file(namefile[1])][debut_data:] f1=open('cross'+namefile[0],'w') f2=open('cross'+namefile[1],'w') f3=open('pastecross'+namefile[0]+namefile[1],'w') # i=0 # for a_i in range(len(data[0])): # #print data[0][a_i,:] # for b_i in range(len(data[1])): # if data[0][a_i] == data[1][b_i]: # print data[0][a_i],data[1][b_i] # i+=1 # print i # break for a_i in range(len(data[0])): for b_i in range(len(data[1])): if data[0][a_i] == data[1][b_i]: f1.write(f1_ini[a_i]) f2.write(f2_ini[b_i]) f3.write(f1_ini[a_i].strip()+delimiter[0]+' '+string.replace(f2_ini[b_i],delimiter[1],delimiter[1])) del f2_ini[b_i] break #res_a.append(a_i) #res_b.append(b_i) f1.close() f2.close() f3.close() else: print "too much file: only two allowed for the moment" #save the results in 3 files: 2 with the common objects from each file. # one with a paste of the lines of the 2 initial files.
participants (1)
-
Nicolas Gruel