Please have a look at version1 and version2. What are my other options
here? Do I need to go the cython route here? Thanks, Siegfried
==
My array is as follows (shown here for dummy values; and yes this kind
of arrays do exist: 150 observations x 8 years x 366 days x 24 hours x
7 model levels):
data = np.random.random((150,8,366,24,7))
My function "np.apply_along_axis(my_moment,4,data)" takes 15 minutes.
I thought making use of masked arrays "my_moment_fast(data,axis=4)"
will speed up things but
1. It blows up the memory consumption to 6 GB at times
and
2. It also takes... I do not know as I killed it after 20 minutes (it
hangs at the median print statement).
The calculation of the median is the bottleneck here.
==
import numpy as np
def my_moment(data_in,nan_val=-999.0):
tmp = data_in[np.where(data_in<>nan_val)]
erg = np.array([np.min(tmp),np.mean(tmp),np.median(tmp),\
np.max(tmp),np.std(tmp),np.size(tmp)])
return erg
def my_moment_fast(data_in,nan_val=-999.0,axis=4):
import numpy as np
print 'min max',np.min(data_in),np.max(data_in)
mx = np.ma.masked_where((data_in<=0.0)&(data_in<=nan_val), data_in)
print 'shape mx',np.shape(mx),np.min(mx),np.max(mx)
print 'min'
tmp_min = np.ma.min(mx,axis=axis)
print 'max'
tmp_max = np.ma.max(mx,axis=axis)
print 'mean'
tmp_mean = np.ma.mean(mx,axis=axis)
print 'median'
#tmp_median = np.ma.sort(mx,axis=axis)
tmp_median = np.ma.median(mx,axis=axis)
print 'std'
tmp_std = np.ma.std(mx,axis=axis)
print 'N'
tmp_N = np.ones(np.shape(mx))
tmp_N[mx.mask] = 0.0e0
tmp_N = np.ma.sum(tmp_N,axis=axis)
print 'shape min',np.shape(tmp_min),np.min(tmp_min),np.max(tmp_min)
print 'shape max',np.shape(tmp_max),np.min(tmp_max),np.max(tmp_min)
print 'shape mean',np.shape(tmp_mean),np.min(tmp_mean),np.max(tmp_mean)
print 'shape median', np.shape(tmp_median), np.min(tmp_median),
np.max(tmp_median)
print 'shape std',np.shape(tmp_std),np.min(tmp_std),np.max(tmp_std)
print 'shape N', np.shape(tmp_N), np.min(tmp_N), np.max(tmp_N),
np.shape(mx.mask)
return np.array([tmp_min,tmp_mean,tmp_median,tmp_max,tmp_std,tmp_N])
data = np.random.random((150,8,366,24,7))
data[134,5,300,:,2] = -999.0
data[14,3,300,:,0] = -999.0
version1 = my_moment_fast(data,axis=4)
exit()
version2 = np.apply_along_axis(my_moment,4,data)
==
What am I doing wrong here? I haven't tested it againts Fortran and
have got no idea if sorting for fetching the median would be faster.
Thanks,
Siegfried
==
--
The University of Edinburgh is a charitable body, registered in
Scotland, with registration number SC005336.