
Ralf Gommers skrev:
Sure, I know. The docs I linked have improvements over svn. If you merge your changes, there'll be a conflict and those docs will not get merged soon - I was hoping to avoid that.
I also wanted to draw your attention to the doc wiki, since in this case you redid work that was already done by someone else on the wiki before. Hopefully this will save you some effort next time.
Well, I changed all the code to C++, to use the std::complex type, std::vector instead of malloc, and templates for specializing the filter functions to all dtypes supported by lfilter. I'm quite happy with the way the C++ looks :-) It seems to be faster than signal.lfilter in most circumstances, except one (annoyingly the most common). The most extreme case were complex64 dtype with inplace filtering, for which I got 369% speed improvement. Here are some timings (speed improvement over signal.lfilter in percent): <type 'numpy.float32'> axis=0, shape=(1000000,), speed: 240 axis=0, shape=(1000, 1000), speed: 149 axis=1, shape=(1000, 1000), speed: 238 <type 'numpy.float64'> axis=0, shape=(1000000,), speed: 151 axis=0, shape=(1000, 1000), speed: 93 axis=1, shape=(1000, 1000), speed: 146 <type 'numpy.complex64'> axis=0, shape=(1000000,), speed: 297 axis=0, shape=(1000, 1000), speed: 204 axis=1, shape=(1000, 1000), speed: 292 <type 'numpy.complex128'> axis=0, shape=(1000000,), speed: 209 axis=0, shape=(1000, 1000), speed: 137 axis=1, shape=(1000, 1000), speed: 193 Inplace filter: <type 'numpy.float32'> axis=0, shape=(1000000,), speed: 291 axis=0, shape=(1000, 1000), speed: 182 axis=1, shape=(1000, 1000), speed: 300 <type 'numpy.float64'> axis=0, shape=(1000000,), speed: 227 axis=0, shape=(1000, 1000), speed: 137 axis=1, shape=(1000, 1000), speed: 228 <type 'numpy.complex64'> axis=0, shape=(1000000,), speed: 369 axis=0, shape=(1000, 1000), speed: 257 axis=1, shape=(1000, 1000), speed: 345 <type 'numpy.complex128'> axis=0, shape=(1000000,), speed: 288 axis=0, shape=(1000, 1000), speed: 179 axis=1, shape=(1000, 1000), speed: 276 To build on Windows I did this, using GCC 4.4.1 from http://www.equation.com/servlet/equation.cmd?fa=fortran g++ -c -O3 -ffast-math -msse3 -march=core2 -Ic:/Python26/include -Ic:/Python26/Lib/site-packages/numpy/core/include _linear_filter.cpp cython.py linear_filter.pyx gcc -c -O2 -ffast-math -msse3 -march=core2 -Ic:/Python26/include -Ic:/Python26/Lib/site-packages/numpy/core/include linear_filter.c g++ -shared -o linear_filter.pyd -Lc:/Python26/libs _linear_filter.o linear_filter.o -lpython26 -lmsvcr90 Best regards, Sturla Molden /* Copyright (C) 2009 Sturla Molden */ #include <vector> #include <complex> typedef std::complex<float> complex_float; typedef std::complex<double> complex_double; typedef std::complex<long double> complex_long_double; #include <Python.h> #include <numpy/arrayobject.h> #include <cstring> typedef PyArrayObject ndarray; /* PyArrayObject is an ugly name */ typedef Py_ssize_t integer; /* Py_ssize_t of int for 64 bit support, npy_intp is typedef'ed incorrectly. */ /* copy data to and from temporary work arrays */ template<class T> void copy_in(T *y, T *x, integer n, integer s) { integer i; char *tmp; if (s == sizeof(T)) std::memcpy(y, x, n*sizeof(T)); else { tmp = (char *)x; for (i=0; i<n; i++) { *y++ = *x; tmp += s; x = (T *)tmp; } } } template<class T> void copy_out(T *y, T *x, integer n, integer s) { integer i; char *tmp; if (s == sizeof(T)) std::memcpy(y, x, n*sizeof(T)); else { tmp = (char *)y; for (i=0; i<n; i++) { *y = *x++; tmp += s; y = (T *)tmp; } } } /* this computes the start adress for every vector along a dimension (axis) of an ndarray */ template<class T> inline T *datapointer(ndarray *a, integer *indices, T dummy) { char *data = a->data; int i; integer j; for (i=0; i < a->nd; i++) { j = indices[i]; data += j * a->strides[i]; } return (T *)data; } template<class T> T ** _get_axis_pointer(integer *indices, int axis, ndarray *a, T **axis_ptr_array, int dim, T dummy) { /* recursive axis pointer search for 4 dimensions or more */ integer i; T *axis_ptr; if (dim == a->nd) { /* done recursing dimensions, compute address of this axis */ axis_ptr = datapointer(a, indices, dummy); *axis_ptr_array = axis_ptr; return (axis_ptr_array + 1); } else if (dim == axis) { /* use first element only */ indices[dim] = 0; axis_ptr_array = _get_axis_pointer(indices, axis, a, axis_ptr_array, dim+1, dummy); return axis_ptr_array; } else { /* iterate range of indices */ integer length = a->dimensions[dim]; for (i=0; i < length; i++) { indices[dim] = i; axis_ptr_array = _get_axis_pointer(indices, axis, a, axis_ptr_array, dim+1, dummy); } return axis_ptr_array; } } template<class T> std::vector<T*> get_axis_pointers(ndarray *a, int axis, integer *count, T dummy) { std::vector<integer> indices(a->nd); integer i, j; j = 1; for (i=0; i < a->nd; i++) { if (i != axis) j *= a->dimensions[i]; } *count = j; std::vector<T*> out(*count); T **tmp = &out[0]; switch (a->nd) { /* for one dimension we just return the data pointer */ case 1: *tmp = (T *)a->data; break; /* specialized for two dimensions */ case 2: switch (axis) { case 0: for (i=0; i<a->dimensions[1]; i++) *tmp++ = (T *)(a->data + i*a->strides[1]); break; case 1: for (i=0; i<a->dimensions[0]; i++) *tmp++ = (T *)(a->data + i*a->strides[0]); break; } break; /* specialized for three dimensions */ case 3: switch (axis) { case 0: for (i=0; i<a->dimensions[1]; i++) for (j=0; j<a->dimensions[2]; j++) *tmp++ = (T *)(a->data + i*a->strides[1] + j*a->strides[2]); break; case 1: for (i=0; i<a->dimensions[0]; i++) for (j=0; j<a->dimensions[2]; j++) *tmp++ = (T *)(a->data + i*a->strides[0] + j*a->strides[2]); break; case 2: for (i=0; i<a->dimensions[0]; i++) for (j=0; j<a->dimensions[1]; j++) *tmp++ = (T *)(a->data + i*a->strides[0] + j*a->strides[1]); } break; /* four or more dimensions: use recursion */ default: for (i=0; i<a->nd; i++) indices[i] = 0; integer *pind = &indices[0]; _get_axis_pointer(pind, axis, a, tmp, 0, (T)0); } return out; } /* applies a linear filter along one dimension */ template<class T> static void lfilter_1d( const T *b, const T *a, T *x, T *y, T *zi, integer K, integer N) { std::vector<T> Z(K); T *pz = zi, *z = &Z[0]; register integer n, k; for (n=0; n<N; n++) { register T yn, xn = x[n]; yn = y[n] = b[0] * xn + pz[0]; for (k=0; k<K-1; k++) z[k] = b[k+1] * xn + pz[k+1] - a[k+1]*yn; z[K-1] = b[K]*xn - a[K]*yn; T *tmp = pz; pz = z; z = tmp; } if (pz != zi) std::memcpy(zi, pz, K*sizeof(T)); } template<class T> static void rev_lfilter_1d( const T *b, const T *a, T *x, T *y, T *zi, integer K, integer N) { std::vector<T> Z(K); T *pz = zi, *z = &Z[0]; register integer n, k; for (n=N-1; n >= 0; n--) { register T yn, xn = x[n]; yn = y[n] = b[0] * xn + pz[0]; for (k=0; k<K-1; k++) z[k] = b[k+1]*xn + pz[k+1] - a[k+1]*yn; z[K-1] = b[K]*xn - a[K]*yn; T *tmp = pz; pz = z; z = tmp; } if (pz != zi) std::memcpy(zi, pz, K*sizeof(T)); } template<class T> static void apply_filter(ndarray *x, ndarray *y, ndarray *z, T *b, T *a, int axis, int direction, integer N, integer K, T dummy) { int retval = 0; /* extract axis pointers */ integer xcount, ycount, zcount; std::vector<T*> x_axis_ptrs = get_axis_pointers(x, axis, &xcount, (T)0); std::vector<T*> y_axis_ptrs = get_axis_pointers(y, axis, &ycount, (T)0); std::vector<T*> z_axis_ptrs = get_axis_pointers(z, axis, &zcount, (T)0); /* all ok, we can start ... */ integer xstride, ystride, zstride; xstride = x->strides[axis]; ystride = y->strides[axis]; zstride = z->strides[axis]; if ((xstride==sizeof(T)) && (ystride==sizeof(T)) && (xstride==sizeof(T))) { /* axis is contiguous, do not copy */ for (integer i=0; i < xcount; i++) { T *xdata = x_axis_ptrs[i]; T *ydata = y_axis_ptrs[i]; T *zdata = z_axis_ptrs[i]; if (direction < 0) rev_lfilter_1d(b, a, xdata, ydata, zdata, K, N); else lfilter_1d(b, a, xdata, ydata, zdata, K, N); } } else { /* axis is strided, use copy-in copy-out */ std::vector<T> wx(N); std::vector<T> wy(N); std::vector<T> wz(K); for (integer i=0; i < xcount; i++) { /* get pointer to the first element of the axis */ T *xdata = x_axis_ptrs[i]; T *ydata = y_axis_ptrs[i]; T *zdata = z_axis_ptrs[i]; /* copy to local contiguous buffers */ copy_in(&wx[0], xdata, N, xstride); copy_in(&wz[0], zdata, K, zstride); /* filter */ if (direction < 0) rev_lfilter_1d(b, a, &wx[0], &wy[0], &wz[0], K, N); else lfilter_1d(b, a, &wx[0], &wy[0], &wz[0], K, N); /* copy data back */ copy_out(ydata, &wy[0], N, ystride); copy_out(zdata, &wz[0], K, zstride); } } } static int __linear_filter(ndarray *a, ndarray *b, ndarray *x, ndarray *y, ndarray *z, int axis, int direction) { int retval = 0; integer K = a->dimensions[0] - 1; integer N = x->dimensions[axis]; /* proceed to filter */ switch (a->descr->type_num) { case NPY_FLOAT: apply_filter<float>(x, y, z, (float *)(b->data), (float *)(a->data), axis, direction, N, K, (float)0); break; case NPY_DOUBLE: apply_filter(x, y, z, (double *)(b->data), (double *)(a->data), axis, direction, N, K, (double)0); break; case NPY_CFLOAT: apply_filter(x, y, z, (complex_float *)(b->data), (complex_float *)(a->data), axis, direction, N, K, (complex_float)0); break; case NPY_CDOUBLE: apply_filter(x, y, z, (complex_double *)(b->data), (complex_double *)(a->data), axis, direction, N, K, (complex_double)0); break; case NPY_LONGDOUBLE: apply_filter(x, y, z, (long double *)(b->data), (long double *)(a->data), axis, direction, N, K, (long double)0); break; case NPY_CLONGDOUBLE: apply_filter(x, y, z, (complex_long_double *)(b->data), (complex_long_double *)(a->data), axis, direction, N, K, (complex_long_double)0); break; default: retval = -1; } return retval; } extern "C" int _linear_filter(ndarray *a, ndarray *b, ndarray *x, ndarray *y, ndarray *z, int axis, int direction) { int retval = 0; Py_BEGIN_ALLOW_THREADS try { retval = __linear_filter(a, b, x, y, z, axis, direction); } catch (...) { /* possibly out of memory */ retval = -2; } Py_END_ALLOW_THREADS return retval; } # Copyright (C) 2009 Sturla Molden import numpy as np cimport numpy as np cdef extern int _linear_filter( object a, object b, object x, object y, object z, int axis, int direction) def lfilter(object B, object A, object X, object zi=None, object axis = None, int direction=1, object inplace=False): """ Filter data along one-dimension with an IIR or FIR filter. Description =========== Filter a data sequence, x, using a digital filter. This works for many fundamental data types (including Object type). The filter is a direct form II transposed implementation of the standard difference equation (see "Algorithm"). Inputs: ======= b : vector-like The numerator coefficient vector in a 1-D sequence. a : vector-like The denominator coefficient vector in a 1-D sequence. If a[0] is not 1, then both a and b are normalized by a[0]. x : array-like An N-dimensional input array. axis : int or None The axis of the input data array along which to apply the linear filter. The filter is applied to each subarray along this axis. If axis is None, the filter is applied to x flattened. Default: axis=None. zi : array-like Initial conditions for the filter delays. It is a vector (or array of vectors for an N-dimensional input) of length max(len(a),len(b)). If zi=None or is not given then initial rest is assumed. If axis is None, zi should be a vector of length K = max(M,N), where M=len(b)-1 and N=len(a)-1. If an axis is not None, zi should either be a vector of length K, or an array with same shape as x, except for zi.shape[axis] which should be K. If zi is a vector, the same initial conditions are applied to each vector along the specified axis. Default: zi=None (initial rest). See signal.lfiltic for more information. direction : int If negative, the filter is applied in reverse direction along the axis. Default: direction=1 (filter forward). inplace : bool If True, allow x or zi to be modified inplace if possible. Default: False. Outputs: (y, {zf}) ======= y : ndarray The output of the digital filter. zf : ndarray If zi is None, this is not returned, otherwise, zf holds the final filter delay values. Algorithm: ========== The filter function is implemented as a direct II transposed structure. This means that the filter implements a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[nb]*x[n-nb] - a[1]*y[n-1] - ... - a[na]*y[n-na] using the following difference equations: y[m] = b[0]*x[m] + z[0,m-1] z[0,m] = b[1]*x[m] + z[1,m-1] - a[1]*y[m] ... z[n-3,m] = b[n-2]*x[m] + z[n-2,m-1] - a[n-2]*y[m] z[n-2,m] = b[n-1]*x[m] - a[n-1]*y[m] where m is the output sample number and n=max(len(a),len(b)) is the model order. The rational transfer function describing this filter in the z-transform domain is -1 -nb b[0] + b[1]z + ... + b[nb] z Y(z) = ---------------------------------- X(z) -1 -na a[0] + a[1]z + ... + a[na] z """ cdef np.ndarray b, a, x, y, z cdef np.npy_intp i, ierr cdef object tmp, K, zshape, iaxis, dtype X = np.asarray(X) dtype = X.dtype # get filter coeffs b, a A = np.asarray(A, dtype=dtype) B = np.asarray(B, dtype=dtype) if (A.ndim != 1) or (B.ndim != 1): raise ValueError, 'a and b must be vectors' K = max(len(A)-1,len(B)-1) b = np.zeros(K+1, dtype=dtype) a = np.zeros(K+1, dtype=dtype) a[:A.shape[0]] = A[:] b[:B.shape[0]] = B[:] # normalize by a[0] tmp = a[0] a = a / tmp b = b / tmp # set up input signal if axis is None: X = X.ravel() elif type(axis) != int: raise ValueError, 'axis must be None or an integer >= 0' elif axis < 0: raise ValueError, 'axis must be None or an integer >= 0' x = X # set up output signal if inplace: y = X else: y = np.zeros(X.shape, dtype=dtype) # set up filter delay buffer iaxis = 0 if (axis is None) else axis if zi is None: # zi not specified, assume initial rest zshape = list(X.shape) zshape[iaxis] = K z = np.zeros(zshape, dtype=dtype) else: zi = np.asarray(zi, dtype=np.float64) # x and zi are both vectors if (x.ndim == 1) and (zi.ndim == 1): if (zi.shape[0] != K): raise ValueError, 'length of zi must be K' if inplace: z = zi else: z = zi.copy() # x is nd array, zi is vector # broadcast zi (cannot modify inplace) elif (x.ndim > 1) and (zi.ndim == 1): if (zi.shape[0] != K): raise ValueError, 'length of zi must be K' zshape = list(X.shape) zshape[iaxis] = K z = np.zeros(zshape, dtype=dtype) zshape = [1] * X.ndim zshape[iaxis] = K z = z + zi.reshape(zshape) # x and zi are both nd arrays else: zshape = list(X.shape) zshape[iaxis] = K zshape = tuple(zshape) if (zi.shape != zshape): raise ValueError, ('bad shape of zi: expected %r, got %r' % (zshape,zi.shape)) if inplace: z = zi else: z = zi.copy() # apply the linear filter ierr = _linear_filter(a, b, x, y, z, <int> iaxis, direction) # check for error conditions on return if ierr == -2: raise MemoryError, 'C++ exception trapped, possibly out of memory.' if ierr == -1: raise TypeError, ('dtype %r not supported' % (a.dtype)) # return y or (y, z) depending on zi # return (y if (zi is None) else (y, z)) ## Cython fails on this if (zi is None): return y else: return (y,z)