
Sturla Molden skrev:
Well, I changed all the code to C++, to use the std::complex type, std::vector instead of malloc, and templates for specializing the filter functions to all dtypes supported by lfilter. I'm quite happy with the way the C++ looks :-)
I forgot to add support for the object dtype. This also fixes a bug in the current signal.lfilter. This will crash the interpreter: import numpy as np import scipy from scipy.signal import butter, lfilter b,a = butter(4, .25) x = np.random.randn(1000000).astype(object).reshape((1000,1000)) fx = lfilter(b,a,x, axis=1) The C++ version here does not crash on this. :-) I just ripped some code from current lfilter and cleaned it up slightly; I hope that is ok. (Cython was not happy about "cdef object *ptr", so I reluctantly had to put it all in C++.) The reason my code does not crash like current lfilter, is copy-in copy-out for strided vectors. It seems lfilter messes up the strides and then segfaults. <type 'object'> axis=0, shape=(1000000,), speed: 102 axis=0, shape=(1000, 1000), speed: 105 axis=1, shape=(1000, 1000), scipy.signal.lfilter crashes the interpreter!!! Building on Windows: g++ -c -O3 -ffast-math -msse3 -march=core2 -Ic:/Python26/include -Ic:/Python26/Lib/site-packages/numpy/core/include _linear_filter.cpp cython.py --cplus linear_filter.pyx g++ -c -O2 -ffast-math -msse3 -march=core2 -Ic:/Python26/include -Ic:/Python26/Lib/site-packages/numpy/core/include linear_filter.cpp g++ -shared -o linear_filter.pyd -Lc:/Python26/libs _linear_filter.o linear_filter.o -lpython26 -lmsvcr90 Regards, Sturla Molden // Copyright (C) 2009 Sturla Molden // SciPy license #include <vector> #include <complex> typedef std::complex<float> complex_float; typedef std::complex<double> complex_double; typedef std::complex<long double> complex_long_double; #include <Python.h> #include <numpy/arrayobject.h> #include <cstring> typedef PyArrayObject ndarray; /* PyArrayObject is an ugly name */ typedef Py_ssize_t integer; /* Py_ssize_t of int for 64 bit support, npy_intp is typedef'ed incorrectly. */ typedef PyObject *object; /* copy data to and from temporary work arrays */ template<class T> void copy_in(T *y, T *x, integer n, integer s) { integer i; char *tmp; if (s == sizeof(T)) std::memcpy(y, x, n*sizeof(T)); else { tmp = (char *)x; for (i=0; i<n; i++) { *y++ = *x; tmp += s; x = (T *)tmp; } } } template<class T> void copy_out(T *y, T *x, integer n, integer s) { integer i; char *tmp; if (s == sizeof(T)) std::memcpy(y, x, n*sizeof(T)); else { tmp = (char *)y; for (i=0; i<n; i++) { *y = *x++; tmp += s; y = (T *)tmp; } } } /* this computes the start adress for every vector along a dimension (axis) of an ndarray */ template<class T> inline T *datapointer(ndarray *a, integer *indices, T dummy) { char *data = a->data; int i; integer j; for (i=0; i < a->nd; i++) { j = indices[i]; data += j * a->strides[i]; } return (T *)data; } template<class T> T ** _get_axis_pointer(integer *indices, int axis, ndarray *a, T **axis_ptr_array, int dim, T dummy) { /* recursive axis pointer search for 4 dimensions or more */ integer i; T *axis_ptr; if (dim == a->nd) { /* done recursing dimensions, compute address of this axis */ axis_ptr = datapointer(a, indices, dummy); *axis_ptr_array = axis_ptr; return (axis_ptr_array + 1); } else if (dim == axis) { /* use first element only */ indices[dim] = 0; axis_ptr_array = _get_axis_pointer(indices, axis, a, axis_ptr_array, dim+1, dummy); return axis_ptr_array; } else { /* iterate range of indices */ integer length = a->dimensions[dim]; for (i=0; i < length; i++) { indices[dim] = i; axis_ptr_array = _get_axis_pointer(indices, axis, a, axis_ptr_array, dim+1, dummy); } return axis_ptr_array; } } template<class T> std::vector<T*> get_axis_pointers(ndarray *a, int axis, integer *count, T dummy) { std::vector<integer> indices(a->nd); integer i, j; j = 1; for (i=0; i < a->nd; i++) { if (i != axis) j *= a->dimensions[i]; } *count = j; std::vector<T*> out(*count); T **tmp = &out[0]; switch (a->nd) { /* for one dimension we just return the data pointer */ case 1: *tmp = (T *)a->data; break; /* specialized for two dimensions */ case 2: switch (axis) { case 0: for (i=0; i<a->dimensions[1]; i++) *tmp++ = (T *)(a->data + i*a->strides[1]); break; case 1: for (i=0; i<a->dimensions[0]; i++) *tmp++ = (T *)(a->data + i*a->strides[0]); break; } break; /* specialized for three dimensions */ case 3: switch (axis) { case 0: for (i=0; i<a->dimensions[1]; i++) for (j=0; j<a->dimensions[2]; j++) *tmp++ = (T *)(a->data + i*a->strides[1] + j*a->strides[2]); break; case 1: for (i=0; i<a->dimensions[0]; i++) for (j=0; j<a->dimensions[2]; j++) *tmp++ = (T *)(a->data + i*a->strides[0] + j*a->strides[2]); break; case 2: for (i=0; i<a->dimensions[0]; i++) for (j=0; j<a->dimensions[1]; j++) *tmp++ = (T *)(a->data + i*a->strides[0] + j*a->strides[1]); } break; /* four or more dimensions: use recursion */ default: for (i=0; i<a->nd; i++) indices[i] = 0; integer *pind = &indices[0]; _get_axis_pointer(pind, axis, a, tmp, 0, (T)0); } return out; } /* linear filter along one dimension */ template<class T> void lfilter_1d( const T *b, const T *a, T *x, T *y, T *zi, integer K, integer N) { std::vector<T> Z(K); T *pz = zi, *z = &Z[0]; register integer n, k; for (n=0; n<N; n++) { register T yn, xn = x[n]; yn = y[n] = b[0] * xn + pz[0]; for (k=0; k<K-1; k++) z[k] = b[k+1] * xn + pz[k+1] - a[k+1]*yn; z[K-1] = b[K]*xn - a[K]*yn; T *tmp = pz; pz = z; z = tmp; } if (pz != zi) std::memcpy(zi, pz, K*sizeof(T)); } template<class T> void rev_lfilter_1d( const T *b, const T *a, T *x, T *y, T *zi, integer K, integer N) { std::vector<T> Z(K); T *pz = zi, *z = &Z[0]; register integer n, k; for (n=N-1; n >= 0; n--) { register T yn, xn = x[n]; yn = y[n] = b[0] * xn + pz[0]; for (k=0; k<K-1; k++) z[k] = b[k+1]*xn + pz[k+1] - a[k+1]*yn; z[K-1] = b[K]*xn - a[K]*yn; T *tmp = pz; pz = z; z = tmp; } if (pz != zi) std::memcpy(zi, pz, K*sizeof(T)); } /* template specialization for the general object case modified from current scipy/signal/lfilter.c.src */ template<> void lfilter_1d<object>(const object *b, const object *a, object *x, object *y, object *Z, integer K, integer N) { object *ptr_x = x, *ptr_y = y; object *ptr_Z; const object *ptr_b; const object *ptr_a; object *xn, *yn; const object *a0 = a; object tmp1, tmp2, tmp3; integer n; integer k; /* My reference counting might not be right */ for (n = 0; n < N; n++) { ptr_b = b; /* Reset a and b pointers */ ptr_a = a; xn = ptr_x; yn = ptr_y; if (K) { ptr_Z = Z; /* Calculate first delay (output) */ tmp1 = PyNumber_Multiply(*ptr_b, *xn); tmp2 = PyNumber_Divide(tmp1, *a0); tmp3 = PyNumber_Add(tmp2, *ptr_Z); Py_XDECREF(*yn); *yn = tmp3; Py_DECREF(tmp1); Py_DECREF(tmp2); ptr_b++; ptr_a++; /* Fill in middle delays */ for (k = 0; k < K-1; k++) { tmp1 = PyNumber_Multiply(*xn, *ptr_b); tmp2 = PyNumber_Divide(tmp1, *a0); tmp3 = PyNumber_Add(tmp2, ptr_Z[1]); Py_DECREF(tmp1); Py_DECREF(tmp2); tmp1 = PyNumber_Multiply(*yn, *ptr_a); tmp2 = PyNumber_Divide(tmp1, *a0); Py_DECREF(tmp1); Py_XDECREF(*ptr_Z); *ptr_Z = PyNumber_Subtract(tmp3, tmp2); Py_DECREF(tmp2); Py_DECREF(tmp3); ptr_b++; ptr_a++; ptr_Z++; } /* Calculate last delay */ tmp1 = PyNumber_Multiply(*xn, *ptr_b); tmp3 = PyNumber_Divide(tmp1, *a0); Py_DECREF(tmp1); tmp1 = PyNumber_Multiply(*yn, *ptr_a); tmp2 = PyNumber_Divide(tmp1, *a0); Py_DECREF(tmp1); Py_XDECREF(*ptr_Z); *ptr_Z = PyNumber_Subtract(tmp3, tmp2); Py_DECREF(tmp2); Py_DECREF(tmp3); } else { tmp1 = PyNumber_Multiply(*xn, *ptr_b); Py_XDECREF(*yn); *yn = PyNumber_Divide(tmp1, *a0); Py_DECREF(tmp1); } ptr_y++; /* Move to next input/output point */ ptr_x++; } } template<> void rev_lfilter_1d<object>(const object *b, const object *a, object *x, object *y, object *Z, integer K, integer N) { object *ptr_x = x + N - 1, *ptr_y = y + N - 1; object *ptr_Z; const object *ptr_b; const object *ptr_a; object *xn, *yn; const object *a0 = a; object tmp1, tmp2, tmp3; integer n; integer k; /* My reference counting might not be right */ for (n = 0; n < N; n++) { ptr_b = b; /* Reset a and b pointers */ ptr_a = a; xn = ptr_x; yn = ptr_y; if (K) { ptr_Z = Z; /* Calculate first delay (output) */ tmp1 = PyNumber_Multiply(*ptr_b, *xn); tmp2 = PyNumber_Divide(tmp1, *a0); tmp3 = PyNumber_Add(tmp2, *ptr_Z); Py_XDECREF(*yn); *yn = tmp3; Py_DECREF(tmp1); Py_DECREF(tmp2); ptr_b++; ptr_a++; /* Fill in middle delays */ for (k = 0; k < K-1; k++) { tmp1 = PyNumber_Multiply(*xn, *ptr_b); tmp2 = PyNumber_Divide(tmp1, *a0); tmp3 = PyNumber_Add(tmp2, ptr_Z[1]); Py_DECREF(tmp1); Py_DECREF(tmp2); tmp1 = PyNumber_Multiply(*yn, *ptr_a); tmp2 = PyNumber_Divide(tmp1, *a0); Py_DECREF(tmp1); Py_XDECREF(*ptr_Z); *ptr_Z = PyNumber_Subtract(tmp3, tmp2); Py_DECREF(tmp2); Py_DECREF(tmp3); ptr_b++; ptr_a++; ptr_Z++; } /* Calculate last delay */ tmp1 = PyNumber_Multiply(*xn, *ptr_b); tmp3 = PyNumber_Divide(tmp1, *a0); Py_DECREF(tmp1); tmp1 = PyNumber_Multiply(*yn, *ptr_a); tmp2 = PyNumber_Divide(tmp1, *a0); Py_DECREF(tmp1); Py_XDECREF(*ptr_Z); *ptr_Z = PyNumber_Subtract(tmp3, tmp2); Py_DECREF(tmp2); Py_DECREF(tmp3); } else { tmp1 = PyNumber_Multiply(*xn, *ptr_b); Py_XDECREF(*yn); *yn = PyNumber_Divide(tmp1, *a0); Py_DECREF(tmp1); } ptr_y--; /* Move to next input/output point */ ptr_x--; } } /* this function does copy-in copy-out optimization and applies the filter to the signal */ template<class T> static void apply_filter(ndarray *x, ndarray *y, ndarray *z, T *b, T *a, int axis, int direction, integer N, integer K, T dummy) { int retval = 0; /* extract axis pointers */ integer xcount, ycount, zcount; std::vector<T*> x_axis_ptrs = get_axis_pointers(x, axis, &xcount, (T)0); std::vector<T*> y_axis_ptrs = get_axis_pointers(y, axis, &ycount, (T)0); std::vector<T*> z_axis_ptrs = get_axis_pointers(z, axis, &zcount, (T)0); /* all ok, we can start ... */ integer xstride, ystride, zstride; xstride = x->strides[axis]; ystride = y->strides[axis]; zstride = z->strides[axis]; if ((xstride==sizeof(T)) && (ystride==sizeof(T)) && (xstride==sizeof(T))) { /* axis is contiguous, do not copy */ for (integer i=0; i < xcount; i++) { T *xdata = x_axis_ptrs[i]; T *ydata = y_axis_ptrs[i]; T *zdata = z_axis_ptrs[i]; if (direction < 0) rev_lfilter_1d(b, a, xdata, ydata, zdata, K, N); else lfilter_1d(b, a, xdata, ydata, zdata, K, N); } } else { /* axis is strided, use copy-in copy-out */ std::vector<T> wx(N); std::vector<T> wy(N); std::vector<T> wz(K); for (integer i=0; i < xcount; i++) { /* get pointer to the first element of the axis */ T *xdata = x_axis_ptrs[i]; T *ydata = y_axis_ptrs[i]; T *zdata = z_axis_ptrs[i]; /* copy to local contiguous buffers */ copy_in(&wx[0], xdata, N, xstride); copy_in(&wz[0], zdata, K, zstride); /* filter */ if (direction < 0) rev_lfilter_1d(b, a, &wx[0], &wy[0], &wz[0], K, N); else lfilter_1d(b, a, &wx[0], &wy[0], &wz[0], K, N); /* copy data back */ copy_out(ydata, &wy[0], N, ystride); copy_out(zdata, &wz[0], K, zstride); } } } static int __linear_filter(ndarray *a, ndarray *b, ndarray *x, ndarray *y, ndarray *z, int axis, int direction) { int retval = 0; integer K = a->dimensions[0] - 1; integer N = x->dimensions[axis]; /* proceed to filter */ switch (a->descr->type_num) { case NPY_FLOAT: Py_BEGIN_ALLOW_THREADS apply_filter<float>(x, y, z, (float *)(b->data), (float *)(a->data), axis, direction, N, K, (float)0); Py_END_ALLOW_THREADS break; case NPY_DOUBLE: Py_BEGIN_ALLOW_THREADS apply_filter(x, y, z, (double *)(b->data), (double *)(a->data), axis, direction, N, K, (double)0); Py_END_ALLOW_THREADS break; case NPY_CFLOAT: Py_BEGIN_ALLOW_THREADS apply_filter(x, y, z, (complex_float *)(b->data), (complex_float *)(a->data), axis, direction, N, K, (complex_float)0); Py_END_ALLOW_THREADS break; case NPY_CDOUBLE: Py_BEGIN_ALLOW_THREADS apply_filter(x, y, z, (complex_double *)(b->data), (complex_double *)(a->data), axis, direction, N, K, (complex_double)0); Py_END_ALLOW_THREADS break; case NPY_LONGDOUBLE: Py_BEGIN_ALLOW_THREADS apply_filter(x, y, z, (long double *)(b->data), (long double *)(a->data), axis, direction, N, K, (long double)0); Py_END_ALLOW_THREADS break; case NPY_CLONGDOUBLE: Py_BEGIN_ALLOW_THREADS apply_filter(x, y, z, (complex_long_double *)(b->data), (complex_long_double *)(a->data), axis, direction, N, K, (complex_long_double)0); Py_END_ALLOW_THREADS break; case NPY_OBJECT: apply_filter(x, y, z, (object *)(b->data), (object *)(a->data), axis, direction, N, K, (object)0); break; default: retval = -1; /* unsupported type */ } return retval; } extern "C" int _linear_filter(ndarray *a, ndarray *b, ndarray *x, ndarray *y, ndarray *z, int axis, int direction) { int retval = 0; try { retval = __linear_filter(a, b, x, y, z, axis, direction); } catch (int a) { /* Python exception */ retval = -3; } catch (...) { /* possibly out of memory */ retval = -2; } return retval; } # Copyright (C) 2009 Sturla Molden # SciPy license import numpy as np cimport numpy as np cdef extern int _linear_filter( object a, object b, object x, object y, object z, int axis, int direction) def lfilter(object B, object A, object X, object zi=None, object axis = None, int direction=1, object inplace=False): cdef np.ndarray b, a, x, y, z cdef np.npy_intp i, ierr cdef object tmp, K, zshape, iaxis, dtype X = np.asarray(X) dtype = X.dtype # get filter coeffs b, a A = np.asarray(A, dtype=dtype) B = np.asarray(B, dtype=dtype) if (A.ndim != 1) or (B.ndim != 1): raise ValueError, 'a and b must be vectors' K = max(len(A)-1,len(B)-1) b = np.zeros(K+1, dtype=dtype) a = np.zeros(K+1, dtype=dtype) a[:A.shape[0]] = A[:] b[:B.shape[0]] = B[:] # normalize by a[0] tmp = a[0] a = a / tmp b = b / tmp # set up input signal if axis is None: X = X.ravel() elif type(axis) != int: raise ValueError, 'axis must be None or an integer >= 0' elif axis < 0: raise ValueError, 'axis must be None or an integer >= 0' x = X # set up output signal if inplace: y = X else: y = np.zeros(X.shape, dtype=dtype) # set up filter delay buffer iaxis = 0 if (axis is None) else axis if zi is None: # zi not specified, assume initial rest zshape = list(X.shape) zshape[iaxis] = K z = np.zeros(zshape, dtype=dtype) else: zi = np.asarray(zi, dtype=np.float64) # x and zi are both vectors if (x.ndim == 1) and (zi.ndim == 1): if (zi.shape[0] != K): raise ValueError, 'length of zi must be K' if inplace: z = zi else: z = zi.copy() # x is nd array, zi is vector # broadcast zi (cannot modify inplace) elif (x.ndim > 1) and (zi.ndim == 1): if (zi.shape[0] != K): raise ValueError, 'length of zi must be K' zshape = list(X.shape) zshape[iaxis] = K z = np.zeros(zshape, dtype=dtype) zshape = [1] * X.ndim zshape[iaxis] = K z = z + zi.reshape(zshape) # x and zi are both nd arrays else: zshape = list(X.shape) zshape[iaxis] = K zshape = tuple(zshape) if (zi.shape != zshape): raise ValueError, ('bad shape of zi: expected %r, got %r' % (zshape,zi.shape)) if inplace: z = zi else: z = zi.copy() # apply the linear filter ierr = _linear_filter(a, b, x, y, z, <int> iaxis, direction) # check for error conditions on return if ierr == -3: raise Exception, 'Unknown Python exception propagated via C++.' if ierr == -2: raise MemoryError, 'C++ exception trapped, possibly out of memory.' if ierr == -1: raise TypeError, ('dtype %r not supported' % (a.dtype)) # return y or (y, z) depending on zi # return (y if (zi is None) else (y, z)) ## Cython fails on this if (zi is None): return y else: return (y,z)