
Looking at this atrocious piece of C: http://svn.scipy.org/svn/scipy/trunk/scipy/signal/lfilter.c.src I got some inspiration for reusing some of my other C code to reimplement scipy.signal.lfilter. I basically just had to write this small C function + a Cython wrapper: static void lfilter_1d(double *restrict b, double *restrict a, double *x, double *y, double *zi, npy_intp K, npy_intp N) { double Z[K]; double *restrict pz = zi, *restrict z = Z; register npy_intp n, k; for (n=0; n<N; n++) { register double yn, xn = x[n]; yn = y[n] = b[0] * xn + pz[0]; for (k=0; k<K-1; k++) z[k] = b[k+1] * xn + pz[k+1] - a[k+1]*yn; z[K-1] = b[K]*xn - a[K]*yn; double *tmp = pz; pz = z; z = tmp; } if (pz != zi) memcpy(zi, pz, K*sizeof(double)); } Relative speed compared to scipy.signal.lfilter: One processor: 1D array, 1000000 floats: 150% 2D array, 1000 x 1000 floats, axis=0: 80% 2D array, 1000 x 1000 floats, axis=1: 150% It is faster than lfilter except for 2D with axis=0. For 2D with axis=1 it is probably faster than lfilter due to copy-in copy-out optimization. For arrays with more than one dimension, multithreading helps as well: Four processors: 1D array, 1000000 floats: 150% 2D array, 1000 x 1000 floats, axis=0: 160% 2D array, 1000 x 1000 floats, axis=1: 250% Multithreading obviously does not help for filtering single vector, as there is no work to share. Some other improvements over signal.lfilter, apart from readable Cython code: * GIL is released. The C code is not littered with Python C API calls that prevents this. * Filtering can be done in reverse, i.e. no need for fliplr or flipud for filtering backwards. * Arrays can be filtered in-place. * Cleaned up docstring. Still it only work as a "proof of concept" with double precision floats. Adding support for other dtypes would be easy, just replace double with any of: float float _Complex double _Complex long double long double _Complex Regards, Sturla Molden # Copyright (C) 2009 Sturla Molden import numpy as np cimport numpy as np cdef extern int _linear_filter( object a, object b, object x, object y, object z, int axis, int direction) def lfilter(object B, object A, object X, object zi=None, object axis = None, int direction=1, object inplace=False): """ Filter data along one-dimension with an IIR or FIR filter. Description =========== Filter a data sequence, x, using a digital filter. This works for many fundamental data types (including Object type). The filter is a direct form II transposed implementation of the standard difference equation (see "Algorithm"). Inputs: ======= b : vector-like The numerator coefficient vector in a 1-D sequence. a : vector-like The denominator coefficient vector in a 1-D sequence. If a[0] is not 1, then both a and b are normalized by a[0]. x : array-like An N-dimensional input array. axis : int or None The axis of the input data array along which to apply the linear filter. The filter is applied to each subarray along this axis. If axis is None, the filter is applied to x flattened. Default: axis=None. zi : array-like Initial conditions for the filter delays. It is a vector (or array of vectors for an N-dimensional input) of length max(len(a),len(b)). If zi=None or is not given then initial rest is assumed. If axis is None, zi should be a vector of length K = max(M,N), where M=len(b)-1 and N=len(a)-1. If an axis is not None, zi should either be a vector of length K, or an array with same shape as x, except for zi.shape[axis] which should be K. If zi is a vector, the same initial conditions are applied to each vector along the specified axis. Default: zi=None (initial rest). See signal.lfiltic for more information. direction : int If negative, the filter is applied in reverse direction along the axis. Default: direction=1 (filter forward). inplace : bool If True, allow x or zi to be modified inplace if possible. Default: False. Outputs: (y, {zf}) ======= y : ndarray The output of the digital filter. zf : ndarray If zi is None, this is not returned, otherwise, zf holds the final filter delay values. Algorithm: ========== The filter function is implemented as a direct II transposed structure. This means that the filter implements a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[nb]*x[n-nb] - a[1]*y[n-1] - ... - a[na]*y[n-na] using the following difference equations: y[m] = b[0]*x[m] + z[0,m-1] z[0,m] = b[1]*x[m] + z[1,m-1] - a[1]*y[m] ... z[n-3,m] = b[n-2]*x[m] + z[n-2,m-1] - a[n-2]*y[m] z[n-2,m] = b[n-1]*x[m] - a[n-1]*y[m] where m is the output sample number and n=max(len(a),len(b)) is the model order. The rational transfer function describing this filter in the z-transform domain is -1 -nb b[0] + b[1]z + ... + b[nb] z Y(z) = ---------------------------------- X(z) -1 -na a[0] + a[1]z + ... + a[na] z """ cdef np.ndarray b, a, x, y, z cdef np.npy_intp i, ierr cdef double tmp cdef object K, zshape, iaxis # get filter coeffs b, a A = np.asarray(A) B = np.asarray(B) if (A.ndim != 1) or (B.ndim != 1): raise ValueError, 'a and b must be vectors' K = max(len(A)-1,len(B)-1) b = np.zeros(K+1) a = np.zeros(K+1) a[:A.shape[0]] = A[:] b[:B.shape[0]] = B[:] # normalize by a[0] if a[0] != 1.0: tmp = a[0] for i in range(a.shape[0]): a[i] /= tmp b[i] /= tmp # set up input signal X = np.asarray(X, dtype=np.float64) if axis is None: X = X.ravel() elif type(axis) != int: raise ValueError, 'axis must be None or an integer >= 0' elif axis < 0: raise ValueError, 'axis must be None or an integer >= 0' x = X # set up output signal if inplace: y = X else: y = np.zeros(X.shape, dtype=np.float64) # set up filter delay buffer iaxis = 0 if (axis is None) else axis if zi is None: # zi not specified, assume initial rest zshape = list(X.shape) zshape[iaxis] = K z = np.zeros(zshape, dtype=np.float64) else: zi = np.asarray(zi, dtype=np.float64) # x and zi are both vectors if (x.ndim == 1) and (zi.ndim == 1): if (zi.shape[0] != K): raise ValueError, 'length of zi must be K' if inplace: z = zi else: z = zi.copy() # x is nd array, zi is vector # broadcast zi (cannot modify inplace) elif (x.ndim > 1) and (zi.ndim == 1): if (zi.shape[0] != K): raise ValueError, 'length of zi must be K' zshape = list(X.shape) zshape[iaxis] = K z = np.zeros(zshape, dtype=np.float64) zshape = [1] * X.ndim zshape[iaxis] = K z = z + zi.reshape(zshape) # x and zi are both nd arrays else: zshape = list(X.shape) zshape[iaxis] = K zshape = tuple(zshape) if (zi.shape != zshape): raise ValueError, ('bad shape of zi: expected %r, got %r' % (zshape,zi.shape)) if inplace: z = zi else: z = zi.copy() # apply the linear filter ierr = _linear_filter(a, b, x, y, z, <int> iaxis, direction) # check for error conditions on return if ierr == -1: raise MemoryError, 'malloc returned NULL' if ierr == -2: raise ValueError, 'shape of x, y, and z did not match... should never happen, debug please.' if ierr == -3: raise ValueError, 'shape of a and b did not match... should never happen, debug please.' # return y or (y, z) depending on zi # return (y if (zi is None) else (y, z)) ## Cython fails on this if (zi is None): return y else: return (y,z) /* Copyright (C) 2009 Sturla Molden */ #include <Python.h> #include <numpy/arrayobject.h> #include <string.h> #include <stdlib.h> typedef PyArrayObject ndarray; /* PyArrayObject is an ugly name */ typedef Py_ssize_t integer; /* Py_ssize_t of int for 64 bit support, npy_intp is typedef'ed incorrectly. */ /* copy data to and from temporary work arrays */ static void copy_in(double *restrict y, double *restrict x, integer n, integer s) { integer i; char *tmp; if (s == sizeof(double)) memcpy(y, x, n*sizeof(double)); else { tmp = (char *)x; for (i=0; i<n; i++) { *y++ = *x; tmp += s; x = (double *)tmp; } } } static void copy_out(double *restrict y, double *restrict x, integer n, integer s) { integer i; char *tmp; if (s == sizeof(double)) memcpy(y, x, n*sizeof(double)); else { tmp = (char *)y; for (i=0; i<n; i++) { *y = *x++; tmp += s; y = (double *)tmp; } } } /* this computes the start adress for every vector along a dimension (axis) of an ndarray */ inline char *datapointer(ndarray *a, integer *indices) { char *data = a->data; int i; integer j; for (i=0; i < a->nd; i++) { j = indices[i]; data += j * a->strides[i]; } return data; } static double ** _get_axis_pointer(integer *indices, int axis, ndarray *a, double **axis_ptr_array, int dim) { /* recursive axis pointer search for 4 dimensions or more */ integer i; double *axis_ptr; if (dim == a->nd) { /* done recursing dimensions, compute address of this axis */ axis_ptr = (double *) datapointer(a, indices); *axis_ptr_array = axis_ptr; return (axis_ptr_array + 1); } else if (dim == axis) { /* use first element only */ indices[dim] = 0; axis_ptr_array = _get_axis_pointer(indices, axis, a, axis_ptr_array, dim+1); return axis_ptr_array; } else { /* iterate range of indices */ integer length = a->dimensions[dim]; for (i=0; i < length; i++) { indices[dim] = i; axis_ptr_array = _get_axis_pointer(indices, axis, a, axis_ptr_array, dim+1); } return axis_ptr_array; } } static double **get_axis_pointers(ndarray *a, int axis, integer *count) { integer indices[a->nd]; double **out, **tmp; integer i, j; j = 1; for (i=0; i < a->nd; i++) { if (i != axis) j *= a->dimensions[i]; } *count = j; out = (double **) malloc(*count * sizeof(double*)); if (out == NULL) { *count = 0; return NULL; } tmp = out; switch (a->nd) { /* for one dimension we just return the data pointer */ case 1: *tmp = (double *)a->data; break; /* specialized for two dimensions */ case 2: switch (axis) { case 0: for (i=0; i<a->dimensions[1]; i++) *tmp++ = (double *)(a->data + i*a->strides[1]); break; case 1: for (i=0; i<a->dimensions[0]; i++) *tmp++ = (double *)(a->data + i*a->strides[0]); break; } break; /* specialized for three dimensions */ case 3: switch (axis) { case 0: for (i=0; i<a->dimensions[1]; i++) for (j=0; j<a->dimensions[2]; j++) *tmp++ = (double *)(a->data + i*a->strides[1] + j*a->strides[2]); break; case 1: for (i=0; i<a->dimensions[0]; i++) for (j=0; j<a->dimensions[2]; j++) *tmp++ = (double *)(a->data + i*a->strides[0] + j*a->strides[2]); break; case 2: for (i=0; i<a->dimensions[0]; i++) for (j=0; j<a->dimensions[1]; j++) *tmp++ = (double *)(a->data + i*a->strides[0] + j*a->strides[1]); } break; /* four or more dimensions: use recursion */ default: for (i=0; i<a->nd; i++) indices[i] = 0; _get_axis_pointer(indices, axis, a, out, 0); } done: return out; } /* applies a linear filter along one dimension */ static void lfilter_1d( double *restrict b, double *restrict a, double *x, double *y, double *zi, integer K, integer N) { double Z[K]; double *restrict pz = zi, *restrict z = Z; register integer n, k; for (n=0; n<N; n++) { register double yn, xn = x[n]; yn = y[n] = b[0] * xn + pz[0]; for (k=0; k<K-1; k++) z[k] = b[k+1] * xn + pz[k+1] - a[k+1]*yn; z[K-1] = b[K]*xn - a[K]*yn; double *tmp = pz; pz = z; z = tmp; } if (pz != zi) memcpy(zi, pz, K*sizeof(double)); } static void rev_lfilter_1d( double *restrict b, double *restrict a, double *restrict x, double *restrict y, double *zi, integer K, integer N) { double Z[K]; double *restrict pz = zi, *restrict z = Z; register integer n, k; for (n=N-1; n >= 0; n--) { register double yn, xn = x[n]; yn = y[n] = b[0] * xn + pz[0]; for (k=0; k<K-1; k++) z[k] = b[k+1]*xn + pz[k+1] - a[k+1]*yn; z[K-1] = b[K]*xn - a[K]*yn; double *tmp = pz; pz = z; z = tmp; } if (pz != zi) memcpy(zi, pz, K*sizeof(double)); } typedef void (*lfilterfun_t)(double *restrict b, double *restrict a, double *restrict x, double *restrict y, double *zi, integer K, integer N); int _linear_filter(ndarray *a, ndarray *b, ndarray *x, ndarray *y, ndarray *z, int axis, int direction) { int retval = 0; double *wx, *wy, *wz, *data_a, *data_b; integer xcount, ycount, zcount; integer xstride, ystride, zstride; double *xdata, *ydata, *zdata; double **x_axis_ptrs = NULL, **y_axis_ptrs = NULL, **z_axis_ptrs = NULL; lfilterfun_t lfilter; integer i, K, N; /* release the GIL */ Py_BEGIN_ALLOW_THREADS /* sanity check on a and b (these should never fail) */ if (a->nd != 1) goto error_ab; if (b->nd != 1) goto error_ab; if (b->dimensions[0] != a->dimensions[0]) goto error_ab; data_a = (double *)a->data; data_b = (double *)b->data; K = a->dimensions[0] - 1; /* sanity ckeck on x, y and z */ if ((x->nd != y->nd) || (y->nd != z->nd)) goto error_xyz; for (int d=0; d < a->nd; d++) { if (d == axis) continue; if ((x->dimensions[d] != y->dimensions[d]) || (y->dimensions[d] != z->dimensions[d])) goto error_xyz; } N = x->dimensions[axis]; /* extract axis pointers */ x_axis_ptrs = get_axis_pointers(x, axis, &xcount); y_axis_ptrs = get_axis_pointers(y, axis, &ycount); z_axis_ptrs = get_axis_pointers(z, axis, &zcount); /* sanity check */ if (!x_axis_ptrs) goto error_malloc; if (!y_axis_ptrs) goto error_malloc; if (!z_axis_ptrs) goto error_malloc; if ((xcount != ycount)||(ycount != zcount)) goto error_xyz; /* all ok, we can start ... */ lfilter = direction < 0 ? rev_lfilter_1d : lfilter_1d; xstride = x->strides[axis]; ystride = y->strides[axis]; zstride = z->strides[axis]; #pragma omp parallel \ firstprivate(lfilter, N, K, xcount, x_axis_ptrs, y_axis_ptrs, z_axis_ptrs, \ xstride, ystride, zstride, data_b, data_a) \ private(i, xdata, ydata, zdata, wx, wy, wz) \ shared(retval) \ default(none) { if ((xstride==sizeof(double)) && (ystride==sizeof(double)) && (xstride==sizeof(double))) { #pragma omp for schedule(static) for (i=0; i < xcount; i++) { xdata = x_axis_ptrs[i]; ydata = y_axis_ptrs[i]; zdata = z_axis_ptrs[i]; /* strictly speaking we could be aliasing xdata and ydata here, but it should not matter. */ lfilter(data_b, data_a, xdata, ydata, zdata, K, N); } } else { /* axis is strided, use copy-in copy-out */ wx = (double *)malloc((2*N+K)*sizeof(double)); if (!wx) { #pragma omp critical retval = -1; /* error_malloc */ } #pragma omp barrier #pragma omp flush(retval) if (retval < 0) /* malloc failed in some thread */ goto filtering_complete; wy = wx + N; wz = wy + N; #pragma omp for schedule(static) for (i=0; i < xcount; i++) { /* get pointe to the first element of the axis */ xdata = x_axis_ptrs[i]; ydata = y_axis_ptrs[i]; zdata = z_axis_ptrs[i]; /* copy to local contiguous buffers */ copy_in(wx, xdata, N, xstride); copy_in(wz, zdata, K, zstride); /* filter */ lfilter(data_b, data_a, wx, wy, wz, K, N); /* copy data back */ copy_out(ydata, wy, N, ystride); copy_out(zdata, wz, K, zstride); } filtering_complete: if (wx) free(wx); } } /* we are done... */ goto done; /* error conditions */ error_ab: retval = -3; goto done; error_xyz: retval = -2; goto done; error_malloc: retval = -1; /* clean up and exit */ done: if (x_axis_ptrs) free(x_axis_ptrs); if (y_axis_ptrs) free(y_axis_ptrs); if (z_axis_ptrs) free(z_axis_ptrs); /* get the GIL back and return */ Py_END_ALLOW_THREADS return retval; }