[Scipy-svn] r4602 - trunk/scipy/stats/models
scipy-svn at scipy.org
scipy-svn at scipy.org
Tue Aug 5 18:56:24 EDT 2008
Author: tom.waite
Date: 2008-08-05 17:56:21 -0500 (Tue, 05 Aug 2008)
New Revision: 4602
Added:
trunk/scipy/stats/models/_bspline.py
Log:
rename bspline to make private. part of weave to extension conversion.
Copied: trunk/scipy/stats/models/_bspline.py (from rev 4600, trunk/scipy/stats/models/bspline.py)
===================================================================
--- trunk/scipy/stats/models/bspline.py 2008-08-04 20:27:24 UTC (rev 4600)
+++ trunk/scipy/stats/models/_bspline.py 2008-08-05 22:56:21 UTC (rev 4602)
@@ -0,0 +1,657 @@
+'''
+Bspines and smoothing splines.
+
+General references:
+
+ Craven, P. and Wahba, G. (1978) "Smoothing noisy data with spline functions.
+ Estimating the correct degree of smoothing by
+ the method of generalized cross-validation."
+ Numerische Mathematik, 31(4), 377-403.
+
+ Hastie, Tibshirani and Friedman (2001). "The Elements of Statistical
+ Learning." Springer-Verlag. 536 pages.
+
+ Hutchison, M. and Hoog, F. "Smoothing noisy data with spline functions."
+ Numerische Mathematik, 47(1), 99-106.
+'''
+
+import numpy as N
+import numpy.linalg as L
+
+from scipy.linalg import solveh_banded
+from scipy.optimize import golden
+from scipy.stats.models import _bspline
+
+
+# Issue warning regarding heavy development status of this module
+import warnings
+_msg = "The bspline code is technology preview and requires significant work\
+on the public API and documentation. The API will likely change in the future"
+warnings.warn(_msg, UserWarning)
+
+
+def _band2array(a, lower=0, symmetric=False, hermitian=False):
+ """
+ Take an upper or lower triangular banded matrix and return a
+ numpy array.
+
+ INPUTS:
+ a -- a matrix in upper or lower triangular banded matrix
+ lower -- is the matrix upper or lower triangular?
+ symmetric -- if True, return the original result plus its transpose
+ hermitian -- if True (and symmetric False), return the original
+ result plus its conjugate transposed
+
+ """
+
+ n = a.shape[1]
+ r = a.shape[0]
+ _a = 0
+
+ if not lower:
+ for j in range(r):
+ _b = N.diag(a[r-1-j],k=j)[j:(n+j),j:(n+j)]
+ _a += _b
+ if symmetric and j > 0: _a += _b.T
+ elif hermitian and j > 0: _a += _b.conjugate().T
+ else:
+ for j in range(r):
+ _b = N.diag(a[j],k=j)[0:n,0:n]
+ _a += _b
+ if symmetric and j > 0: _a += _b.T
+ elif hermitian and j > 0: _a += _b.conjugate().T
+ _a = _a.T
+
+ return _a
+
+
+def _upper2lower(ub):
+ """
+ Convert upper triangular banded matrix to lower banded form.
+
+ INPUTS:
+ ub -- an upper triangular banded matrix
+
+ OUTPUTS: lb
+ lb -- a lower triangular banded matrix with same entries
+ as ub
+ """
+
+ lb = N.zeros(ub.shape, ub.dtype)
+ nrow, ncol = ub.shape
+ for i in range(ub.shape[0]):
+ lb[i,0:(ncol-i)] = ub[nrow-1-i,i:ncol]
+ lb[i,(ncol-i):] = ub[nrow-1-i,0:i]
+ return lb
+
+def _lower2upper(lb):
+ """
+ Convert lower triangular banded matrix to upper banded form.
+
+ INPUTS:
+ lb -- a lower triangular banded matrix
+
+ OUTPUTS: ub
+ ub -- an upper triangular banded matrix with same entries
+ as lb
+ """
+
+ ub = N.zeros(lb.shape, lb.dtype)
+ nrow, ncol = lb.shape
+ for i in range(lb.shape[0]):
+ ub[nrow-1-i,i:ncol] = lb[i,0:(ncol-i)]
+ ub[nrow-1-i,0:i] = lb[i,(ncol-i):]
+ return ub
+
+def _triangle2unit(tb, lower=0):
+ """
+ Take a banded triangular matrix and return its diagonal and the
+ unit matrix: the banded triangular matrix with 1's on the diagonal,
+ i.e. each row is divided by the corresponding entry on the diagonal.
+
+ INPUTS:
+ tb -- a lower triangular banded matrix
+ lower -- if True, then tb is assumed to be lower triangular banded,
+ in which case return value is also lower triangular banded.
+
+ OUTPUTS: d, b
+ d -- diagonal entries of tb
+ b -- unit matrix: if lower is False, b is upper triangular
+ banded and its rows of have been divided by d,
+ else lower is True, b is lower triangular banded
+ and its columns have been divieed by d.
+
+ """
+
+ if lower: d = tb[0].copy()
+ else: d = tb[-1].copy()
+
+ if lower: return d, (tb / d)
+ else:
+ l = _upper2lower(tb)
+ return d, _lower2upper(l / d)
+
+def _trace_symbanded(a, b, lower=0):
+ """
+ Compute the trace(ab) for two upper or banded real symmetric matrices
+ stored either in either upper or lower form.
+
+ INPUTS:
+ a, b -- two banded real symmetric matrices (either lower or upper)
+ lower -- if True, a and b are assumed to be the lower half
+
+
+ OUTPUTS: trace
+ trace -- trace(ab)
+
+ """
+
+ if lower:
+ t = _zero_triband(a * b, lower=1)
+ return t[0].sum() + 2 * t[1:].sum()
+ else:
+ t = _zero_triband(a * b, lower=0)
+ return t[-1].sum() + 2 * t[:-1].sum()
+
+
+def _zero_triband(a, lower=0):
+ """
+ Explicitly zero out unused elements of a real symmetric banded matrix.
+
+ INPUTS:
+ a -- a real symmetric banded matrix (either upper or lower hald)
+ lower -- if True, a is assumed to be the lower half
+
+ """
+
+ nrow, ncol = a.shape
+ if lower:
+ for i in range(nrow): a[i,(ncol-i):] = 0.
+ else:
+ for i in range(nrow): a[i,0:i] = 0.
+ return a
+
+
+class BSpline(object):
+
+ '''
+
+ Bsplines of a given order and specified knots.
+
+ Implementation is based on description in Chapter 5 of
+
+ Hastie, Tibshirani and Friedman (2001). "The Elements of Statistical
+ Learning." Springer-Verlag. 536 pages.
+
+
+ INPUTS:
+ knots -- a sorted array of knots with knots[0] the lower boundary,
+ knots[1] the upper boundary and knots[1:-1] the internal
+ knots.
+ order -- order of the Bspline, default is 4 which yields cubic
+ splines
+ M -- number of additional boundary knots, if None it defaults
+ to order
+ coef -- an optional array of real-valued coefficients for the Bspline
+ of shape (knots.shape + 2 * (M - 1) - order,).
+ x -- an optional set of x values at which to evaluate the
+ Bspline to avoid extra evaluation in the __call__ method
+
+ '''
+
+ def __init__(self, knots, order=4, M=None, coef=None, x=None):
+
+ knots = N.squeeze(N.unique(N.asarray(knots)))
+
+ if knots.ndim != 1:
+ raise ValueError, 'expecting 1d array for knots'
+
+ self.m = order
+ if M is None:
+ M = self.m
+ self.M = M
+
+ self.tau = N.hstack([[knots[0]]*(self.M-1), knots, [knots[-1]]*(self.M-1)])
+
+ self.K = knots.shape[0] - 2
+ if coef is None:
+ self.coef = N.zeros((self.K + 2 * self.M - self.m), N.float64)
+ else:
+ self.coef = N.squeeze(coef)
+ if self.coef.shape != (self.K + 2 * self.M - self.m):
+ raise ValueError, 'coefficients of Bspline have incorrect shape'
+ if x is not None:
+ self.x = x
+
+ def _setx(self, x):
+ self._x = x
+ self._basisx = self.basis(self._x)
+
+ def _getx(self):
+ return self._x
+
+ x = property(_getx, _setx)
+
+ def __call__(self, *args):
+ """
+ Evaluate the BSpline at a given point, yielding
+ a matrix B and return
+
+ B * self.coef
+
+
+ INPUTS:
+ args -- optional arguments. If None, it returns self._basisx,
+ the BSpline evaluated at the x values passed in __init__.
+ Otherwise, return the BSpline evaluated at the
+ first argument args[0].
+
+ OUTPUTS: y
+ y -- value of Bspline at specified x values
+
+ BUGS:
+ If self has no attribute x, an exception will be raised
+ because self has no attribute _basisx.
+
+ """
+
+ if not args:
+ b = self._basisx.T
+ else:
+ x = args[0]
+ b = N.asarray(self.basis(x)).T
+ return N.squeeze(N.dot(b, self.coef))
+
+ def basis_element(self, x, i, d=0):
+ """
+ Evaluate a particular basis element of the BSpline,
+ or its derivative.
+
+ INPUTS:
+ x -- x values at which to evaluate the basis element
+ i -- which element of the BSpline to return
+ d -- the order of derivative
+
+ OUTPUTS: y
+ y -- value of d-th derivative of the i-th basis element
+ of the BSpline at specified x values
+
+ """
+
+ x = N.asarray(x, N.float64)
+ _shape = x.shape
+ if _shape == ():
+ x.shape = (1,)
+ x.shape = (N.product(_shape,axis=0),)
+ if i < self.tau.shape[0] - 1:
+ ## TODO: OWNDATA flags...
+ v = _bspline.evaluate(x, self.tau, self.m, d, i, i+1)
+ else:
+ return N.zeros(x.shape, N.float64)
+
+ if (i == self.tau.shape[0] - self.m):
+ v = N.where(N.equal(x, self.tau[-1]), 1, v)
+ v.shape = _shape
+ return v
+
+ def basis(self, x, d=0, lower=None, upper=None):
+ """
+ Evaluate the basis of the BSpline or its derivative.
+ If lower or upper is specified, then only
+ the [lower:upper] elements of the basis are returned.
+
+ INPUTS:
+ x -- x values at which to evaluate the basis element
+ i -- which element of the BSpline to return
+ d -- the order of derivative
+ lower -- optional lower limit of the set of basis
+ elements
+ upper -- optional upper limit of the set of basis
+ elements
+
+ OUTPUTS: y
+ y -- value of d-th derivative of the basis elements
+ of the BSpline at specified x values
+
+ """
+ x = N.asarray(x)
+ _shape = x.shape
+ if _shape == ():
+ x.shape = (1,)
+ x.shape = (N.product(_shape,axis=0),)
+
+ if upper is None:
+ upper = self.tau.shape[0] - self.m
+ if lower is None:
+ lower = 0
+ upper = min(upper, self.tau.shape[0] - self.m)
+ lower = max(0, lower)
+
+ d = N.asarray(d)
+ if d.shape == ():
+ v = _bspline.evaluate(x, self.tau, self.m, int(d), lower, upper)
+ else:
+ if d.shape[0] != 2:
+ raise ValueError, "if d is not an integer, expecting a jx2 \
+ array with first row indicating order \
+ of derivative, second row coefficient in front."
+
+ v = 0
+ for i in range(d.shape[1]):
+ v += d[1,i] * _bspline.evaluate(x, self.tau, self.m, d[0,i], lower, upper)
+
+ v.shape = (upper-lower,) + _shape
+ if upper == self.tau.shape[0] - self.m:
+ v[-1] = N.where(N.equal(x, self.tau[-1]), 1, v[-1])
+ return v
+
+ def gram(self, d=0):
+ """
+ Compute Gram inner product matrix, storing it in lower
+ triangular banded form.
+
+ The (i,j) entry is
+
+ G_ij = integral b_i^(d) b_j^(d)
+
+ where b_i are the basis elements of the BSpline and (d) is the
+ d-th derivative.
+
+ If d is a matrix then, it is assumed to specify a differential
+ operator as follows: the first row represents the order of derivative
+ with the second row the coefficient corresponding to that order.
+
+ For instance:
+
+ [[2, 3],
+ [3, 1]]
+
+ represents 3 * f^(2) + 1 * f^(3).
+
+ INPUTS:
+ d -- which derivative to apply to each basis element,
+ if d is a matrix, it is assumed to specify
+ a differential operator as above
+
+ OUTPUTS: gram
+ gram -- the matrix of inner products of (derivatives)
+ of the BSpline elements
+
+ """
+
+ d = N.squeeze(d)
+ if N.asarray(d).shape == ():
+ self.g = _bspline.gram(self.tau, self.m, int(d), int(d))
+ else:
+ d = N.asarray(d)
+ if d.shape[0] != 2:
+ raise ValueError, "if d is not an integer, expecting a jx2 \
+ array with first row indicating order \
+ of derivative, second row coefficient in front."
+ if d.shape == (2,):
+ d.shape = (2,1)
+ self.g = 0
+ for i in range(d.shape[1]):
+ for j in range(d.shape[1]):
+ self.g += d[1,i]* d[1,j] * _bspline.gram(self.tau, self.m, int(d[0,i]), int(d[0,j]))
+ self.g = self.g.T
+ self.d = d
+ return N.nan_to_num(self.g)
+
+class SmoothingSpline(BSpline):
+
+ penmax = 30.
+ method = "target_df"
+ target_df = 5
+ default_pen = 1.0e-03
+ optimize = True
+
+ '''
+ A smoothing spline, which can be used to smooth scatterplots, i.e.
+ a list of (x,y) tuples.
+
+ See fit method for more information.
+
+ '''
+
+ def fit(self, y, x=None, weights=None, pen=0.):
+ """
+ Fit the smoothing spline to a set of (x,y) pairs.
+
+ INPUTS:
+ y -- response variable
+ x -- if None, uses self.x
+ weights -- optional array of weights
+ pen -- constant in front of Gram matrix
+
+ OUTPUTS: None
+ The smoothing spline is determined by self.coef,
+ subsequent calls of __call__ will be the smoothing spline.
+
+ ALGORITHM:
+ Formally, this solves a minimization:
+
+ fhat = ARGMIN_f SUM_i=1^n (y_i-f(x_i))^2 + pen * int f^(2)^2
+
+ See Chapter 5 of
+
+ Hastie, Tibshirani and Friedman (2001). "The Elements of Statistical
+ Learning." Springer-Verlag. 536 pages.
+
+ for more details.
+
+ TODO:
+ Should add arbitrary derivative penalty instead of just
+ second derivative.
+ """
+
+ banded = True
+
+ if x is None:
+ x = self._x
+ bt = self._basisx.copy()
+ else:
+ bt = self.basis(x)
+
+ if pen == 0.: # can't use cholesky for singular matrices
+ banded = False
+
+ if x.shape != y.shape:
+ raise ValueError, 'x and y shape do not agree, by default x are \
+ the Bspline\'s internal knots'
+
+ if pen >= self.penmax:
+ pen = self.penmax
+
+
+ if weights is not None:
+ self.weights = weights
+ else:
+ self.weights = 1.
+
+ _w = N.sqrt(self.weights)
+ bt *= _w
+
+ # throw out rows with zeros (this happens at boundary points!)
+
+ mask = N.flatnonzero(1 - N.alltrue(N.equal(bt, 0), axis=0))
+
+ bt = bt[:,mask]
+ y = y[mask]
+
+ self.df_total = y.shape[0]
+
+ bty = N.squeeze(N.dot(bt, _w * y))
+ self.N = y.shape[0]
+
+ if not banded:
+ self.btb = N.dot(bt, bt.T)
+ _g = _band2array(self.g, lower=1, symmetric=True)
+ self.coef, _, self.rank = L.lstsq(self.btb + pen*_g, bty)[0:3]
+ self.rank = min(self.rank, self.btb.shape[0])
+ del(_g)
+ else:
+ self.btb = N.zeros(self.g.shape, N.float64)
+ nband, nbasis = self.g.shape
+ for i in range(nbasis):
+ for k in range(min(nband, nbasis-i)):
+ self.btb[k,i] = (bt[i] * bt[i+k]).sum()
+
+ bty.shape = (1,bty.shape[0])
+ self.pen = pen
+ self.chol, self.coef = solveh_banded(self.btb +
+ pen*self.g,
+ bty, lower=1)
+
+ self.coef = N.squeeze(self.coef)
+ self.resid = y * self.weights - N.dot(self.coef, bt)
+ self.pen = pen
+
+ del(bty); del(mask); del(bt)
+
+ def smooth(self, y, x=None, weights=None):
+
+ if self.method == "target_df":
+ if hasattr(self, 'pen'):
+ self.fit(y, x=x, weights=weights, pen=self.pen)
+ else:
+ self.fit_target_df(y, x=x, weights=weights, df=self.target_df)
+ elif self.method == "optimize_gcv":
+ self.fit_optimize_gcv(y, x=x, weights=weights)
+
+
+ def gcv(self):
+ """
+ Generalized cross-validation score of current fit.
+
+ Craven, P. and Wahba, G. "Smoothing noisy data with spline functions.
+ Estimating the correct degree of smoothing by
+ the method of generalized cross-validation."
+ Numerische Mathematik, 31(4), 377-403.
+ """
+
+ norm_resid = (self.resid**2).sum()
+ return norm_resid / (self.df_total - self.trace())
+
+ def df_resid(self):
+ """
+ Residual degrees of freedom in the fit.
+
+ self.N - self.trace()
+
+ where self.N is the number of observations of last fit.
+ """
+
+ return self.N - self.trace()
+
+ def df_fit(self):
+ """
+ How many degrees of freedom used in the fit?
+
+ self.trace()
+
+ """
+ return self.trace()
+
+ def trace(self):
+ """
+ Trace of the smoothing matrix S(pen)
+
+ TODO: addin a reference to Wahba, and whoever else I used.
+ """
+
+ if self.pen > 0:
+ _invband = _bspline.invband(self.chol.copy())
+ tr = _trace_symbanded(_invband, self.btb, lower=1)
+ return tr
+ else:
+ return self.rank
+
+ def fit_target_df(self, y, x=None, df=None, weights=None, tol=1.0e-03,
+ apen=0, bpen=1.0e-03):
+
+ """
+ Fit smoothing spline with approximately df degrees of freedom
+ used in the fit, i.e. so that self.trace() is approximately df.
+
+ Uses binary search strategy.
+
+ In general, df must be greater than the dimension of the null space
+ of the Gram inner product. For cubic smoothing splines, this means
+ that df > 2.
+
+ INPUTS:
+ y -- response variable
+ x -- if None, uses self.x
+ df -- target degrees of freedom
+ weights -- optional array of weights
+ tol -- (relative) tolerance for convergence
+ apen -- lower bound of penalty for binary search
+ bpen -- upper bound of penalty for binary search
+
+ OUTPUTS: None
+ The smoothing spline is determined by self.coef,
+ subsequent calls of __call__ will be the smoothing spline.
+
+ """
+
+ df = df or self.target_df
+
+ olddf = y.shape[0] - self.m
+
+ if hasattr(self, "pen"):
+ self.fit(y, x=x, weights=weights, pen=self.pen)
+ curdf = self.trace()
+ if N.fabs(curdf - df) / df < tol:
+ return
+ if curdf > df:
+ apen, bpen = self.pen, 2 * self.pen
+ else:
+ apen, bpen = 0., self.pen
+
+ while True:
+
+ curpen = 0.5 * (apen + bpen)
+ self.fit(y, x=x, weights=weights, pen=curpen)
+ curdf = self.trace()
+ if curdf > df:
+ apen, bpen = curpen, 2 * curpen
+ else:
+ apen, bpen = apen, curpen
+ if apen >= self.penmax:
+ raise ValueError, "penalty too large, try setting penmax \
+ higher or decreasing df"
+ if N.fabs(curdf - df) / df < tol:
+ break
+
+ def fit_optimize_gcv(self, y, x=None, weights=None, tol=1.0e-03,
+ brack=(-100,20)):
+ """
+ Fit smoothing spline trying to optimize GCV.
+
+ Try to find a bracketing interval for scipy.optimize.golden
+ based on bracket.
+
+ It is probably best to use target_df instead, as it is
+ sometimes difficult to find a bracketing interval.
+
+ INPUTS:
+ y -- response variable
+ x -- if None, uses self.x
+ df -- target degrees of freedom
+ weights -- optional array of weights
+ tol -- (relative) tolerance for convergence
+ brack -- an initial guess at the bracketing interval
+
+ OUTPUTS: None
+ The smoothing spline is determined by self.coef,
+ subsequent calls of __call__ will be the smoothing spline.
+
+ """
+
+ def _gcv(pen, y, x):
+ self.fit(y, x=x, pen=N.exp(pen))
+ a = self.gcv()
+ return a
+
+ a = golden(_gcv, args=(y,x), brack=bracket, tol=tol)
More information about the Scipy-svn
mailing list