[Scipy-svn] r2235 - trunk/Lib/stats

Tue Sep 26 12:59:29 EDT 2006

Author: rkern
Date: 2006-09-26 11:59:28 -0500 (Tue, 26 Sep 2006)
New Revision: 2235

Modified:
   trunk/Lib/stats/kde.py
   trunk/Lib/stats/stats.py
Log:
sum -> np.sum

Modified: trunk/Lib/stats/kde.py
===================================================================

--- trunk/Lib/stats/kde.py	2006-09-26 15:25:09 UTC (rev 2234)
+++ trunk/Lib/stats/kde.py	2006-09-26 16:59:28 UTC (rev 2235)
@@ -17,63 +17,60 @@
 #
 #-------------------------------------------------------------------------------
 
-#-------------------------------------------------------------------------------
-# Imports:
-#-------------------------------------------------------------------------------
-
+# Standard library imports.
 import warnings
 
+# Scipy imports.
 from scipy import linalg, special
-from numpy.oldnumeric import atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt, \
-     ravel, Float, take, power, atleast_1d, squeeze, transpose
+from numpy import atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt, \
+     ravel, power, atleast_1d, squeeze, sum, transpose
 from numpy.random import randint, multivariate_normal
 
+# Local imports.
 import stats
 import mvn
 
 __all__ = ['gaussian_kde',
-          ]
+]
 
-#-------------------------------------------------------------------------------
-# 'gaussian_kde' class:
-#-------------------------------------------------------------------------------
 
 class gaussian_kde(object):
-    """
+    """ Representation of a kernel-density estimate using Gaussian kernels.
+
     Parameters
     ----------
     dataset : (# of dims, # of data)-array
-      datapoints to estimate from
+        datapoints to estimate from
 
     Members
     -------
     d : int
-      number of dimensions
+        number of dimensions
     n : int
-      number of datapoints
+        number of datapoints
 
     Methods
     -------
     kde.evaluate(points) : array
-      evaluate the estimated pdf on a provided set of points
+        evaluate the estimated pdf on a provided set of points
     kde(points) : array
-      same as kde.evaluate(points)
+        same as kde.evaluate(points)
     kde.integrate_gaussian(mean, cov) : float
-      multiply pdf with a specified gaussian and integrate over the whole domain
+        multiply pdf with a specified Gaussian and integrate over the whole domain
     kde.integrate_box_1d(low, high) : float
-      integrate pdf (1D only) between two bounds
+        integrate pdf (1D only) between two bounds
     kde.integrate_box(low_bounds, high_bounds) : float
-      integrate pdf over a rectangular space between low_bounds and high_bounds
+        integrate pdf over a rectangular space between low_bounds and high_bounds
     kde.integrate_kde(other_kde) : float
-      integrate two kernel density estimates multiplied together
+        integrate two kernel density estimates multiplied together
 
    Internal Methods
    ----------------
     kde.covariance_factor() : float
-      computes the coefficient that multiplies the data covariance matrix to
-      obtain the kernel covariance matrix. Set this method to kde.scotts_factor
-      or kde.silverman_factor (or subclass to provide your own). The default is
-      scotts_factor.
+        computes the coefficient that multiplies the data covariance matrix to
+        obtain the kernel covariance matrix. Set this method to
+        kde.scotts_factor or kde.silverman_factor (or subclass to provide your
+        own). The default is scotts_factor.
     """
 
     def __init__(self, dataset):
@@ -82,11 +79,26 @@
         self.d, self.n = self.dataset.shape
 
         self._compute_covariance()
-
+        
+        
     def evaluate(self, points):
         """Evaluate the estimated pdf on a set of points.
 
-        points.shape == (# of dimensions, # of points)
+        Parameters
+        ----------
+        points : (# of dimensions, # of points)-array
+            Alternatively, a (# of dimensions,) vector can be passed in and
+            treated as a single point.
+
+        Returns
+        -------
+        values : (# of points,)-array
+            The values at each point.
+
+        Raises
+        ------
+        ValueError if the dimensionality of the input points is different than
+        the dimensionality of the KDE.
         """
 
         points = atleast_2d(points).astype(self.dataset.dtype)
@@ -96,6 +108,7 @@
             if d == 1 and m == self.d:
                 # points was passed in as a row vector
                 points = reshape(points, (self.d, 1))
+                m = 1
             else:
                 msg = "points have dimension %s, dataset has dimension %s" % (d,
                     self.d)
@@ -126,21 +139,25 @@
     __call__ = evaluate
 
     def integrate_gaussian(self, mean, cov):
-        """Multiply estimated density by a multivariate gaussian and integrate
+        """Multiply estimated density by a multivariate Gaussian and integrate
         over the wholespace.
 
         Parameters
         ----------
         mean : vector
-          the mean of the gaussian
+            the mean of the Gaussian
         cov : matrix
-          the covariance matrix of the gaussian
+            the covariance matrix of the Gaussian
 
         Returns
         -------
         result : scalar
-          the value of the integral
+            the value of the integral
 
+        Raises
+        ------
+        ValueError if the mean or covariance of the input Gaussian differs from
+        the KDE's dimensionality.
         """
 
         mean = atleast_1d(squeeze(mean))
@@ -170,19 +187,23 @@
         Parameters
         ----------
         low : scalar
-          lower bound of integration
+            lower bound of integration
         high : scalar
-          upper bound of integration
+            upper bound of integration
 
         Returns
         -------
         value : scalar
-          the result of the integral
+            the result of the integral
+
+        Raises
+        ------
+        ValueError if the KDE is over more than one dimension.
         """
         if self.d != 1:
             raise ValueError("integrate_box_1d() only handles 1D pdfs")
 
-        stdev = ravel(sqrt(self.covariance))[0]
+        stdev = np.ravel(sqrt(self.covariance))[0]
 
         normalized_low = ravel((low - self.dataset)/stdev)
         normalized_high = ravel((high - self.dataset)/stdev)
@@ -198,16 +219,16 @@
         Parameters
         ----------
         low_bounds : vector
-          lower bounds of integration
+            lower bounds of integration
         high_bounds : vector
-          upper bounds of integration
+            upper bounds of integration
         maxpts=None : int
-          maximum number of points to use for integration
+            maximum number of points to use for integration
 
         Returns
         -------
         value : scalar
-          the result of the integral
+            the result of the integral
         """
         if maxpts is not None:
             extra_kwds = {'maxpts': maxpts}
@@ -229,13 +250,17 @@
 
         Parameters
         ----------
-        other : GaussianKDE instance
-          the other kde
+        other : gaussian_kde instance
+            the other kde
 
         Returns
         -------
         value : scalar
-          the result of the integral
+            the result of the integral
+
+        Raises
+        ------
+        ValueError if the KDEs have different dimensionality.
         """
 
         if other.d != self.d:
@@ -268,22 +293,24 @@
 
         Parameters
         ----------
-        size : int=None
-          if None, then size = self.n; otherwise, the number of samples to draw
+        size : int, optional
+            The number of samples to draw.
+            If not provided, then the size is the same as the underlying
+            dataset.
 
         Returns
         -------
         dataset : (self.d, size)-array
-          sampled dataset
+            sampled dataset
         """
 
         if size is None:
             size = self.n
 
-        norm = transpose(multivariate_normal(zeros((self.d,), Float),
+        norm = transpose(multivariate_normal(zeros((self.d,), float),
             self.covariance, size=size))
         indices = randint(0, self.n, size=size)
-        means = take(self.dataset, indices, axis=1)
+        means = self.dataset[:,indices]
 
         return means + norm
 
@@ -294,13 +321,16 @@
     def silverman_factor(self):
         return power(self.n*(self.d+2.0)/4.0, -1./(self.d+4))
 
+    # This can be replaced with silverman_factor if one wants to use Silverman's
+    # rule for choosing the bandwidth of the kernels.
     covariance_factor = scotts_factor
 
     def _compute_covariance(self):
-        """Computes the covariance matrix for each gaussian kernel using
+        """Computes the covariance matrix for each Gaussian kernel using
         covariance_factor
         """
         self.factor = self.covariance_factor()
         self.covariance = atleast_2d(stats.cov(self.dataset, rowvar=1) *
             self.factor * self.factor)
         self.inv_cov = linalg.inv(self.covariance)
+        self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n

Modified: trunk/Lib/stats/stats.py
===================================================================
--- trunk/Lib/stats/stats.py	2006-09-26 15:25:09 UTC (rev 2234)
+++ trunk/Lib/stats/stats.py	2006-09-26 16:59:28 UTC (rev 2235)
@@ -246,7 +246,7 @@
     x, axis = _chk_asarray(x,axis)
     x = x.copy()
     Norig = x.shape[axis]
-    factor = 1.0-sum(np.isnan(x),axis)*1.0/Norig
+    factor = 1.0-np.sum(np.isnan(x),axis)*1.0/Norig
 
     x[np.isnan(x)] = 0
     return np.mean(x,axis)/factor
@@ -1727,7 +1727,7 @@
     f_obs = asarray(f_obs)
     k = len(f_obs)
     if f_exp is None:
-        f_exp = array([sum(f_obs,axis=0)/float(k)] * len(f_obs),float)
+        f_exp = array([np.sum(f_obs,axis=0)/float(k)] * len(f_obs),float)
     f_exp = f_exp.astype(float)
     chisq = np.add.reduce((f_obs-f_exp)**2 / f_exp)
     return chisq, chisqprob(chisq, k-1)
@@ -1790,7 +1790,7 @@
     ranked = rankdata(np.concatenate((x,y)))
     rankx = ranked[0:n1]       # get the x-ranks
     #ranky = ranked[n1:]        # the rest are y-ranks
-    u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx,axis=0)  # calc U for x
+    u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx,axis=0)  # calc U for x
     u2 = n1*n2 - u1                            # remainder is U for y
     bigu = max(u1,u2)
     smallu = min(u1,u2)
@@ -1841,7 +1841,7 @@
     ranked = rankdata(alldata)
     x = ranked[:n1]
     y = ranked[n1:]
-    s = sum(x,axis=0)
+    s = np.sum(x,axis=0)
     expected = n1*(n1+n2+1) / 2.0
     z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
     prob = 2*(1.0 -zprob(abs(z)))
@@ -1871,10 +1871,10 @@
         del ranked[0:n[i]]
     rsums = []
     for i in range(len(args)):
-        rsums.append(sum(args[i],axis=0)**2)
+        rsums.append(np.sum(args[i],axis=0)**2)
         rsums[i] = rsums[i] / float(n[i])
-    ssbn = sum(rsums,axis=0)
-    totaln = sum(n,axis=0)
+    ssbn = np.sum(rsums,axis=0)
+    totaln = np.sum(n,axis=0)
     h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
     df = len(args) - 1
     if T == 0:
@@ -1902,7 +1902,7 @@
     data = data.astype(float)
     for i in range(len(data)):
         data[i] = rankdata(data[i])
-    ssbn = sum(sum(args,1)**2,axis=0)
+    ssbn = np.sum(np.sum(args,1)**2,axis=0)
     chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)
     return chisq, chisqprob(chisq,k-1)
 
@@ -1991,7 +1991,7 @@
     if len(p) == 2:  # ttest_ind
         c = array([1,-1])
         df = n-2
-        fact = sum(1.0/sum(x,0),axis=0)  # i.e., 1/n1 + 1/n2 + 1/n3 ...
+        fact = np.sum(1.0/np.sum(x,0),axis=0)  # i.e., 1/n1 + 1/n2 + 1/n3 ...
         t = dot(c,b) / np.sqrt(s_sq*fact)
         probs = betai(0.5*df,0.5,float(df)/(df+t*t))
         return t, probs
@@ -2074,7 +2074,7 @@
 Returns: the square of the sum over axis.
 """
     a, axis = _chk_asarray(a, axis)
-    s = sum(a,axis)
+    s = np.sum(a,axis)
     if not np.isscalar(s):
         return s.astype(float)*s
     else: