[Scipy-svn] r5223 - in trunk/scipy/stats: . tests

Thu Dec 4 17:53:44 EST 2008

Author: josef
Date: 2008-12-04 16:53:40 -0600 (Thu, 04 Dec 2008)
New Revision: 5223

Modified:
   trunk/scipy/stats/stats.py
   trunk/scipy/stats/tests/test_stats.py
Log:
new version of ks2_samp without NR, with tests

Modified: trunk/scipy/stats/stats.py
===================================================================

--- trunk/scipy/stats/stats.py	2008-12-04 22:49:32 UTC (rev 5222)
+++ trunk/scipy/stats/stats.py	2008-12-04 22:53:40 UTC (rev 5223)
@@ -2094,39 +2094,71 @@
 
 
 def ks_2samp(data1, data2):
-    """ Computes the Kolmogorov-Smirnof statistic on 2 samples.  Modified
-    from Numerical Recipies in C, page 493.  Returns KS D-value, prob.  Not
-    ufunc- like.
+    """ Computes the Kolmogorov-Smirnof statistic on 2 samples.
 
+    data1, data2: array_like, 1-dim
+        samples assumed to be drawn from a continuous distribution,
+        sample sizes can be different
+    
     Returns: KS D-value, p-value
+
+    Description:
+    ------------
+
+    Tests whether 2 samples are drawn from the same distribution. Note
+    that, like the one-sample K-S test the distribution is assumed to be
+    continuous.
+    
+    This is the two-sided test, one-sided tests are not implemented.
+    The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.
+
+    If the K-S statistic is small or the p-value is high, then we cannot
+    reject the hypothesis that the two distributions of the two samples
+    are the same
+
+    Examples:
+    ---------
+
+    >>> np.random.seed(12345678);
+
+    >>> n1 = 200  # size of first sample
+    >>> n2 = 300  # size of second sample
+
+    # different distribution
+    we can reject the null hypothesis since the pvalue is below 1%
+    >>> rvs1 = stats.norm.rvs(size=n1,loc=0.,scale=1);
+    >>> rvs2 = stats.norm.rvs(size=n2,loc=0.5,scale=1.5)
+    >>> ks_2samp_new(rvs1,rvs2)
+    (0.17333333333333334, 0.0012436147919875644)
+
+    slightly different distribution
+    we cannot reject the null hypothesis since the pvalue is high, 43.8%
+    >>> rvs3 = stats.norm.rvs(size=n2,loc=0.01,scale=1.0)
+    >>> ks_2samp_new(rvs1,rvs3)
+    (0.078333333333333255, 0.4379740175003739)
+
+    identical distribution
+    we cannot reject the null hypothesis since the pvalue is high, 65%
+    >>> rvs4 = stats.norm.rvs(size=n2,loc=0.0,scale=1.0)
+    >>> ks_2samp_new(rvs1,rvs4)
+    (0.066666666666666652, 0.64576306820960394)
+    
     """
     data1, data2 = map(asarray, (data1, data2))
-    j1 = 0    # zeros(data1.shape[1:]) TRIED TO MAKE THIS UFUNC-LIKE
-    j2 = 0    # zeros(data2.shape[1:])
-    fn1 = 0.0 # zeros(data1.shape[1:],float)
-    fn2 = 0.0 # zeros(data2.shape[1:],float)
     n1 = data1.shape[0]
     n2 = data2.shape[0]
-    en1 = n1*1
-    en2 = n2*1
-    d = zeros(data1.shape[1:])
-    data1 = np.sort(data1,0)
-    data2 = np.sort(data2,0)
-    while j1 < n1 and j2 < n2:
-        d1=data1[j1]
-        d2=data2[j2]
-        if d1 <= d2:
-            fn1 = (j1)/float(en1)
-            j1 = j1 + 1
-        if d2 <= d1:
-            fn2 = (j2)/float(en2)
-            j2 = j2 + 1
-        dt = (fn2-fn1)
-        if abs(dt) > abs(d):
-            d = dt
+    n1 = len(data1)
+    n2 = len(data2)
+    data1 = np.sort(data1)
+    data2 = np.sort(data2)
+    data_all = np.concatenate([data1,data2])
+    cdf1 = np.searchsorted(data1,data_all,side='right')/(1.0*n1)
+    cdf2 = (np.searchsorted(data2,data_all,side='right'))/(1.0*n2)
+    d = np.max(np.absolute(cdf1-cdf2))
+    #Note: d absolute not signed distance
+    en = np.sqrt(n1*n2/float(n1+n2))
     try:
-        en = np.sqrt(en1*en2/float(en1+en2))
-        prob = ksprob((en+0.12+0.11/en)*np.fabs(d))
+        prob = ksprob((en+0.12+0.11/en)*d)
     except:
         prob = 1.0
     return d, prob

Modified: trunk/scipy/stats/tests/test_stats.py
===================================================================
--- trunk/scipy/stats/tests/test_stats.py	2008-12-04 22:49:32 UTC (rev 5222)
+++ trunk/scipy/stats/tests/test_stats.py	2008-12-04 22:53:40 UTC (rev 5223)
@@ -1024,5 +1024,35 @@
     assert_almost_equal( np.array(stats.kstest(x,'norm', alternative = 'larger')),
                 np.array((0.0072115233216310994, 0.98531158590396228)), 14)
 
+    #missing: no test that uses *args
+
+
+def test_ks_2samp():
+    #exact small sample solution
+    data1 = np.array([1.0,2.0])
+    data2 = np.array([1.0,2.0,3.0])
+    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
+                np.array((0.33333333333333337, 0.99062316386915694)))
+    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
+                np.array((0.66666666666666674, 0.42490954988801982)))
+    #these can also be verified graphically
+    assert_almost_equal(
+        np.array(stats.ks_2samp(np.linspace(1,100,100),
+                              np.linspace(1,100,100)+2+0.1)),
+        np.array((0.030000000000000027, 0.99999999996005062)))
+    assert_almost_equal(
+        np.array(stats.ks_2samp(np.linspace(1,100,100),
+                              np.linspace(1,100,100)+2-0.1)),
+        np.array((0.020000000000000018, 0.99999999999999933)))
+    #these are just regression tests
+    assert_almost_equal(
+        np.array(stats.ks_2samp(np.linspace(1,100,100),
+                              np.linspace(1,100,110)+20.1)),
+        np.array((0.21090909090909091, 0.015880386730710221)))
+    assert_almost_equal(
+        np.array(stats.ks_2samp(np.linspace(1,100,100),
+                              np.linspace(1,100,110)+20-0.1)),
+        np.array((0.20818181818181825, 0.017981441789762638)))    
+
 if __name__ == "__main__":
     run_module_suite()