''' this contains new kstest, test function for test suite and functions to compare with R, including Monte Carlo ------------------------------------------------------------------------------------------------------------- #simple example comparison to R >>> x = np.linspace(-1,1,9) >>> kstest(x,'norm') (0.15865525393145707, 0.97727773578380395) # comparing with R >>> np.array(kstest(x,'norm')) - np.array((0.15865525393145705, 0.95164069201518386)) array([ 2.77555756e-17, 2.56370438e-02]) >>> x = np.linspace(-15,15,9) >>> kstest(x,'norm') (0.44435602715924361, 0.038850142705171065) # comparing with R >>> np.array(kstest(x,'norm')) - np.array((0.44435602715924361, 0.038850140086788665)) array([ 0.00000000e+00, 2.61838240e-09]) >>> np.random.seed(987654321) >>> x = stats.norm.rvs(loc=0.2, size=100) >>> kstest(x,'norm', alternative = 'smaller') (0.12464329735846891, 0.040989164077641749) >>> kstest(x,'norm', alternative = 'larger') (0.0072115233216311081, 0.98531158590396395) >>> kstest(x,'norm', mode='asymp') (0.12464329735846891, 0.08944488871182088) >>> np.array(kstest(x, 'norm', mode='asymp')) - np.array((0.12464329735846891, 0.089444888711820769 array([ 0.00000000e+00, 1.11022302e-16]) >>> np.array(kstest(x,'norm', alternative = 'smaller')) - np.array((0.12464329735846891, 0.040989164077641749)) array([ 0., 0.]) >>> np.array(kstest(x,'norm', alternative = 'larger')) - np.array((0.0072115233216310994, 0.98531158590396228)) array([ 8.67361738e-18, 1.66533454e-15]) raise attribute error instead of silent replace: with old version >>> np.random.seed(987654321) >>> stats.kstest('norm','',N=100) (0.058051726828984629, 0.49080925022181865) >>> np.random.seed(987654321) >>> stats.kstest('norm','t',(1,),N=100) #silently uses norm not t (0.058051726828984629, 0.49080925022181865) ''' import numpy as np import numpy.testing as npt from scipy import stats, special ##ksprob = special.kolmogorov ##import scipy.stats ##import distributions def kstest(rvs, cdf, args=(), N=20, alternative = 'unequal', mode='approx'): """Return the D-value and the p-value for a Kolmogorov-Smirnov test This performs a test of the distribution of random variables G(x) against a given distribution F(x). Under the null hypothesis the two distributions are identical, G(x)=F(x). The alternative hypothesis can be either 'unequal' (default), 'smaller' or 'larger'. In the two one-sided test, the alternative is that the empirical cumulative distribution function, of the random variable is "smaller" or "larger" then the cumulative distribution function of the hypothesis F(x), G(x)<=F(x), resp. G(x)>=F(x). If the p-value is greater than the significance level (say 5%), then we cannot reject the hypothesis that the data come from the given distribution. Parameters ---------- rvs : string or array or callable string: name of a distribution in scipy.stats array: random variables callable: function to generate random variables, requires keyword argument size cdf : string or callable string: name of a distribution in scipy.stats if rvs is a string then cdf can evaluate to False or be the same as rvs callable: function to evaluate cdf args : distribution parameters used if rvs or cdf are strings N : sample size if rvs is string or callable alternative : 'unequal' (default), 'smaller' or 'larger' defines the alternative hypothesis (see explanation) mode : 'approx' (default) or 'asymp' defines distribution used for calculating p-value 'approx' : use approximation to exact distribution of test statistic 'asymp' : use asymptotic distribution of test statistic Returns ------- D: test statistic either D, D+ or D- p-value Examples -------- >>> x = np.linspace(-15,15,9) >>> kstest(x,'norm') (0.44435602715924361, 0.038850142705171065) >>> np.random.seed(987654321) >>> kstest('norm','',N=100) (0.058352892479417884, 0.88531190944151261) is equivalent to this >>> np.random.seed(987654321) >>> kstest(stats.norm.rvs(size=100),'norm') (0.058352892479417884, 0.88531190944151261) test against one-sided alternative hypothesis --------------------------------------------- >>> np.random.seed(987654321) >>> # shift distribution to larger values, so that cdf_dgp(x)< norm.cdf(x) >>> x = stats.norm.rvs(loc=0.2, size=100) >>> kstest(x,'norm', alternative = 'smaller') (0.12464329735846891, 0.040989164077641749) >>> # reject equal distribution against alternative hypothesis: smaller >>> kstest(x,'norm', alternative = 'larger') (0.0072115233216311081, 0.98531158590396395) >>> # don't reject equal distribution against alternative hypothesis: larger >>> kstest(x,'norm', mode='asymp') (0.12464329735846891, 0.08944488871182088) testing t distributed random variables against normal distribution ------------------------------------------------------------------ >>> np.random.seed(987654321) >>> stats.kstest(stats.t.rvs(100,size=100),'norm') (0.062018929165471248, 0.44505373063343567) >>> np.random.seed(987654321) >>> stats.kstest(stats.t.rvs(3,size=100),'norm') (0.12101689575982888, 0.049143106661937996) """ if isinstance(rvs, basestring): #cdf = getattr(stats, rvs).cdf if (not cdf) or (cdf == rvs): cdf = getattr(stats, rvs).cdf rvs = getattr(stats, rvs).rvs else: raise AttributeError, 'if rvs is string, cdf has to be the same distribution' if isinstance(cdf, basestring): cdf = getattr(stats, cdf).cdf if callable(rvs): kwds = {'size':N} vals = np.sort(rvs(*args,**kwds)) else: vals = np.sort(rvs) N = len(vals) cdfvals = cdf(vals, *args) if alternative in ['unequal', 'larger']: Dplus = (np.arange(1.0, N+1)/N - cdfvals).max() if alternative == 'larger': return Dplus, stats.ksone.sf(Dplus,N) if alternative in ['unequal', 'smaller']: Dmin = (cdfvals - np.arange(0.0, N)/N).max() if alternative == 'smaller': return Dmin, stats.ksone.sf(Dmin,N) if alternative == 'unequal': D = np.max([Dplus,Dmin]) if mode == 'asymp': return D, stats.kstwobign.sf(D*np.sqrt(N)) if mode == 'approx': pval_two = stats.kstwobign.sf(D*np.sqrt(N)) if N > 2666 or pval_two > 0.80 - N*0.3/1000.0 : return D, stats.kstwobign.sf(D*np.sqrt(N)) else: return D, stats.ksone.sf(D,N)*2 def test_kstest(): from numpy.testing import assert_almost_equal # comparing with values from R x = np.linspace(-1,1,9) D,p = kstest(x,'norm') assert_almost_equal( D, 0.15865525393145705, 12) assert_almost_equal( p, 0.95164069201518386, 1) x = np.linspace(-15,15,9) D,p = kstest(x,'norm') assert_almost_equal( D, 0.44435602715924361, 15) assert_almost_equal( p, 0.038850140086788665, 8) # the following tests rely on deterministicaly replicated rvs np.random.seed(987654321) x = stats.norm.rvs(loc=0.2, size=100) D,p = kstest(x, 'norm', mode='asymp') assert_almost_equal( D, 0.12464329735846891, 15) assert_almost_equal( p, 0.089444888711820769, 15) assert_almost_equal( np.array(kstest(x, 'norm', mode='asymp')), np.array((0.12464329735846891, 0.089444888711820769)), 15) assert_almost_equal( np.array(kstest(x,'norm', alternative = 'smaller')), np.array((0.12464329735846891, 0.040989164077641749)), 15) assert_almost_equal( np.array(kstest(x,'norm', alternative = 'larger')), np.array((0.0072115233216310994, 0.98531158590396228)), 14) # the rest is for Monte Carlo comparison with r try: import rpy r=rpy.r ksfn=r('ks.test') except ImportError: print "rpy not available" def get_R_kstest(xxr): n = len(xxr) #print '\n two-sided' resultr = ksfn(xxr,'pnorm', exact = True) D_r = resultr['statistic']['D'] resultrf = ksfn(xxr,'pnorm', exact = False) #print '\n less' resultr_l = ksfn(xxr,'pnorm', alternative = "less", exact = True ) D_r_l = resultr_l['statistic']['D^-'] #print '\n greater' resultr_g = ksfn(xxr,'pnorm', alternative = "greater", exact = True) D_r_g = resultr_g['statistic']['D^+'] return [(resultr['statistic']['D'], resultr['p.value']), (resultrf['statistic']['D'], resultrf['p.value']), (resultr_g['statistic']['D^+'], resultr_g['p.value']), (resultr_l['statistic']['D^-'], resultr_l['p.value'])] def check(xxr): n = len(xxr) print n resultr={} resultrf={} print '\n two-sided' resultr = ksfn(xxr,'pnorm', exact = True) D_r = resultr['statistic']['D'] print resultr['statistic']['D'] print resultr['p.value'] resultrf = ksfn(xxr,'pnorm', exact = False) print resultrf['statistic']['D'] print resultrf['p.value'] print stats.kstest(xxr,'norm') print stats.ksone.sf(D_r,n) print stats.ksone.sf(D_r,n)*2 print stats.kstwobign.sf(D_r*np.sqrt(n)) pval_ksone_x2 = stats.ksone.sf(D_r,n)*2 pval_kstwo = stats.kstwobign.sf(D_r*np.sqrt(n)) print '\n less' resultr_l = ksfn(xxr,'pnorm', alternative = "less", exact = True ) D_r_l = resultr_l['statistic']['D^-'] print resultr_l['statistic']['D^-'] print resultr_l['p.value'] print stats.kstest(xxr,'norm') print stats.ksone.sf(D_r_l,n) print stats.ksone.sf(D_r_l,n)*2 print stats.kstwobign.sf(D_r_l*np.sqrt(n)) pval_ksone_l = stats.ksone.sf(D_r_l,n) pval_kstwo_l = stats.kstwobign.sf(D_r_l*np.sqrt(n)) print '\n greater' resultr_g = ksfn(xxr,'pnorm', alternative = "greater", exact = True) D_r_g = resultr_g['statistic']['D^+'] print resultr_g['statistic']['D^+'] print resultr_g['p.value'] print stats.kstest(xxr,'norm') print stats.ksone.sf(D_r_g,n) print stats.ksone.sf(D_r_g,n)*2 print stats.kstwobign.sf(D_r_g*np.sqrt(n)) pval_ksone_g = stats.ksone.sf(D_r_g,n) pval_kstwo_g = stats.kstwobign.sf(D_r_g*np.sqrt(n)) ## npt.assert_almost_equal(resultr_l['p.value'],pval_ksone_l,3) ## npt.assert_almost_equal(resultr_g['p.value'],pval_ksone_g,3) ## ## npt.assert_almost_equal(resultr['p.value'],pval_ksone_x2,3) ## npt.assert_almost_equal(resultrf['p.value'],pval_kstwo,4) #return resultr, resultrf, resultr_g, resultr_l return [(D_r, resultr['p.value']), (D_r, resultrf['p.value']), (D_r_g, resultr_g['p.value']), (D_r_l, resultr_l['p.value'])] ##resultrf = ksfn(xxr,'pnorm', exact = False) ##resultrf['statistic']['D'] ##resultrf['p.value'] ##stats.ksone.sf(resultr['statistic']['D'],100)*2 ##stats.kstwobign.sf(resultr['statistic']['D']*np.sqrt(100.0)) ## ### example normal size=1000 ##>>> xxrl = stats.norm.rvs(size=1000) ##>>> resultrl=ksfn(xxrl,'pnorm', exact = True) ##>>> resultrl['p.value'] ##0.2419499342788699 ##>>> resultrl['statistic']['D'] ##0.032317405617139472 ##>>> stats.kstest(xxrl,'norm') ##(0.032317405617139472, 0.12118954799968018) def monte_carlo(dgpdistfn, testdistfn, *args, **kwds): #dgpdistfn = getatrr(stats,dgpdist) #np.random.seed(987654321) loc = kwds.setdefault('loc',0.0) scale = kwds.setdefault('scale',1.0) n_repl = kwds.setdefault('n_repl',100) n = kwds.setdefault('size',100) smoothing = False#False#True# if smoothing: xxr = dgpdistfn.rvs(loc=loc, scale=scale, size=n) xxr.sort(axis=0) xxr = xxr.mean(axis=1) old = [] new = [] ksr = [] print 'data generation distribution is %s , hypothesis is %s' % \ (dgpdistfn.name, testdistfn.name) for ii in range(n_repl): #xxr = stats.norm.rvs(loc=loc,scale=scale, size=n) #xxr = stats.uniform.rvs(size=n,loc=-1,scale=2) xxr = dgpdistfn.rvs(loc=loc, scale=scale, size=n) old.append([stats.kstest(xxr,testdistfn.name)]) resultks = [] resultks.append(kstest(xxr,testdistfn.name, alternative = 'unequal', mode='approx')) resultks.append(kstest(xxr,testdistfn.name, alternative = 'unequal', mode='asymp')) resultks.append(kstest(xxr,testdistfn.name, alternative = 'larger')) resultks.append(kstest(xxr,testdistfn.name, alternative = 'smaller')) new.append(resultks) #result_r = get_R_kstest(xxr) ksr.append(get_R_kstest(xxr)) MAEold = np.absolute(np.array(old)-np.array(ksr)).mean(axis=0) MAE = np.absolute(np.array(new)-np.array(ksr)).mean(axis=0) print '='*50 print 'n = %d, loc = %f scale = %f, n_repl = %d' % (n,loc,scale,n_repl) print '''columns: D, pval rows are kstest(x,testdistfn.name, alternative = 'unequal', mode='approx')) kstest(x,testdistfn.name, alternative = 'unequal', mode='asymp')) kstest(x,testdistfn.name, alternative = 'larger')) kstest(x,testdistfn.name, alternative = 'smaller')) ''' print 'MAE old kstest' print MAEold print 'MAE new kstest' print MAE print 'percent count absdev > 0.005' print np.sum(np.absolute(np.array(new)-np.array(ksr))>0.005,axis=0)/float(n_repl)*100 print 'percent count absdev > 0.01' print np.sum(np.absolute(np.array(new)-np.array(ksr))>0.02,axis=0)/float(n_repl)*100 print 'percent count abs percent dev > 1%' print np.sum(np.absolute(np.array(new)-np.array(ksr))/np.array(ksr)>0.01,axis=0)/float(n_repl)*100 print 'percent count abs percent dev > 10%' print np.sum(np.absolute(np.array(new)-np.array(ksr))/np.array(ksr)>0.1,axis=0)/float(n_repl)*100 print 'new: count rejection at 1% significance' print 1-np.sum(np.absolute(np.array(new)[:,:,1])>0.01,axis=0)/float(n_repl) print 'R: proportion of rejection at 1% significance' print 1-np.sum(np.absolute(np.array(ksr)[:,:,1])>0.01,axis=0)/float(n_repl) print 'new: proportion of rejection at 5% significance' print 1-np.sum(np.absolute(np.array(new)[:,:,1])>0.05,axis=0)/float(n_repl) print 'R: proportion of rejection at 5% significance' print 1-np.sum(np.absolute(np.array(ksr)[:,:,1])>0.05,axis=0)/float(n_repl) print 'new: proportion of rejection at 10% significance' print 1-np.sum(np.absolute(np.array(new)[:,:,1])>0.1,axis=0)/float(n_repl) print 'R: proportion of rejection at 10% significance' print 1-np.sum(np.absolute(np.array(ksr)[:,:,1])>0.1,axis=0)/float(n_repl) if __name__ == '__main__': test_kstest() #monte_carlo(dgpdistfn, testdistfn, *args, **kwds) monte_carlo(stats.norm, stats.norm, loc=0.0, size=100, n_repl=1000)