'''
this contains new kstest, test function for test suite and functions to compare with R, including Monte Carlo
-------------------------------------------------------------------------------------------------------------


#simple example comparison to R

>>> x = np.linspace(-1,1,9)
>>> kstest(x,'norm')
(0.15865525393145707, 0.97727773578380395)
# comparing with R
>>> np.array(kstest(x,'norm')) - np.array((0.15865525393145705, 0.95164069201518386))
array([  2.77555756e-17,   2.56370438e-02])


>>> x = np.linspace(-15,15,9)
>>> kstest(x,'norm')
(0.44435602715924361, 0.038850142705171065)
# comparing with R
>>> np.array(kstest(x,'norm')) - np.array((0.44435602715924361, 0.038850140086788665))
array([  0.00000000e+00,   2.61838240e-09])


>>> np.random.seed(987654321)
>>> x = stats.norm.rvs(loc=0.2, size=100)
>>> kstest(x,'norm', alternative = 'smaller')
(0.12464329735846891, 0.040989164077641749)
>>> kstest(x,'norm', alternative = 'larger')
(0.0072115233216311081, 0.98531158590396395)
>>> kstest(x,'norm', mode='asymp')
(0.12464329735846891, 0.08944488871182088)
>>> np.array(kstest(x, 'norm', mode='asymp')) - np.array((0.12464329735846891, 0.089444888711820769
array([  0.00000000e+00,   1.11022302e-16])
>>> np.array(kstest(x,'norm', alternative = 'smaller')) - np.array((0.12464329735846891, 0.040989164077641749))
array([ 0.,  0.])
>>> np.array(kstest(x,'norm', alternative = 'larger')) - np.array((0.0072115233216310994, 0.98531158590396228))
array([  8.67361738e-18,   1.66533454e-15])

raise attribute error instead of silent replace:
with old version
>>> np.random.seed(987654321)
>>> stats.kstest('norm','',N=100)
(0.058051726828984629, 0.49080925022181865)
>>> np.random.seed(987654321)
>>> stats.kstest('norm','t',(1,),N=100) #silently uses norm not t
(0.058051726828984629, 0.49080925022181865)

'''




import numpy as np
import numpy.testing as npt
from scipy import stats, special



##ksprob = special.kolmogorov
##import scipy.stats
##import distributions
def kstest(rvs, cdf, args=(), N=20, alternative = 'unequal', mode='approx'):
    """Return the D-value and the p-value for a Kolmogorov-Smirnov test


    This performs a test of the distribution of random variables G(x) against
    a given distribution F(x). Under the null hypothesis the two distributions
    are identical, G(x)=F(x). The alternative hypothesis can be either
    'unequal' (default), 'smaller' or 'larger'. In the two one-sided test,
    the alternative is that the empirical cumulative distribution function,
    of the random variable is "smaller" or "larger" then the cumulative
    distribution function of the hypothesis F(x), G(x)<=F(x), resp. G(x)>=F(x).

    If the p-value is greater than the significance level (say 5%), then we
    cannot reject the hypothesis that the data come from the given
    distribution.

    Parameters
    ----------
    rvs : string or array or callable
        string: name of a distribution in scipy.stats
        array: random variables
        callable: function to generate random variables,
                requires keyword argument  size
    cdf : string or callable
        string: name of a distribution in scipy.stats
            if rvs is a string then cdf can evaluate to False
            or be the same as rvs
        callable: function to evaluate cdf

    args : distribution parameters used if rvs or cdf are strings
    N : sample size if rvs is string or callable
    alternative : 'unequal' (default), 'smaller' or 'larger'
        defines the alternative hypothesis (see explanation)
    mode : 'approx' (default) or 'asymp'
        defines distribution used for calculating p-value
        'approx' : use approximation to exact distribution of test statistic
        'asymp' : use asymptotic distribution of test statistic
    

    Returns
    -------
    D: test statistic either D, D+ or D-
    p-value

    Examples
    --------

    >>> x = np.linspace(-15,15,9)
    >>> kstest(x,'norm')
    (0.44435602715924361, 0.038850142705171065)

    >>> np.random.seed(987654321)
    >>> kstest('norm','',N=100)
    (0.058352892479417884, 0.88531190944151261)

    is equivalent to this
    >>> np.random.seed(987654321)
    >>> kstest(stats.norm.rvs(size=100),'norm')
    (0.058352892479417884, 0.88531190944151261)

    test against one-sided alternative hypothesis
    ---------------------------------------------
    >>> np.random.seed(987654321)
    >>> # shift distribution to larger values, so that cdf_dgp(x)< norm.cdf(x)
    >>> x = stats.norm.rvs(loc=0.2, size=100) 
    >>> kstest(x,'norm', alternative = 'smaller') 
    (0.12464329735846891, 0.040989164077641749)
    >>> # reject equal distribution against alternative hypothesis: smaller
    >>> kstest(x,'norm', alternative = 'larger')
    (0.0072115233216311081, 0.98531158590396395)
    >>> # don't reject equal distribution against alternative hypothesis: larger
    >>> kstest(x,'norm', mode='asymp')
    (0.12464329735846891, 0.08944488871182088)
    

    testing t distributed random variables against normal distribution
    ------------------------------------------------------------------
    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(100,size=100),'norm')
    (0.062018929165471248, 0.44505373063343567)
    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(3,size=100),'norm')
    (0.12101689575982888, 0.049143106661937996)
    """
    if isinstance(rvs, basestring):
        #cdf = getattr(stats, rvs).cdf
        if (not cdf) or (cdf == rvs):
            cdf = getattr(stats, rvs).cdf
            rvs = getattr(stats, rvs).rvs
            
        else:
            raise AttributeError, 'if rvs is string, cdf has to be the same distribution'
        
        
    if isinstance(cdf, basestring):
        cdf = getattr(stats, cdf).cdf
    if callable(rvs):
        kwds = {'size':N}
        vals = np.sort(rvs(*args,**kwds))
    else:
        vals = np.sort(rvs)
        N = len(vals)
    cdfvals = cdf(vals, *args)
    
    if alternative in ['unequal', 'larger']:
        Dplus = (np.arange(1.0, N+1)/N - cdfvals).max()
        if alternative == 'larger':
            return Dplus, stats.ksone.sf(Dplus,N)
            
    if alternative in ['unequal', 'smaller']:
        Dmin = (cdfvals - np.arange(0.0, N)/N).max()
        if alternative == 'smaller':
            return Dmin, stats.ksone.sf(Dmin,N)
        
    if alternative == 'unequal':
        D = np.max([Dplus,Dmin])
        if mode == 'asymp':
             return D, stats.kstwobign.sf(D*np.sqrt(N))
        if mode == 'approx':
            pval_two = stats.kstwobign.sf(D*np.sqrt(N))
            if N > 2666 or pval_two > 0.80 - N*0.3/1000.0 :
                return D, stats.kstwobign.sf(D*np.sqrt(N))
            else:
                return D, stats.ksone.sf(D,N)*2

def test_kstest():
    from numpy.testing import assert_almost_equal
    
    # comparing with values from R
    x = np.linspace(-1,1,9)
    D,p = kstest(x,'norm')
    assert_almost_equal( D, 0.15865525393145705, 12)
    assert_almost_equal( p, 0.95164069201518386, 1)

    x = np.linspace(-15,15,9)
    D,p = kstest(x,'norm')
    assert_almost_equal( D, 0.44435602715924361, 15)
    assert_almost_equal( p, 0.038850140086788665, 8)

    # the following tests rely on deterministicaly replicated rvs
    np.random.seed(987654321)
    x = stats.norm.rvs(loc=0.2, size=100)
    D,p = kstest(x, 'norm', mode='asymp')
    assert_almost_equal( D, 0.12464329735846891, 15)
    assert_almost_equal( p, 0.089444888711820769, 15)
    assert_almost_equal( np.array(kstest(x, 'norm', mode='asymp')),
                np.array((0.12464329735846891, 0.089444888711820769)), 15)
    assert_almost_equal( np.array(kstest(x,'norm', alternative = 'smaller')),
                np.array((0.12464329735846891, 0.040989164077641749)), 15)
    assert_almost_equal( np.array(kstest(x,'norm', alternative = 'larger')),
                np.array((0.0072115233216310994, 0.98531158590396228)), 14)


# the rest is for Monte Carlo comparison with r

try:
    import rpy
    r=rpy.r
    ksfn=r('ks.test')
except ImportError:
    print "rpy not available"

def get_R_kstest(xxr):
    n = len(xxr)
    #print '\n two-sided'
    resultr = ksfn(xxr,'pnorm', exact = True)
    D_r = resultr['statistic']['D']
    
    resultrf = ksfn(xxr,'pnorm', exact = False)

    #print '\n less'
    resultr_l = ksfn(xxr,'pnorm', alternative = "less", exact = True )
    D_r_l = resultr_l['statistic']['D^-']

    #print '\n greater'
    resultr_g = ksfn(xxr,'pnorm', alternative = "greater", exact = True)
    D_r_g = resultr_g['statistic']['D^+']

    return [(resultr['statistic']['D'], resultr['p.value']),
            (resultrf['statistic']['D'], resultrf['p.value']),
            (resultr_g['statistic']['D^+'], resultr_g['p.value']),
            (resultr_l['statistic']['D^-'], resultr_l['p.value'])]

def check(xxr):
    n = len(xxr)
    print n
    resultr={}
    resultrf={}
    print '\n two-sided'
    resultr = ksfn(xxr,'pnorm', exact = True)
    D_r = resultr['statistic']['D']
    print resultr['statistic']['D']
    print resultr['p.value']
    resultrf = ksfn(xxr,'pnorm', exact = False)
    print resultrf['statistic']['D']
    print resultrf['p.value']

    print stats.kstest(xxr,'norm')
    print stats.ksone.sf(D_r,n)
    print stats.ksone.sf(D_r,n)*2
    print stats.kstwobign.sf(D_r*np.sqrt(n))
    pval_ksone_x2 = stats.ksone.sf(D_r,n)*2
    pval_kstwo = stats.kstwobign.sf(D_r*np.sqrt(n))

    print '\n less'
    resultr_l = ksfn(xxr,'pnorm', alternative = "less", exact = True )
    D_r_l = resultr_l['statistic']['D^-']
    print resultr_l['statistic']['D^-']
    print resultr_l['p.value']

    print stats.kstest(xxr,'norm')
    print stats.ksone.sf(D_r_l,n)
    print stats.ksone.sf(D_r_l,n)*2
    print stats.kstwobign.sf(D_r_l*np.sqrt(n))
    pval_ksone_l = stats.ksone.sf(D_r_l,n)
    pval_kstwo_l = stats.kstwobign.sf(D_r_l*np.sqrt(n))
    
    print '\n greater'
    resultr_g = ksfn(xxr,'pnorm', alternative = "greater", exact = True)
    D_r_g = resultr_g['statistic']['D^+']
    print resultr_g['statistic']['D^+']
    print resultr_g['p.value']

    print stats.kstest(xxr,'norm')
    print stats.ksone.sf(D_r_g,n)
    print stats.ksone.sf(D_r_g,n)*2
    print stats.kstwobign.sf(D_r_g*np.sqrt(n))
    pval_ksone_g = stats.ksone.sf(D_r_g,n)
    pval_kstwo_g = stats.kstwobign.sf(D_r_g*np.sqrt(n))

##    npt.assert_almost_equal(resultr_l['p.value'],pval_ksone_l,3)
##    npt.assert_almost_equal(resultr_g['p.value'],pval_ksone_g,3)
##    
##    npt.assert_almost_equal(resultr['p.value'],pval_ksone_x2,3)
##    npt.assert_almost_equal(resultrf['p.value'],pval_kstwo,4)
    #return resultr, resultrf, resultr_g, resultr_l
    return [(D_r, resultr['p.value']),
            (D_r, resultrf['p.value']),
            (D_r_g, resultr_g['p.value']),
            (D_r_l, resultr_l['p.value'])]
            

##resultrf = ksfn(xxr,'pnorm', exact = False)
##resultrf['statistic']['D']
##resultrf['p.value']
##stats.ksone.sf(resultr['statistic']['D'],100)*2
##stats.kstwobign.sf(resultr['statistic']['D']*np.sqrt(100.0))
##
### example normal size=1000
##>>> xxrl = stats.norm.rvs(size=1000)
##>>> resultrl=ksfn(xxrl,'pnorm', exact = True)
##>>> resultrl['p.value']
##0.2419499342788699
##>>> resultrl['statistic']['D']
##0.032317405617139472
##>>> stats.kstest(xxrl,'norm')
##(0.032317405617139472, 0.12118954799968018)



def monte_carlo(dgpdistfn, testdistfn, *args, **kwds):
 
    #dgpdistfn = getatrr(stats,dgpdist)
    #np.random.seed(987654321)
    loc = kwds.setdefault('loc',0.0)
    scale = kwds.setdefault('scale',1.0)
    n_repl = kwds.setdefault('n_repl',100)
    n = kwds.setdefault('size',100)


    smoothing = False#False#True#
    if smoothing:
        xxr = dgpdistfn.rvs(loc=loc, scale=scale, size=n)
        xxr.sort(axis=0)
        xxr = xxr.mean(axis=1)

    old = []
    new = []
    ksr = []

    print 'data generation distribution is %s , hypothesis is %s' % \
                              (dgpdistfn.name, testdistfn.name)
    for ii in range(n_repl):
    
        #xxr = stats.norm.rvs(loc=loc,scale=scale, size=n)
        #xxr = stats.uniform.rvs(size=n,loc=-1,scale=2)
        xxr = dgpdistfn.rvs(loc=loc, scale=scale, size=n)
        
        old.append([stats.kstest(xxr,testdistfn.name)])
        resultks = []
        resultks.append(kstest(xxr,testdistfn.name, alternative = 'unequal', mode='approx'))
        resultks.append(kstest(xxr,testdistfn.name, alternative = 'unequal', mode='asymp'))
        resultks.append(kstest(xxr,testdistfn.name, alternative = 'larger'))
        resultks.append(kstest(xxr,testdistfn.name, alternative = 'smaller'))
        new.append(resultks)
        
        #result_r = get_R_kstest(xxr)
        ksr.append(get_R_kstest(xxr))

    MAEold = np.absolute(np.array(old)-np.array(ksr)).mean(axis=0)
    MAE = np.absolute(np.array(new)-np.array(ksr)).mean(axis=0)
    print '='*50
    print 'n = %d, loc = %f scale = %f, n_repl = %d' % (n,loc,scale,n_repl)
    print '''columns: D, pval
        rows are
        kstest(x,testdistfn.name, alternative = 'unequal', mode='approx'))
        kstest(x,testdistfn.name, alternative = 'unequal', mode='asymp'))
        kstest(x,testdistfn.name, alternative = 'larger'))
        kstest(x,testdistfn.name, alternative = 'smaller'))
        '''

    print 'MAE old kstest'
    print MAEold
    print 'MAE new kstest'
    print MAE
    print 'percent count absdev > 0.005'
    print np.sum(np.absolute(np.array(new)-np.array(ksr))>0.005,axis=0)/float(n_repl)*100
    print 'percent count absdev > 0.01'
    print np.sum(np.absolute(np.array(new)-np.array(ksr))>0.02,axis=0)/float(n_repl)*100
    print 'percent count abs percent dev > 1%'
    print np.sum(np.absolute(np.array(new)-np.array(ksr))/np.array(ksr)>0.01,axis=0)/float(n_repl)*100
    print 'percent count abs percent dev > 10%'
    print np.sum(np.absolute(np.array(new)-np.array(ksr))/np.array(ksr)>0.1,axis=0)/float(n_repl)*100
    print 'new: count rejection at 1% significance'
    print 1-np.sum(np.absolute(np.array(new)[:,:,1])>0.01,axis=0)/float(n_repl)
    print 'R: proportion of rejection at 1% significance'
    print 1-np.sum(np.absolute(np.array(ksr)[:,:,1])>0.01,axis=0)/float(n_repl)
    print 'new: proportion of rejection at 5% significance'
    print 1-np.sum(np.absolute(np.array(new)[:,:,1])>0.05,axis=0)/float(n_repl)
    print 'R: proportion of rejection at 5% significance'
    print 1-np.sum(np.absolute(np.array(ksr)[:,:,1])>0.05,axis=0)/float(n_repl)
    print 'new: proportion of rejection at 10% significance'
    print 1-np.sum(np.absolute(np.array(new)[:,:,1])>0.1,axis=0)/float(n_repl)
    print 'R: proportion of rejection at 10% significance'
    print 1-np.sum(np.absolute(np.array(ksr)[:,:,1])>0.1,axis=0)/float(n_repl)


if __name__ == '__main__':
    test_kstest()
    
    #monte_carlo(dgpdistfn, testdistfn, *args, **kwds)
    monte_carlo(stats.norm, stats.norm, loc=0.0, size=100, n_repl=1000)