[Scipy-svn] r2267 - in trunk/Lib/sandbox/pyem: . pyem pyem/src

scipy-svn at scipy.org scipy-svn at scipy.org
Thu Oct 12 09:45:23 EDT 2006


Author: cdavid
Date: 2006-10-12 08:45:14 -0500 (Thu, 12 Oct 2006)
New Revision: 2267

Added:
   trunk/Lib/sandbox/pyem/TODO
   trunk/Lib/sandbox/pyem/pyem/
   trunk/Lib/sandbox/pyem/pyem/__init__.py
   trunk/Lib/sandbox/pyem/pyem/densities.py
   trunk/Lib/sandbox/pyem/pyem/gmm_em.py
   trunk/Lib/sandbox/pyem/pyem/src/
   trunk/Lib/sandbox/pyem/pyem/src/Makefile
   trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
   trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd
   trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd
Removed:
   trunk/Lib/sandbox/pyem/Makefile
   trunk/Lib/sandbox/pyem/c_gmm.pyx
   trunk/Lib/sandbox/pyem/c_numpy.pxd
   trunk/Lib/sandbox/pyem/c_python.pxd
   trunk/Lib/sandbox/pyem/densities.py
   trunk/Lib/sandbox/pyem/gmm_em.py
Log:
[pyem @ david at ar.media.kyoto-u.ac.jp-20060712045202-8dd4340c31b48cd7]
Put files into a package
David Cournapeau <david at ar.media.kyoto-u.ac.jp> | 2006-07-12 13:52:02 +0900 (Wed, 12 Jul 2006)

Deleted: trunk/Lib/sandbox/pyem/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/Makefile	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/Makefile	2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,26 +0,0 @@
-CC	= colorgcc
-LD	= gcc
-
-PYREX	= python2.4-pyrexc
-
-PYTHONINC	= -I/usr/include/python2.4
-NUMPYINC	= -I/usr/lib/python2.4/site-packages/numpy/core/include
-OPTIMS		= -O3 -ffast-math -march=pentium4
-WARN		= -W -Wall
-
-CFLAGS	= $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
-
-c_gmm.so: c_gmm.o
-	$(LD) -shared -o $@ $< -Wl,-soname,$@
-
-c_gmm.o: c_gmm.c
-	$(CC) -c $(CFLAGS) -fPIC $<
-
-c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
-	$(PYREX) $<
-
-clean:
-	rm -f c_gmm.c
-	rm -f *.o
-	rm -f *.so
-	rm -f *.pyc

Added: trunk/Lib/sandbox/pyem/TODO
===================================================================
--- trunk/Lib/sandbox/pyem/TODO	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/TODO	2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,7 @@
+# Last Change: Fri Jun 30 07:00 PM 2006 J
+
+# In more urgent order:
+
+1: test and benchmarks for various length and model size
+2: refactoring for Mixture models, having online mode, etc...
+3: for gaussian densities: compute level <-> confidence ellipses relationship with Chi2 model

Deleted: trunk/Lib/sandbox/pyem/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/c_gmm.pyx	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/c_gmm.pyx	2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,66 +0,0 @@
-# Last Change: Tue May 16 11:00 AM 2006 J
-
-cimport c_numpy
-cimport c_python
-import numpy
-
-c_numpy.import_array()
-
-# pyrex version of _vq. Much faster in high dimension/high number of K 
-# (ie more than 3-4)
-def _vq(data, init):
-    (n, d)  = data.shape
-    label   = numpy.zeros(n, int)
-    _imp_vq(data, init, label)
-
-    return label
-
-def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
-    cdef int n
-    cdef int d
-    cdef int nc
-    cdef int i
-    cdef int j
-    cdef int k
-    cdef int *labeld
-    cdef double *da, *code
-    cdef double dist
-    cdef double acc
-
-    n   = data.dimensions[0]
-    d   = data.dimensions[1]
-    nc  = init.dimensions[0]
-
-    if not data.dtype == numpy.dtype(numpy.float64):
-        print '_vq not (yet) implemented for dtype %s'%dtype.name
-        return
-    da  = (<double*>data.data)
-
-    if not init.dtype == numpy.dtype(numpy.float64):
-        print '_vq not (yet) implemented for dtype %s'%dtype.name
-        return
-    code    = (<double*>init.data)
-
-    if not label.dtype == numpy.dtype(numpy.int32):
-        print '_vq not (yet) implemented for dtype %s'%dtype.name
-        return
-    labeld  = (<int*>label.data)
-
-    for i from 0<=i<n:
-        acc = 0
-        lab = 0
-        for j from 0<=j<d:
-            acc = acc + (da[i * d + j] - code[j]) * \
-                (da[i * d + j] - code[j])
-        dist    = acc
-        for k from 1<=k<nc:
-            acc     = 0
-            for j from 0<=j<d:
-                acc = acc + (da[i * d + j] - code[k * d + j]) * \
-                    (da[i * d + j] - code[k * d + j])
-            if acc < dist:
-                dist    = acc
-                lab     = k
-        labeld[i]   = lab
-
-    return lab

Deleted: trunk/Lib/sandbox/pyem/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/c_numpy.pxd	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/c_numpy.pxd	2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,59 +0,0 @@
-# :Author:    Robert Kern
-# :Copyright: 2004, Enthought, Inc.
-# :License:   BSD Style
-
-
-cdef extern from "numpy/arrayobject.h":
-    ctypedef enum PyArray_TYPES:
-        PyArray_BOOL
-        PyArray_BYTE
-        PyArray_UBYTE
-        PyArray_SHORT
-        PyArray_USHORT 
-        PyArray_INT
-        PyArray_UINT 
-        PyArray_LONG
-        PyArray_ULONG
-        PyArray_LONGLONG
-        PyArray_ULONGLONG
-        PyArray_FLOAT
-        PyArray_DOUBLE 
-        PyArray_LONGDOUBLE
-        PyArray_CFLOAT
-        PyArray_CDOUBLE
-        PyArray_CLONGDOUBLE
-        PyArray_OBJECT
-        PyArray_STRING
-        PyArray_UNICODE
-        PyArray_VOID
-        PyArray_NTYPES
-        PyArray_NOTYPE
-
-    ctypedef int intp 
-
-    ctypedef extern class numpy.dtype [object PyArray_Descr]:
-        cdef int type_num, elsize, alignment
-        cdef char type, kind, byteorder, hasobject
-        cdef object fields, typeobj
-
-    ctypedef extern class numpy.ndarray [object PyArrayObject]:
-        cdef char *data
-        cdef int nd
-        cdef intp *dimensions
-        cdef intp *strides
-        cdef object base
-        cdef dtype descr
-        cdef int flags
-
-    ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
-    int PyArray_Check(object obj)
-    ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type, 
-        int mindim, int maxdim)
-    intp PyArray_SIZE(ndarray arr)
-    void *PyArray_DATA(ndarray arr)
-    ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
-		    int requirements, object context)
-    ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
-		    intp* strides, void* data, int flags, object parent)
-
-    void import_array()

Deleted: trunk/Lib/sandbox/pyem/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/c_python.pxd	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/c_python.pxd	2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,20 +0,0 @@
-# -*- Mode: Python -*-  Not really, but close enough
-
-# Expose as much of the Python C API as we need here
-
-cdef extern from "stdlib.h":
-    ctypedef int size_t
-
-cdef extern from "Python.h":
-    ctypedef int Py_intptr_t
-    void*  PyMem_Malloc(size_t)
-    void*  PyMem_Realloc(void *p, size_t n)
-    void   PyMem_Free(void *p)
-    char*  PyString_AsString(object string)
-    object PyString_FromString(char *v)
-    object PyString_InternFromString(char *v)
-    int    PyErr_CheckSignals()
-    object PyFloat_FromDouble(double v)
-    void   Py_XINCREF(object o)
-    void   Py_XDECREF(object o)
-    void   Py_CLEAR(object o) # use instead of decref

Deleted: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/densities.py	2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,375 +0,0 @@
-#! /usr/bin/python
-#
-# Copyrighted David Cournapeau
-# Last Change: Mon May 29 01:00 PM 2006 J
-
-import numpy as N
-import numpy.linalg as lin
-
-# Error classes
-class DenError(Exception):
-    """Base class for exceptions in this module.
-    
-    Attributes:
-        expression -- input expression in which the error occurred
-        message -- explanation of the error"""
-    def __init__(self, message):
-        self.message    = message
-    
-    def __str__(self):
-        return self.message
-
-# The following function do all the fancy stuff to check that parameters
-# are Ok, and call the right implementation if args are OK.
-def gauss_den(x, mu, va, log = False):
-    """ Compute multivariate Gaussian density at points x for 
-    mean mu and variance va.
-    
-    Vector are row vectors, except va which can be a matrix
-    (row vector variance for diagonal variance)
-    
-    If log is True, than the log density is returned 
-    (useful for underflow ?)"""
-    mu  = N.atleast_2d(mu)
-    va  = N.atleast_2d(va)
-    x   = N.atleast_2d(x)
-    
-    #=======================#
-    # Checking parameters   #
-    #=======================#
-    if len(N.shape(mu)) != 2:
-        raise DenError("mu is not rank 2")
-        
-    if len(N.shape(va)) != 2:
-        raise DenError("va is not rank 2")
-        
-    if len(N.shape(x)) != 2:
-        raise DenError("x is not rank 2")
-        
-    (n, d)      = x.shape
-    (dm0, dm1)  = mu.shape
-    (dv0, dv1)  = va.shape
-    
-    # Check x and mu same dimension
-    if dm0 != 1:
-        msg = "mean must be a row vector!"
-        raise DenError(msg)
-    if dm1 != d:
-        msg = "x and mu not same dim"
-        raise DenError(msg)
-    # Check va and mu same size
-    if dv1 != d:
-        msg = "mu and va not same dim"
-        raise DenError(msg)
-    if dv0 != 1 and dv0 != d:
-        msg = "va not square"
-        raise DenError(msg)
-
-    #===============#
-    # Computation   #
-    #===============#
-    data    = x - mu
-
-    if d == 1:
-        # scalar case
-        return _scalar_gauss_den(data[:, 0], mu[0, 0], va[0, 0], log)
-    elif dv0 == 1:
-        # Diagonal matrix case
-        return _diag_gauss_den(data, mu, va, log)
-    elif dv1 == dv0:
-        # full case
-        return  _full_gauss_den(data, mu, va, log)
-    else:
-        raise DenError("variance mode not recognized, this is a bug")
-
-# Those 3 functions do almost all the actual computation
-def _scalar_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in scalar case. It assumes all args
-    are conformant, so it should not be used directly
-    
-    ** Expect centered data (ie with mean removed) **
-
-    Call gauss_den instead"""
-    d       = mu.size
-    inva    = 1/va
-    fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
-    y       = (x ** 2) * -0.5 * inva
-    if not log:
-        y   = fac * N.exp(y)
-    else:
-        y   = y + log(fac)
-
-    return y
-    
-def _diag_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in scalar case. It assumes all args
-    are conformant, so it should not be used directly
-    
-    ** Expect centered data (ie with mean removed) **
-
-    Call gauss_den instead"""
-    # Diagonal matrix case
-    d   = mu.size
-    y   = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
-    if not log:
-        for i in range(1, d):
-            y    *=  _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
-    else:
-        for i in range(1, d):
-            y    +=  _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
-    return y
-
-def _full_gauss_den(x, mu, va, log):
-    """ This function is the actual implementation
-    of gaussian pdf in full matrix case. 
-    
-    It assumes all args are conformant, so it should 
-    not be used directly Call gauss_den instead
-    
-    ** Expect centered data (ie with mean removed) **
-
-    Does not check if va is definite positive (on inversible 
-    for that matter), so the inverse computation and/or determinant
-    would throw an exception."""
-    d       = mu.size
-    inva    = lin.inv(va)
-    fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
-
-    # # Slow version
-    # n       = N.size(x, 0)
-    # y       = N.zeros(n, float)
-    # for i in range(n):
-    #     y[i] = N.matrixmultiply(x[i,:],
-    #              N.matrixmultiply(inva, N.transpose(x[i,:])))
-    # y *= -0.5
-
-    # we are using a trick with sum to "emulate" 
-    # the matrix multiplication inva * x without any explicit loop
-    y   = N.matrixmultiply(x, inva)
-    y   = -0.5 * N.sum(y * x, 1)
-
-    if not log:
-        y   = fac * N.exp(y)
-    else:
-        y   = y + N.log(fac)
- 
-    return y
-
-# To plot a confidence ellipse from multi-variate gaussian pdf
-def gauss_ell(mu, va, dim = [0, 1], npoints = 100):
-    """ Given a mean and covariance for multi-variate
-    gaussian, returns npoints points for the ellipse
-    of confidence 0.39
-    
-    Returns the coordinate x and y of the ellipse"""
-    
-    # TODO: Get a confidence interval using the Chi2 distribution
-    # of points at a given mahalanobis distance...
-    mu      = N.atleast_1d(mu)
-    va      = N.atleast_1d(va)
-    c       = N.array(dim)
-
-    if mu.size == va.size:
-        mode    = 'diag'
-    else:
-        if va.ndim == 2:
-            if va.shape[0] == va.shape[1]:
-                mode    = 'full'
-            else:
-                raise DenError("variance not square")
-        else:
-            raise DenError("mean and variance are not dim conformant")
-
-    level   = 0.39
-    
-    # Generates a circle of npoints
-    theta   = N.linspace(0, 2 * N.pi, npoints)
-    circle  = N.array([N.cos(theta), N.sin(theta)])
-
-    # Get the dimension which we are interested in:
-    mu  = mu[dim]
-    if mode == 'diag':
-        va      = va[dim]
-        elps    = N.outerproduct(mu, N.ones(npoints, float))
-        elps    += N.matrixmultiply(N.diag(N.sqrt(va)), circle)
-    elif mode == 'full':
-        va  = va[c,:][:,c]
-        # Method: compute the cholesky decomp of each cov matrix, that is
-        # compute cova such as va = cova * cova' 
-        # WARN: scipy is different than matlab here, as scipy computes a lower
-        # triangular cholesky decomp: 
-        #   - va = cova * cova' (scipy)
-        #   - va = cova' * cova (matlab)
-        # So take care when comparing results with matlab !
-        cova    = lin.cholesky(va)
-        elps    = N.outerproduct(mu, N.ones(npoints, float))
-        elps    += N.matrixmultiply(cova, circle)
-    else:
-        raise DenParam("var mode not recognized")
-
-    return elps[0, :], elps[1, :]
-
-def test_gauss_den():
-    """"""
-    # import tables
-    # import numpy as N
-    # 
-    # filename    = 'dendata.h5'
-
-    # # # Dimension 1
-    # # d   = 1
-    # # mu  = 1.0
-    # # va  = 2.0
-
-    # # X   = N.randn(1e3, 1)
-
-    # # Y   = gauss_den(X, mu, va)
-
-    # # h5file      = tables.openFile(filename, "w")
-
-    # # h5file.createArray(h5file.root, 'X', X)
-    # # h5file.createArray(h5file.root, 'mu', mu)
-    # # h5file.createArray(h5file.root, 'va', va)
-    # # h5file.createArray(h5file.root, 'Y', Y)
-
-    # # h5file.close()
-
-    # # # Dimension 2, diag
-    # # d   = 2
-    # # mu  = N.array([1.0, -2.0])
-    # # va  = N.array([1.0, 2.0])
-
-    # # X   = N.randn(1e3, 2)
-
-    # # Y   = gauss_den(X, mu, va)
-
-    # # h5file      = tables.openFile(filename, "w")
-
-    # # h5file.createArray(h5file.root, 'X', X)
-    # # h5file.createArray(h5file.root, 'mu', mu)
-    # # h5file.createArray(h5file.root, 'va', va)
-    # # h5file.createArray(h5file.root, 'Y', Y)
-
-    # # Dimension 2, full
-    # d   = 2
-    # mu  = N.array([[0.2, -1.0]])
-    # va  = N.array([[1.2, 0.1], [0.1, 0.5]])
-
-    # X   = N.randn(1e3, 2)
-
-    # Y   = gauss_den(X, mu, va)
-
-    # h5file      = tables.openFile(filename, "w")
-
-    # h5file.createArray(h5file.root, 'X', X)
-    # h5file.createArray(h5file.root, 'mu', mu)
-    # h5file.createArray(h5file.root, 'va', va)
-    # h5file.createArray(h5file.root, 'Y', Y)
-
-    # h5file.close()
-
-    import numpy.testing as testing
-    #=================
-    # Small test in 1d
-    #=================
-    va  = 2.0
-    mu  = 1.0
-    X   = N.linspace(-2, 2, 10)[:, N.NewAxis]
-
-    Yt  = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945, 
-            0.14086453882683,
-            0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
-            0.26114678964743, 0.21969564473386])
-
-    Y   = gauss_den(X, mu, va)
-    try:
-        testing.assert_array_almost_equal(Y, Yt)
-        print "1d test succeded"
-    except AssertionError:
-        print "test fails in 1d"
-
-    #============================
-    # Small test in 2d (diagonal)
-    #============================
-    mu  = N.atleast_2d([-1.0, 2.0])
-    va  = N.atleast_2d([2.0, 3.0])
-    X1  = N.linspace(-2, 2, 10)[:, N.NewAxis]
-    X2  = N.linspace(-1, 3, 10)[:, N.NewAxis]
-    X   = N.concatenate(([X1, X2]), 1)
-    
-    Yt  = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786, 
-            0.03977576221540, 0.04354490552910, 0.04043592581117, 
-            0.03184994053539, 0.02127948225225, 0.01205937178755, 
-            0.00579694938623 ])
-
-    Y   = gauss_den(X, mu, va)
-    try:
-        testing.assert_array_almost_equal(Y, Yt)
-        print "2d diag test succeded"
-    except AssertionError:
-        print "test fails in 2d diag"
-
-    #============================
-    # Small test in 2d (full mat)
-    #============================
-    mu  = N.array([[0.2, -1.0]])
-    va  = N.array([[1.2, 0.1], [0.1, 0.5]])
-    X1  = N.linspace(-2, 2, 10)[:, N.NewAxis]
-    X2  = N.linspace(-3, 3, 10)[:, N.NewAxis]
-    X   = N.concatenate(([X1, X2]), 1)
-    
-    Yt  = N.array([0.00096157109751, 0.01368908714856,
-        0.07380823191162, 0.15072050533842, 
-        0.11656739937861, 0.03414436965525,
-        0.00378789836599, 0.00015915297541, 
-        0.00000253261067, 0.00000001526368])
-
-    Y   = gauss_den(X, mu, va)
-    try:
-        testing.assert_array_almost_equal(Y, Yt)
-        print "2d full test succeded"
-    except AssertionError:
-        print "test fails in 2d full"
-         
-
-if __name__ == "__main__":
-    import pylab
-
-    #=========================================
-    # Test plotting a simple diag 2d variance:
-    #=========================================
-    va  = N.array([5, 3])
-    mu  = N.array([2, 3])
-
-    # Generate a multivariate gaussian of mean mu and covariance va
-    X       = N.randn(1e3, 2)
-    Yc      = N.matrixmultiply(N.diag(N.sqrt(va)), X.transpose())
-    Yc      = Yc.transpose() + mu
-
-    # Plotting
-    Xe, Ye  = gauss_ell(mu, va, npoints = 100)
-    pylab.figure()
-    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
-    pylab.plot(Xe, Ye, 'r')
-
-    #=========================================
-    # Test plotting a simple full 2d variance:
-    #=========================================
-    va  = N.array([[0.2, 0.1],[0.1, 0.5]])
-    mu  = N.array([0, 3])
-
-    # Generate a multivariate gaussian of mean mu and covariance va
-    X       = N.randn(1e3, 2)
-    Yc      = N.matrixmultiply(lin.cholesky(va), X.transpose())
-    Yc      = Yc.transpose() + mu
-
-    # Plotting
-    Xe, Ye  = gauss_ell(mu, va, npoints = 100)
-    pylab.figure()
-    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
-    pylab.plot(Xe, Ye, 'r')
-    pylab.show()
-
-    savefig('example.png')

Deleted: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/gmm_em.py	2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,734 +0,0 @@
-# /usr/bin/python
-# Last Change: Thu May 25 03:00 PM 2006 J
-
-# TODO:
-#   - interface with libem
-#   - implements E and M in batch mode for mixtures, with special case for gmm 
-#   ( general EM should be outside this module. This one should be GMM specific, maybe)
-#   - implements E and M in online mode
-#   - python has list. Does it make sense to keep the h2m format for mean, va, when
-#   we can use list instead ?
-#   - a Gaussian Mixture Model should be a class, really !
-
-import numpy as N
-import numpy.linalg as lin
-import densities
-
-MAX_DEV = 1e-10
-
-# Error classes
-class GmmError(Exception):
-    """Base class for exceptions in this module."""
-    pass
-
-class GmmParamError(GmmError):
-    """Exception raised for errors in gmm params
-
-    Attributes:
-        expression -- input expression in which the error occurred
-        message -- explanation of the error
-    """
-    def __init__(self, message):
-        self.message    = message
-    
-    def __str__(self):
-        return self.message
-
-
-# Function to generate a GMM, or valid parameters for GMM
-def gen_rand_index(p, n):
-    """Generate a N samples vector containing random index between 1 
-    and length(p), each index i with probability p(i)"""
-    # TODO Check args here
-    
-    # TODO: check each value of inverse distribution is
-    # different
-    invcdf  = N.cumsum(p)
-    uni     = N.rand(n)
-    index   = N.zeros(n)
-
-    # This one should be a bit faster
-    for k in range(len(p)-1, 0, -1):
-        blop        = N.where(N.logical_and(invcdf[k-1] <= uni, 
-                    uni < invcdf[k]))
-        index[blop] = k
-        
-    return index
-
-def gen_gmm(w, mu, va, n):
-    """Generate a gaussiam mixture model with weights w, 
-    mean mu and variances va. Each column of the parameters
-    are one component parameter.
-    """
-    # Check args
-    K, d, mode  = check_gmm_param(w, mu, va)
-
-    # Generate the mixture
-    S   = gen_rand_index(w, n)  # State index (ie hidden var)
-    X   = N.randn(n, d)         # standard gaussian
-
-    if mode == 'diag':
-        X   = mu[S, :]  + X * N.sqrt(va[S,:])
-    elif mode == 'full':
-        # Naive implementation
-        # cho = N.zeros(va.shape, float)
-        # for k in range(K):
-        #     # Using cholesky is more stable than sqrtm; sqrtm is not
-        #     # available in numpy anyway, only in scipy...
-        #     cho[k*d:k*d+d,:]    = lin.cholesky(va[k*d:k*d+d,:])
-        # for i in range(n):
-        # X[i,:]  = mu[S[i],:] + N.matrixmultiply(cho[S[i]*d:S[i]*d+d,:], X[i,:])
-        
-        # Faster:
-        cho = N.zeros((K, va.shape[1], va.shape[1]), float)
-        for k in range(K):
-            # Using cholesky is more stable than sqrtm; sqrtm is not
-            # available in numpy anyway, only in scipy...
-            cho[k]  = lin.cholesky(va[k*d:k*d+d,:])
-
-        for s in range(K):
-            tmpind      = N.where(S == s)[0]
-            X[tmpind]   = N.matrixmultiply(X[tmpind], cho[s].transpose()) + mu[s]
-    else:
-        raise GmmParamError('cov matrix mode not recognized')
-
-    return X
-
-def gen_gmm_param(d, K, varmode = 'diag', spread = 1):
-    """Generate valid parameters for a gaussian mixture model.
-    d is the dimension, K the number of components, and varmode
-    the mode for cov matrices.
-
-    Returns:    
-        - w
-        - mu
-        - va
-    """
-    w   = abs(N.randn(K))
-    w   = w / sum(w)
-
-    mu  = spread * N.randn(K, d)
-    if varmode == 'diag':
-        va  = abs(N.randn(K, d))
-    elif varmode == 'full':
-        va  = N.randn(K * d, d)
-        for k in range(K):
-            va[k*d:k*d+d]   = N.matrixmultiply( va[k*d:k*d+d], 
-                va[k*d:k*d+d].transpose())
-    else:
-        raise GmmParamError('cov matrix mode not recognized')
-
-    return w, mu, va
-
-def check_gmm_param(w, mu, va):
-    """Check that w, mu and va are valid parameters for
-    a mixture of gaussian: w should sum to 1, there should
-    be the same number of component in each param, the variances
-    should be positive definite, etc... 
-    
-    Params:
-        w   = vector or list of weigths of the mixture (K elements)
-        mu  = matrix: K * d
-        va  = list of variances (vector K * d or square matrices Kd * d)
-
-    returns:
-        K   = number of components
-        d   = dimension
-        mode    = 'diag' if diagonal covariance, 'full' of full matrices
-    """
-        
-    # Check that w is valid
-    if N.fabs(N.sum(w)  - 1) > MAX_DEV:
-        raise GmmParamError('weight does not sum to 1')
-    
-    if not len(w.shape) == 1:
-        raise GmmParamError('weight is not a vector')
-
-    # Check that mean and va have the same number of components
-    K           = len(w)
-
-    if N.ndim(mu) < 2:
-        msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
-        raise GmmParamError(msg)
-    if N.ndim(va) < 2:
-        msg = """va should be a K,d / K *d, d matrix, and a row vector if
-        only 1 diag comp"""
-        raise GmmParamError(msg)
-
-    (Km, d)     = mu.shape
-    (Ka, da)    = va.shape
-
-    if not K == Km:
-        msg = "not same number of component in mean and weights"
-        raise GmmParamError(msg)
-
-    if not d == da:
-        msg = "not same number of dimensions in mean and variances"
-        raise GmmParamError(msg)
-
-    if Km == Ka:
-        mode = 'diag'
-    else:
-        mode = 'full'
-        if not Ka == Km*d:
-            msg = "not same number of dimensions in mean and variances"
-            raise GmmParamError(msg)
-        
-    return K, d, mode
-        
-# For EM on GMM
-def multiple_gauss_den(data, mu, va):
-    """Helper function to generate several Gaussian
-    pdf (different parameters) from the same data"""
-    mu  = N.atleast_2d(mu)
-    va  = N.atleast_2d(va)
-
-    K   = mu.shape[0]
-    n   = data.shape[0]
-    d   = data.shape[1]
-    
-    y   = N.zeros((n, K), float)
-    if mu.size == va.size:
-        for i in range(K):
-            y[:, i] = densities.gauss_den(data, mu[i, :], va[i, :])
-    else:
-        for i in range(K):
-            y[:, i] = densities.gauss_den(data, mu[i, :], 
-                        va[d*i:d*i+d, :])
-
-    return y
-
-def gmm_init_kmean(data, k, mode, init = [], niter = 10):
-    """gmm_init_kmean(data, k, mode, init = [], niter = 10)
-    
-    Init EM for GMM with kmean from data, for k components. 
-    
-    Args:
-        - data:     Each row of data is one frame of dimension d. 
-        - k:        Number of components to look for
-        - mode:     Diagonal or Full covariance matrices
-        - init:     The initial centroids. The value given for k is
-        ignored, and the number of row in initc is used instead. 
-        If initc is not given, then the centroids are initialized 
-        with the k first values of data.
-        - niter:    Number of iterations in kmean.
-    
-    Returns:
-        (w, mu, va), initial parameters for a GMM.
-
-    Method:
-        Each weight is equiprobable, each mean is one centroid returned by kmean, and
-    covariances for component i is initialized with covariance of 
-    data corresponding with label i. Other strategies are possible, this one
-    is an easy one"""
-    if len(init) == 0:
-        init   = data[0:k, :]
-    else:
-        k       = initc.shape[0]
-
-    if data.ndim == 1:
-        d   = 1
-    else:
-        d   = N.shape(data)[1]
-
-    (code, label)   = kmean(data, init, niter)
-
-    w   = N.ones(k, float) / k
-    mu  = code.copy()
-    if mode == 'diag':
-        va  = N.zeros((k, d), float)
-        for i in range(k):
-            for j in range(d):
-                va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
-    elif mode == 'full':
-        va  = N.zeros((k*d, d), float)
-        for i in range(k):
-            va[i*d:i*d+d,:] = N.cov(data[N.where(label==i)], rowvar = 0)
-    else:
-        raise GmmParamError("mode " + str(mode) + " not recognized")
-
-    return w, mu, va
-
-def gmm_posterior(data, w, mu, va):
-    """ Computes the latent variable distribution (a 
-    posteriori probability) knowing the explicit data 
-    for the Gaussian model (w, mu, var): gamma(t, i) = 
-        P[state = i | observation = data(t); w, mu, va]
-
-    This is basically the E step of EM for GMM.
-   
-    the second returned value is the non normalized version 
-    of gamma, and may be needed for some computation, 
-    like eg likelihood"""
-    n   = data.shape[0]
-    K   = len(w)
-
-    # compute the gaussian pdf
-    tgd	= multiple_gauss_den(data, mu, va)
-    # multiply by the weight
-    tgd	*= w
-    # Normalize to get a pdf
-    gd	= tgd  / N.sum(tgd, axis=1)[:, N.NewAxis]
-
-    return gd, tgd
-
-# This function is just calling gmm_update and gmm_posterior, with
-# initialization. This is ugly, and we should have a class to model a GMM
-# instead of all this code to try to guess correct values and parameters...
-def gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
-    """
-    gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
-
-    Compute the parameters of a Gaussian Mixture Model using EM algorithm, 
-    with initial values w, mu and va (overwritten by the function).
-
-    Args:
-        - data:     contains the observed features, one row is one frame, ie one 
-        observation of dimension d
-        - niter:    number of iterations
-        - mode:     'diag' or 'full', depending on the wanted model for cov
-        matrices.
-        - K:        number of components
-        - w, mu, va initial parameters for the GMM. All or none must be given.
-        If no initial values are given, initialized by gmm_init_kmean; if they
-        are given, mode and k are ignored, and guessed from the given parameters
-        instead.
-
-    Returns:
-        w, mu, va, like as found by EM, where like is the likelihood for each 
-        iteration.
-    """
-    if len(w) == 0:
-        w, mu, va   = gmm_init_kmean(data, k, mode, 5)
-    else:
-        k, d, mode  = check_gmm_parm(w, mu, va)
-    
-    like    = N.zeros(niter, float)
-    for i in range(niter):
-        g, tgd  = gmm_posterior(data, w, mu, va)
-        like[i] = N.sum(N.log(N.sum(tgd, 1)))
-        w, mu, va   = gmm_update(X, g, d, k, mode)
-
-    return w, mu, va, like
-    
-def gmm_update(data, gamma, d, K, varmode):
-    """Computes update of the Gaussian Mixture Model (M step)
-    from the a posteriori pdf, computed by gmm_posterior
-    (E step).
-    """
-    n       = data.shape[0]
-    invn    = 1.0/n
-    mGamma  = N.sum(gamma)
-
-    if varmode == 'diag':
-        mu  = N.zeros((K, d), float)
-        va  = N.zeros((K, d), float)
-        for k in range(K):
-            x       = N.sum(N.outerproduct(gamma[:, k], 
-                        N.ones((1, d))) * data)
-            xx      = N.sum(N.outerproduct(gamma[:, k], 
-                        N.ones((1, d))) * (data ** 2))
-
-            mu[k,:] = x / mGamma[k]
-            va[k,:] = xx  / mGamma[k] - mu[k,:] ** 2
-        w   = invn * mGamma
-
-    elif varmode == 'full':
-        mu  = N.zeros((K, d), float)
-        va  = N.zeros((K*d, d), float)
-
-        # This is really slow because of the loop on each frame
-        # ways to improve that: either use 3d matrices and see 
-        # the memory usage increasing in n*d*d,
-        # using C (or pyrex, or ...) or using the trick commented
-        # which loops on the dimensions instead. If the number of dimensions
-        # is high, this won't help, though...
-        for k in range(K):
-            x   = N.sum(N.outerproduct(gamma[:, k], 
-                        N.ones((1, d), float)) * data)
-            xx  = N.zeros((d, d), float)
-            # for i in range(n):
-            #     xx  += gamma[i, k] * N.outerproduct(data[i,:], 
-            #                 data[i,:])
-
-            # This should be much faster
-            for i in range(d):
-                for j in range(d):
-                    xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[:,k])
-
-            mu[k,:] = x / mGamma[k]
-            va[k*d:k*d+d,:] = xx  / mGamma[k] - \
-                                N.outerproduct(mu[k,:], mu[k,:])
-        w   = invn * mGamma
-    else:
-        raise GmmParamError("varmode not recognized")
-
-    return w, mu, va
-
-# Misc functions
-def gmm_ellipses(mu, va, c = [0, 1], npoints = 100):
-    """Returns a list of ellipses describing the Gmm
-    defined by mu and va. c is the dimension we are projecting
-    the variances on a 2d space.
-    
-    Returns:
-        -Xe:    a list of x coordinates for the ellipses
-        -Ye:    a list of y coordinates for the ellipses
-
-    Example:
-        Suppose we have w, mu and va as parameters for a mixture, then:
-        
-        X       = gen_gmm(w, mu, va, 1000)
-        Xe, Ye  = gmm_ellipses(mu, va)
-        pylab.plot(X[:,0], X[:, 1], '.')
-        for k in len(w):
-            pylab.plot(Xe[k], Ye[k], 'r')
-            
-        Will plot samples X draw from the mixture model, and
-        plot the ellipses of equi-probability from the mean with
-        fixed level of confidence 0.39. 
-        
-    TODO: be able to modify the confidence interval to arbitrary
-    value (to do in gauss_ell)"""
-    K   = mu.shape[0]
-    w   = N.ones(K, float) / K
-    
-    K, d, mode  = check_gmm_param(w, mu, va)
-
-    # TODO: adjustable level (to do in gauss_ell). 
-    # For now, a level of 0.39 means that we draw
-    # ellipses for 1 standard deviation. 
-    Xe  = []
-    Ye  = []   
-    if mode == 'diag':
-        for i in range(K):
-            xe, ye  = densities.gauss_ell(mu[i,:], va[i,:], dim = c, 
-                    npoints = npoints)
-            Xe.append(xe)
-            Ye.append(ye)
-    elif mode == 'full':
-        for i in range(K):
-            xe, ye  = densities.gauss_ell(mu[i,:], va[i*d:i*d+d,:], dim = c, 
-                    npoints = npoints)
-            Xe.append(xe)
-            Ye.append(ye)
-
-    return Xe, Ye
-
-import c_gmm
-def kmean(data, init, iter = 10):
-    """Simple kmean implementation for EM
-    
-    returns a tuple (code, label), where code are the final
-    centroids, and label are the class label indec for each
-    frame (ie row) of data"""
-
-    data    = N.atleast_2d(data)
-    init    = N.atleast_2d(init)
-
-    (n, d)  = data.shape
-    (k, d1) = init.shape
-
-    if not d == d1:
-        msg = "data and init centers do not have same dimensions..."
-        raise GmmParamError(msg)
-    
-    code    = N.asarray(init.copy(), float)
-    for i in range(iter):
-        # Compute the nearest neighbour for each obs
-        # using the current code book
-        label   = c_gmm._vq(data, code)
-        # Update the code by computing centroids using the new code book
-        for j in range(k):
-            code[j,:] = N.mean(data[N.where(label==j)]) 
-
-    return code, label
-
-def _vq(data, code):
-    """ Please do not use directly. Use kmean instead"""
-    # No attempt to be efficient has been made...
-    (n, d)  = data.shape
-    (k, d)  = code.shape
-
-    label   = N.zeros(n, int)
-    for i in range(n):
-        d           = N.sum((data[i, :] - code) ** 2, 1)
-        label[i]    = N.argmin(d)
-
-    return label
-    
-# Test functions usable for now
-def test_kmean():
-    X   = N.array([[3.0, 3], [4, 3], [4, 2],
-            [9, 2], [5, 1], [6, 2], [9, 4], 
-            [5, 2], [5, 4], [7, 4], [6, 5]])
-
-    initc   = N.concatenate(([[X[0]], [X[1]], [X[2]]])) 
-
-    codet1  = N.array([[3.0000, 3.0000],
-            [6.2000, 4.0000], 
-            [5.8000, 1.8000]])
-            
-    codet2  = N.array([[11.0/3, 8.0/3], 
-            [6.7500, 4.2500],
-            [6.2500, 1.7500]])
-
-    code    = initc.copy()
-
-    code1   = kmean(X, code, 1)[0]
-    code2   = kmean(X, code, 2)[0]
-
-    import numpy.testing as testing
-    try:
-        testing.assert_array_almost_equal(code1, codet1)
-        print "kmean test 1 succeded"
-    except AssertionError:
-        print "kmean test 1 failed"
-
-    try:
-        testing.assert_array_almost_equal(code2, codet2)
-        print "kmean test 2 succeded"
-    except AssertionError:
-        print "kmean test 2 failed"
-
-# Test functions (unusable by other people for now....)
-def test_gmm_den():
-    """"""
-    import tables
-    
-    filename    = 'gmm_test.h5'
-    
-    k   = 3
-    d   = 2
-    mode        = 'full'
-    w, mu, va   = gen_gmm_param(d, k, mode, 2)
-    X           = gen_gmm(w, mu, va, 1e3)
-
-    h5file      = tables.openFile(filename, "w")
-
-    h5file.createArray(h5file.root, 'mu', mu)
-    h5file.createArray(h5file.root, 'va', va)
-    h5file.createArray(h5file.root, 'X', X)
-
-    Y1 =  densities.gauss_den(X, mu[0,:], va[0:2,:])
-    Y2 =  densities.gauss_den(X, mu[1,:], va[2:4,:])
-    Y3 =  densities.gauss_den(X, mu[2,:], va[4:6,:])
-
-    Y  =  multiple_gauss_den(X, mu, va)
-
-    h5file.createArray(h5file.root, 'Y', Y)
-    h5file.createArray(h5file.root, 'Y2', Y2)
-    h5file.createArray(h5file.root, 'Y1', Y1)
-    h5file.createArray(h5file.root, 'Y3', Y3)
-
-    h5file.close()
-
-def test_gmm_em():
-    """"""
-    import numpy as N
-    import tables
-    import pylab as P
-    filename    = 'data.h5'
-    
-    k               = 3
-    d               = 2
-    mode            = 'full'
-    wr, mur, var    = gen_gmm_param(d, k, mode, 2)
-    X               = gen_gmm(wr, mur, var, 1e3)
-
-    h5file      = tables.openFile(filename, "w")
-
-    h5file.createArray(h5file.root, 'wr', wr)
-    h5file.createArray(h5file.root, 'mur', mur)
-    h5file.createArray(h5file.root, 'var', var)
-    h5file.createArray(h5file.root, 'X', X)
-
-    P.plot(X[:, 0], X[:, 1], '.')
-    Xe, Ye  = gmm_ellipses(mur, var, npoints = 100)
-    for i in range(k):
-        P.plot(Xe[i], Ye[i], 'g')
-
-    # init EM: simply use kmean
-    initc   = X[0:k, :]
-    (code, label)   = kmean(X, initc, 10)
-
-    h5file.createArray(h5file.root, 'initc', initc)
-
-    w0      = N.ones(k, float) / k
-    mu0     = code.copy()
-    if mode == 'diag':
-        va0     = N.zeros((k, d), float)
-        for i in range(k):
-            for j in range(d):
-                va0[i,j] = N.cov(X[N.where(label==i), j], rowvar = 0)
-    elif mode == 'full':
-        va0     = N.zeros((k*d, d), float)
-        for i in range(k):
-            va0[i*d:i*d+d,:] = N.cov(X[N.where(label==i)], rowvar = 0)
-
-    h5file.createArray(h5file.root, 'w0', w0)
-    h5file.createArray(h5file.root, 'mu0', mu0)
-    h5file.createArray(h5file.root, 'va0', va0)
-
-    
-    # # Use random values
-    # w0  = N.ones(k, float) / k
-    # mu0 = N.randn(k, d)
-    # va0 = N.fabs(N.randn(k, d))
-
-    w   = w0.copy()
-    mu  = mu0.copy()
-    va  = va0.copy()
-
-    X0e, Y0e  = gmm_ellipses(mu, va, npoints = 100)
-    for i in range(k):
-        P.plot(X0e[i], Y0e[i], 'k')
-
-    niter   = 10
-    like    = N.zeros(niter, float)
-
-    for i in range(niter):
-        g, tgd  = gmm_posterior(X, w, mu, va)
-        #h5file.createArray(h5file.root, 'g', g)
-        #h5file.createArray(h5file.root, 'tgd', tgd)
-        like[i] = N.sum(N.log(N.sum(tgd, 1)))
-        print like[i]
-        w, mu, va    = gmm_update(X, g, d, k, mode)
-
-    h5file.createArray(h5file.root, 'w', w)
-    h5file.createArray(h5file.root, 'mu', mu)
-    h5file.createArray(h5file.root, 'va', va)
-
-    X0e, Y0e  = gmm_ellipses(mu, va, npoints = 100)
-    for i in range(k):
-        P.plot(X0e[i], Y0e[i], 'r')
-    P.figure()
-    P.plot(like)
-    h5file.close()
-    P.show()
-
-def _bench1():
-    #===========================================
-    # Diag GMM with 5 components of dimension 20
-    #===========================================
-    k       = 5
-    d       = 20
-    mode    = 'diag'
-
-    print "Generating the mixture"
-    # Generate a model with k components, d dimensions
-    wr, mur, var    = gen_gmm_param(d, k, mode, 3)
-    X               = gen_gmm(wr, mur, var, 1e4)
-
-    print "Init the mixture"
-    # Init the mixture with kmean
-    w0, mu0, va0    = gmm_init_kmean(X, k, mode, niter = 5)
-    
-    # # Use random values instead of kmean
-    # w0  = N.ones(k, float) / k
-    # mu0 = N.randn(k, d)
-    # va0 = N.fabs(N.randn(k, d))
-
-    print "EM computing..."
-    # Copy the initial values because we want to draw them later...
-    w   = w0.copy()
-    mu  = mu0.copy()
-    va  = va0.copy()
-
-    # The actual EM, with likelihood computation
-    niter   = 10
-    like    = N.zeros(niter, float)
-
-    for i in range(niter):
-        print "post"
-        g, tgd  = gmm_posterior(X, w, mu, va)
-        like[i] = N.sum(N.log(N.sum(tgd, 1)))
-        print "update"
-        w, mu, va   = gmm_update(X, g, d, k, mode)
-
-
-def benchmark():
-    import profile
-    profile.run('_bench1()', 'bench1prof')
-
-if __name__ == "__main__":
-    #=============================
-    # Simple GMM with 3 components
-    #=============================
-    import pylab as P
-    k       = 4
-    d       = 2
-    mode    = 'diag'
-
-    print "Generating the mixture"
-    # Generate a model with k components, d dimensions
-    wr, mur, var    = gen_gmm_param(d, k, mode, 3)
-    X               = gen_gmm(wr, mur, var, 1e3)
-
-    print "Init the mixture"
-    # Init the mixture with kmean
-    w0, mu0, va0    = gmm_init_kmean(X, k, mode, niter = 5)
-    
-    # # Use random values instead of kmean
-    # w0  = N.ones(k, float) / k
-    # mu0 = N.randn(k, d)
-    # va0 = N.fabs(N.randn(k, d))
-
-    # Copy the initial values because we want to draw them later...
-    w   = w0.copy()
-    mu  = mu0.copy()
-    va  = va0.copy()
-
-    # The actual EM, with likelihood computation
-    niter   = 10
-    like    = N.zeros(niter, float)
-
-    print "computing..."
-    for i in range(niter):
-        print "post"
-        g, tgd  = gmm_posterior(X, w, mu, va)
-        like[i] = N.sum(N.log(N.sum(tgd, 1)))
-        print "update"
-        w, mu, va   = gmm_update(X, g, d, k, mode)
-
-    print "drawing..."
-    # Draw what is happening
-    P.subplot(2, 1, 1)
-    P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
-
-    # Real confidence ellipses
-    Xre, Yre  = gmm_ellipses(mur, var)
-    P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
-    for i in range(1,k):
-        P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
-
-    # Initial confidence ellipses as found by kmean
-    X0e, Y0e  = gmm_ellipses(mu0, va0)
-    P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
-    for i in range(1,k):
-        P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
-
-    # Values found by EM
-    Xe, Ye  = gmm_ellipses(mu, va)
-    P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
-    for i in range(1,k):
-        P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
-    P.legend(loc = 0)
-    P.subplot(2, 1, 2)
-    P.plot(like)
-    P.title('log likelihood')
-
-    # # Export the figure
-    # F   = P.gcf()
-    # DPI = F.get_dpi()
-    # DefaultSize = F.get_size_inches()
-    # # the default is 100dpi for savefig:
-    # F.savefig("example1.png")
-
-    # # Now make the image twice as big, while keeping the fonts and all the
-    # # same size
-    # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
-    # Size = F.get_size_inches()
-    # print "Size in Inches", Size
-    # F.savefig("example2.png")
-    P.show()

Added: trunk/Lib/sandbox/pyem/pyem/__init__.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/__init__.py	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/__init__.py	2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,4 @@
+#! /usr/bin/env python
+# Last Change: Wed Jul 12 12:00 PM 2006 J
+
+version = '0.2'

Added: trunk/Lib/sandbox/pyem/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/densities.py	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/densities.py	2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,373 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Fri Jun 30 06:00 PM 2006 J
+
+import numpy as N
+import numpy.linalg as lin
+
+# Error classes
+class DenError(Exception):
+    """Base class for exceptions in this module.
+    
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error"""
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False):
+    """ Compute multivariate Gaussian density at points x for 
+    mean mu and variance va.
+    
+    Vector are row vectors, except va which can be a matrix
+    (row vector variance for diagonal variance)
+    
+    If log is True, than the log density is returned 
+    (useful for underflow ?)"""
+    mu  = N.atleast_2d(mu)
+    va  = N.atleast_2d(va)
+    x   = N.atleast_2d(x)
+    
+    #=======================#
+    # Checking parameters   #
+    #=======================#
+    if len(N.shape(mu)) != 2:
+        raise DenError("mu is not rank 2")
+        
+    if len(N.shape(va)) != 2:
+        raise DenError("va is not rank 2")
+        
+    if len(N.shape(x)) != 2:
+        raise DenError("x is not rank 2")
+        
+    (n, d)      = x.shape
+    (dm0, dm1)  = mu.shape
+    (dv0, dv1)  = va.shape
+    
+    # Check x and mu same dimension
+    if dm0 != 1:
+        msg = "mean must be a row vector!"
+        raise DenError(msg)
+    if dm1 != d:
+        msg = "x and mu not same dim"
+        raise DenError(msg)
+    # Check va and mu same size
+    if dv1 != d:
+        msg = "mu and va not same dim"
+        raise DenError(msg)
+    if dv0 != 1 and dv0 != d:
+        msg = "va not square"
+        raise DenError(msg)
+
+    #===============#
+    # Computation   #
+    #===============#
+    data    = x - mu
+
+    if d == 1:
+        # scalar case
+        return _scalar_gauss_den(data[:, 0], mu[0, 0], va[0, 0], log)
+    elif dv0 == 1:
+        # Diagonal matrix case
+        return _diag_gauss_den(data, mu, va, log)
+    elif dv1 == dv0:
+        # full case
+        return  _full_gauss_den(data, mu, va, log)
+    else:
+        raise DenError("variance mode not recognized, this is a bug")
+
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in scalar case. It assumes all args
+    are conformant, so it should not be used directly
+    
+    ** Expect centered data (ie with mean removed) **
+
+    Call gauss_den instead"""
+    d       = mu.size
+    inva    = 1/va
+    fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+    y       = (x ** 2) * -0.5 * inva
+    if not log:
+        y   = fac * N.exp(y)
+    else:
+        y   = y + log(fac)
+
+    return y
+    
+def _diag_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in scalar case. It assumes all args
+    are conformant, so it should not be used directly
+    
+    ** Expect centered data (ie with mean removed) **
+
+    Call gauss_den instead"""
+    # Diagonal matrix case
+    d   = mu.size
+    y   = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
+    if not log:
+        for i in range(1, d):
+            y    *=  _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+    else:
+        for i in range(1, d):
+            y    +=  _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+    return y
+
+def _full_gauss_den(x, mu, va, log):
+    """ This function is the actual implementation
+    of gaussian pdf in full matrix case. 
+    
+    It assumes all args are conformant, so it should 
+    not be used directly Call gauss_den instead
+    
+    ** Expect centered data (ie with mean removed) **
+
+    Does not check if va is definite positive (on inversible 
+    for that matter), so the inverse computation and/or determinant
+    would throw an exception."""
+    d       = mu.size
+    inva    = lin.inv(va)
+    fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+    # # Slow version
+    # n       = N.size(x, 0)
+    # y       = N.zeros(n, float)
+    # for i in range(n):
+    #     y[i] = N.matrixmultiply(x[i,:],
+    #              N.matrixmultiply(inva, N.transpose(x[i,:])))
+    # y *= -0.5
+
+    # we are using a trick with sum to "emulate" 
+    # the matrix multiplication inva * x without any explicit loop
+    y   = N.matrixmultiply(x, inva)
+    y   = -0.5 * N.sum(y * x, 1)
+
+    if not log:
+        y   = fac * N.exp(y)
+    else:
+        y   = y + N.log(fac)
+ 
+    return y
+
+# To plot a confidence ellipse from multi-variate gaussian pdf
+def gauss_ell(mu, va, dim = [0, 1], npoints = 100):
+    """ Given a mean and covariance for multi-variate
+    gaussian, returns npoints points for the ellipse
+    of confidence 0.39
+    
+    Returns the coordinate x and y of the ellipse"""
+    
+    # TODO: Get a confidence interval using the Chi2 distribution
+    # of points at a given mahalanobis distance...
+    mu      = N.atleast_1d(mu)
+    va      = N.atleast_1d(va)
+    c       = N.array(dim)
+
+    if mu.size == va.size:
+        mode    = 'diag'
+    else:
+        if va.ndim == 2:
+            if va.shape[0] == va.shape[1]:
+                mode    = 'full'
+            else:
+                raise DenError("variance not square")
+        else:
+            raise DenError("mean and variance are not dim conformant")
+
+    level   = 0.39
+    
+    # Generates a circle of npoints
+    theta   = N.linspace(0, 2 * N.pi, npoints)
+    circle  = N.array([N.cos(theta), N.sin(theta)])
+
+    # Get the dimension which we are interested in:
+    mu  = mu[dim]
+    if mode == 'diag':
+        va      = va[dim]
+        elps    = N.outerproduct(mu, N.ones(npoints, float))
+        elps    += N.matrixmultiply(N.diag(N.sqrt(va)), circle)
+    elif mode == 'full':
+        va  = va[c,:][:,c]
+        # Method: compute the cholesky decomp of each cov matrix, that is
+        # compute cova such as va = cova * cova' 
+        # WARN: scipy is different than matlab here, as scipy computes a lower
+        # triangular cholesky decomp: 
+        #   - va = cova * cova' (scipy)
+        #   - va = cova' * cova (matlab)
+        # So take care when comparing results with matlab !
+        cova    = lin.cholesky(va)
+        elps    = N.outerproduct(mu, N.ones(npoints, float))
+        elps    += N.matrixmultiply(cova, circle)
+    else:
+        raise DenParam("var mode not recognized")
+
+    return elps[0, :], elps[1, :]
+
+def test_gauss_den():
+    """"""
+    # import tables
+    # import numpy as N
+    # 
+    # filename    = 'dendata.h5'
+
+    # # # Dimension 1
+    # # d   = 1
+    # # mu  = 1.0
+    # # va  = 2.0
+
+    # # X   = N.randn(1e3, 1)
+
+    # # Y   = gauss_den(X, mu, va)
+
+    # # h5file      = tables.openFile(filename, "w")
+
+    # # h5file.createArray(h5file.root, 'X', X)
+    # # h5file.createArray(h5file.root, 'mu', mu)
+    # # h5file.createArray(h5file.root, 'va', va)
+    # # h5file.createArray(h5file.root, 'Y', Y)
+
+    # # h5file.close()
+
+    # # # Dimension 2, diag
+    # # d   = 2
+    # # mu  = N.array([1.0, -2.0])
+    # # va  = N.array([1.0, 2.0])
+
+    # # X   = N.randn(1e3, 2)
+
+    # # Y   = gauss_den(X, mu, va)
+
+    # # h5file      = tables.openFile(filename, "w")
+
+    # # h5file.createArray(h5file.root, 'X', X)
+    # # h5file.createArray(h5file.root, 'mu', mu)
+    # # h5file.createArray(h5file.root, 'va', va)
+    # # h5file.createArray(h5file.root, 'Y', Y)
+
+    # # Dimension 2, full
+    # d   = 2
+    # mu  = N.array([[0.2, -1.0]])
+    # va  = N.array([[1.2, 0.1], [0.1, 0.5]])
+
+    # X   = N.randn(1e3, 2)
+
+    # Y   = gauss_den(X, mu, va)
+
+    # h5file      = tables.openFile(filename, "w")
+
+    # h5file.createArray(h5file.root, 'X', X)
+    # h5file.createArray(h5file.root, 'mu', mu)
+    # h5file.createArray(h5file.root, 'va', va)
+    # h5file.createArray(h5file.root, 'Y', Y)
+
+    # h5file.close()
+
+    import numpy.testing as testing
+    #=================
+    # Small test in 1d
+    #=================
+    va  = 2.0
+    mu  = 1.0
+    X   = N.linspace(-2, 2, 10)[:, N.NewAxis]
+
+    Yt  = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945, 
+            0.14086453882683,
+            0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
+            0.26114678964743, 0.21969564473386])
+
+    Y   = gauss_den(X, mu, va)
+    try:
+        testing.assert_array_almost_equal(Y, Yt)
+        print "1d test succeded"
+    except AssertionError:
+        print "test fails in 1d"
+
+    #============================
+    # Small test in 2d (diagonal)
+    #============================
+    mu  = N.atleast_2d([-1.0, 2.0])
+    va  = N.atleast_2d([2.0, 3.0])
+    X1  = N.linspace(-2, 2, 10)[:, N.NewAxis]
+    X2  = N.linspace(-1, 3, 10)[:, N.NewAxis]
+    X   = N.concatenate(([X1, X2]), 1)
+    
+    Yt  = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786, 
+            0.03977576221540, 0.04354490552910, 0.04043592581117, 
+            0.03184994053539, 0.02127948225225, 0.01205937178755, 
+            0.00579694938623 ])
+
+    Y   = gauss_den(X, mu, va)
+    try:
+        testing.assert_array_almost_equal(Y, Yt)
+        print "2d diag test succeded"
+    except AssertionError:
+        print "test fails in 2d diag"
+
+    #============================
+    # Small test in 2d (full mat)
+    #============================
+    mu  = N.array([[0.2, -1.0]])
+    va  = N.array([[1.2, 0.1], [0.1, 0.5]])
+    X1  = N.linspace(-2, 2, 10)[:, N.NewAxis]
+    X2  = N.linspace(-3, 3, 10)[:, N.NewAxis]
+    X   = N.concatenate(([X1, X2]), 1)
+    
+    Yt  = N.array([0.00096157109751, 0.01368908714856,
+        0.07380823191162, 0.15072050533842, 
+        0.11656739937861, 0.03414436965525,
+        0.00378789836599, 0.00015915297541, 
+        0.00000253261067, 0.00000001526368])
+
+    Y   = gauss_den(X, mu, va)
+    try:
+        testing.assert_array_almost_equal(Y, Yt)
+        print "2d full test succeded"
+    except AssertionError:
+        print "test fails in 2d full"
+         
+
+if __name__ == "__main__":
+    import pylab
+
+    #=========================================
+    # Test plotting a simple diag 2d variance:
+    #=========================================
+    va  = N.array([5, 3])
+    mu  = N.array([2, 3])
+
+    # Generate a multivariate gaussian of mean mu and covariance va
+    X       = N.randn(1e3, 2)
+    Yc      = N.matrixmultiply(N.diag(N.sqrt(va)), X.transpose())
+    Yc      = Yc.transpose() + mu
+
+    # Plotting
+    Xe, Ye  = gauss_ell(mu, va, npoints = 100)
+    pylab.figure()
+    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+    pylab.plot(Xe, Ye, 'r')
+
+    #=========================================
+    # Test plotting a simple full 2d variance:
+    #=========================================
+    va  = N.array([[0.2, 0.1],[0.1, 0.5]])
+    mu  = N.array([0, 3])
+
+    # Generate a multivariate gaussian of mean mu and covariance va
+    X       = N.randn(1e3, 2)
+    Yc      = N.matrixmultiply(lin.cholesky(va), X.transpose())
+    Yc      = Yc.transpose() + mu
+
+    # Plotting
+    Xe, Ye  = gauss_ell(mu, va, npoints = 100)
+    pylab.figure()
+    pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+    pylab.plot(Xe, Ye, 'r')
+    pylab.show()

Added: trunk/Lib/sandbox/pyem/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gmm_em.py	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/gmm_em.py	2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,759 @@
+# /usr/bin/python
+# Last Change: Fri Jun 30 07:00 PM 2006 J
+
+# TODO:
+#   - interface with libem
+#   - implements E and M in batch mode for mixtures, with special case for gmm 
+#   ( general EM should be outside this module. This one should be GMM specific, maybe)
+#   - implements E and M in online mode
+#   - python has list. Does it make sense to keep the h2m format for mean, va, when
+#   we can use list instead ?
+#   - a Gaussian Mixture Model should be a class, really !
+
+import numpy as N
+import numpy.linalg as lin
+import densities
+
+MAX_DEV = 1e-10
+
+# Error classes
+class GmmError(Exception):
+    """Base class for exceptions in this module."""
+    pass
+
+class GmmParamError(GmmError):
+    """Exception raised for errors in gmm params
+
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error
+    """
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+# Base class for mixture models: a mixture model can be initialized, 
+# have parameters, have its parameters changed, have function to returns sufficient
+# statistics, generate data from the model, have the functions relative to EM
+class Mixture:
+    def __init__(self, np, prior):
+        self.np     = np
+        self.prior  = prior
+
+    def em_post(self):
+        pass
+
+    def em_update(self):
+        pass
+
+    # Check that the current state is consistent (such as priori weighting to 1, etc...)
+    def check_state(self):
+        pass
+
+    # Return npoints points generated from the model
+    def generate(self, npoints):
+        pass
+
+# The Gaussian Mixture
+class GMM(Mixture):
+    pass
+
+# Function to generate a GMM, or valid parameters for GMM
+def gen_rand_index(p, n):
+    """Generate a N samples vector containing random index between 1 
+    and length(p), each index i with probability p(i)"""
+    # TODO Check args here
+    
+    # TODO: check each value of inverse distribution is
+    # different
+    invcdf  = N.cumsum(p)
+    uni     = N.rand(n)
+    index   = N.zeros(n)
+
+    # This one should be a bit faster
+    for k in range(len(p)-1, 0, -1):
+        blop        = N.where(N.logical_and(invcdf[k-1] <= uni, 
+                    uni < invcdf[k]))
+        index[blop] = k
+        
+    return index
+
+def gen_gmm(w, mu, va, n):
+    """Generate a gaussiam mixture model with weights w, 
+    mean mu and variances va. Each column of the parameters
+    are one component parameter.
+    """
+    # Check args
+    K, d, mode  = check_gmm_param(w, mu, va)
+
+    # Generate the mixture
+    S   = gen_rand_index(w, n)  # State index (ie hidden var)
+    X   = N.randn(n, d)         # standard gaussian
+
+    if mode == 'diag':
+        X   = mu[S, :]  + X * N.sqrt(va[S,:])
+    elif mode == 'full':
+        # Naive implementation
+        # cho = N.zeros(va.shape, float)
+        # for k in range(K):
+        #     # Using cholesky is more stable than sqrtm; sqrtm is not
+        #     # available in numpy anyway, only in scipy...
+        #     cho[k*d:k*d+d,:]    = lin.cholesky(va[k*d:k*d+d,:])
+        # for i in range(n):
+        # X[i,:]  = mu[S[i],:] + N.matrixmultiply(cho[S[i]*d:S[i]*d+d,:], X[i,:])
+        
+        # Faster:
+        cho = N.zeros((K, va.shape[1], va.shape[1]), float)
+        for k in range(K):
+            # Using cholesky is more stable than sqrtm; sqrtm is not
+            # available in numpy anyway, only in scipy...
+            cho[k]  = lin.cholesky(va[k*d:k*d+d,:])
+
+        for s in range(K):
+            tmpind      = N.where(S == s)[0]
+            X[tmpind]   = N.matrixmultiply(X[tmpind], cho[s].transpose()) + mu[s]
+    else:
+        raise GmmParamError('cov matrix mode not recognized')
+
+    return X
+
+def gen_gmm_param(d, K, varmode = 'diag', spread = 1):
+    """Generate valid parameters for a gaussian mixture model.
+    d is the dimension, K the number of components, and varmode
+    the mode for cov matrices.
+
+    Returns:    
+        - w
+        - mu
+        - va
+    """
+    w   = abs(N.randn(K))
+    w   = w / sum(w)
+
+    mu  = spread * N.randn(K, d)
+    if varmode == 'diag':
+        va  = abs(N.randn(K, d))
+    elif varmode == 'full':
+        va  = N.randn(K * d, d)
+        for k in range(K):
+            va[k*d:k*d+d]   = N.matrixmultiply( va[k*d:k*d+d], 
+                va[k*d:k*d+d].transpose())
+    else:
+        raise GmmParamError('cov matrix mode not recognized')
+
+    return w, mu, va
+
+def check_gmm_param(w, mu, va):
+    """Check that w, mu and va are valid parameters for
+    a mixture of gaussian: w should sum to 1, there should
+    be the same number of component in each param, the variances
+    should be positive definite, etc... 
+    
+    Params:
+        w   = vector or list of weigths of the mixture (K elements)
+        mu  = matrix: K * d
+        va  = list of variances (vector K * d or square matrices Kd * d)
+
+    returns:
+        K   = number of components
+        d   = dimension
+        mode    = 'diag' if diagonal covariance, 'full' of full matrices
+    """
+        
+    # Check that w is valid
+    if N.fabs(N.sum(w)  - 1) > MAX_DEV:
+        raise GmmParamError('weight does not sum to 1')
+    
+    if not len(w.shape) == 1:
+        raise GmmParamError('weight is not a vector')
+
+    # Check that mean and va have the same number of components
+    K           = len(w)
+
+    if N.ndim(mu) < 2:
+        msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
+        raise GmmParamError(msg)
+    if N.ndim(va) < 2:
+        msg = """va should be a K,d / K *d, d matrix, and a row vector if
+        only 1 diag comp"""
+        raise GmmParamError(msg)
+
+    (Km, d)     = mu.shape
+    (Ka, da)    = va.shape
+
+    if not K == Km:
+        msg = "not same number of component in mean and weights"
+        raise GmmParamError(msg)
+
+    if not d == da:
+        msg = "not same number of dimensions in mean and variances"
+        raise GmmParamError(msg)
+
+    if Km == Ka:
+        mode = 'diag'
+    else:
+        mode = 'full'
+        if not Ka == Km*d:
+            msg = "not same number of dimensions in mean and variances"
+            raise GmmParamError(msg)
+        
+    return K, d, mode
+        
+# For EM on GMM
+def multiple_gauss_den(data, mu, va):
+    """Helper function to generate several Gaussian
+    pdf (different parameters) from the same data"""
+    mu  = N.atleast_2d(mu)
+    va  = N.atleast_2d(va)
+
+    K   = mu.shape[0]
+    n   = data.shape[0]
+    d   = data.shape[1]
+    
+    y   = N.zeros((n, K), float)
+    if mu.size == va.size:
+        for i in range(K):
+            y[:, i] = densities.gauss_den(data, mu[i, :], va[i, :])
+    else:
+        for i in range(K):
+            y[:, i] = densities.gauss_den(data, mu[i, :], 
+                        va[d*i:d*i+d, :])
+
+    return y
+
+def gmm_init_kmean(data, k, mode, init = [], niter = 10):
+    """gmm_init_kmean(data, k, mode, init = [], niter = 10)
+    
+    Init EM for GMM with kmean from data, for k components. 
+    
+    Args:
+        - data:     Each row of data is one frame of dimension d. 
+        - k:        Number of components to look for
+        - mode:     Diagonal or Full covariance matrices
+        - init:     The initial centroids. The value given for k is
+        ignored, and the number of row in initc is used instead. 
+        If initc is not given, then the centroids are initialized 
+        with the k first values of data.
+        - niter:    Number of iterations in kmean.
+    
+    Returns:
+        (w, mu, va), initial parameters for a GMM.
+
+    Method:
+        Each weight is equiprobable, each mean is one centroid returned by kmean, and
+    covariances for component i is initialized with covariance of 
+    data corresponding with label i. Other strategies are possible, this one
+    is an easy one"""
+    if len(init) == 0:
+        init   = data[0:k, :]
+    else:
+        k       = initc.shape[0]
+
+    if data.ndim == 1:
+        d   = 1
+    else:
+        d   = N.shape(data)[1]
+
+    (code, label)   = kmean(data, init, niter)
+
+    w   = N.ones(k, float) / k
+    mu  = code.copy()
+    if mode == 'diag':
+        va  = N.zeros((k, d), float)
+        for i in range(k):
+            for j in range(d):
+                va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
+    elif mode == 'full':
+        va  = N.zeros((k*d, d), float)
+        for i in range(k):
+            va[i*d:i*d+d,:] = N.cov(data[N.where(label==i)], rowvar = 0)
+    else:
+        raise GmmParamError("mode " + str(mode) + " not recognized")
+
+    return w, mu, va
+
+def gmm_posterior(data, w, mu, va):
+    """ Computes the latent variable distribution (a 
+    posteriori probability) knowing the explicit data 
+    for the Gaussian model (w, mu, var): gamma(t, i) = 
+        P[state = i | observation = data(t); w, mu, va]
+
+    This is basically the E step of EM for GMM.
+   
+    the second returned value is the non normalized version 
+    of gamma, and may be needed for some computation, 
+    like eg likelihood"""
+    n   = data.shape[0]
+    K   = len(w)
+
+    # compute the gaussian pdf
+    tgd	= multiple_gauss_den(data, mu, va)
+    # multiply by the weight
+    tgd	*= w
+    # Normalize to get a pdf
+    gd	= tgd  / N.sum(tgd, axis=1)[:, N.NewAxis]
+
+    return gd, tgd
+
+# This function is just calling gmm_update and gmm_posterior, with
+# initialization. This is ugly, and we should have a class to model a GMM
+# instead of all this code to try to guess correct values and parameters...
+def gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
+    """
+    gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
+
+    Compute the parameters of a Gaussian Mixture Model using EM algorithm, 
+    with initial values w, mu and va (overwritten by the function).
+
+    Args:
+        - data:     contains the observed features, one row is one frame, ie one 
+        observation of dimension d
+        - niter:    number of iterations
+        - mode:     'diag' or 'full', depending on the wanted model for cov
+        matrices.
+        - K:        number of components
+        - w, mu, va initial parameters for the GMM. All or none must be given.
+        If no initial values are given, initialized by gmm_init_kmean; if they
+        are given, mode and k are ignored, and guessed from the given parameters
+        instead.
+
+    Returns:
+        w, mu, va, like as found by EM, where like is the likelihood for each 
+        iteration.
+    """
+    if len(w) == 0:
+        w, mu, va   = gmm_init_kmean(data, k, mode, 5)
+    else:
+        k, d, mode  = check_gmm_parm(w, mu, va)
+    
+    like    = N.zeros(niter, float)
+    for i in range(niter):
+        g, tgd  = gmm_posterior(data, w, mu, va)
+        like[i] = N.sum(N.log(N.sum(tgd, 1)))
+        w, mu, va   = gmm_update(X, g, d, k, mode)
+
+    return w, mu, va, like
+    
+def gmm_update(data, gamma, d, K, varmode):
+    """Computes update of the Gaussian Mixture Model (M step)
+    from the a posteriori pdf, computed by gmm_posterior
+    (E step).
+    """
+    n       = data.shape[0]
+    invn    = 1.0/n
+    mGamma  = N.sum(gamma)
+
+    if varmode == 'diag':
+        mu  = N.zeros((K, d), float)
+        va  = N.zeros((K, d), float)
+        for k in range(K):
+            x       = N.sum(N.outerproduct(gamma[:, k], 
+                        N.ones((1, d))) * data)
+            xx      = N.sum(N.outerproduct(gamma[:, k], 
+                        N.ones((1, d))) * (data ** 2))
+
+            mu[k,:] = x / mGamma[k]
+            va[k,:] = xx  / mGamma[k] - mu[k,:] ** 2
+        w   = invn * mGamma
+
+    elif varmode == 'full':
+        mu  = N.zeros((K, d), float)
+        va  = N.zeros((K*d, d), float)
+
+        # This is really slow because of the loop on each frame
+        # ways to improve that: either use 3d matrices and see 
+        # the memory usage increasing in n*d*d,
+        # using C (or pyrex, or ...) or using the trick commented
+        # which loops on the dimensions instead. If the number of dimensions
+        # is high, this won't help, though...
+        for k in range(K):
+            x   = N.sum(N.outerproduct(gamma[:, k], 
+                        N.ones((1, d), float)) * data)
+            xx  = N.zeros((d, d), float)
+            # for i in range(n):
+            #     xx  += gamma[i, k] * N.outerproduct(data[i,:], 
+            #                 data[i,:])
+
+            # This should be much faster
+            for i in range(d):
+                for j in range(d):
+                    xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[:,k])
+
+            mu[k,:] = x / mGamma[k]
+            va[k*d:k*d+d,:] = xx  / mGamma[k] - \
+                                N.outerproduct(mu[k,:], mu[k,:])
+        w   = invn * mGamma
+    else:
+        raise GmmParamError("varmode not recognized")
+
+    return w, mu, va
+
+# Misc functions
+def gmm_ellipses(mu, va, c = [0, 1], npoints = 100):
+    """Returns a list of ellipses describing the Gmm
+    defined by mu and va. c is the dimension we are projecting
+    the variances on a 2d space.
+    
+    Returns:
+        -Xe:    a list of x coordinates for the ellipses
+        -Ye:    a list of y coordinates for the ellipses
+
+    Example:
+        Suppose we have w, mu and va as parameters for a mixture, then:
+        
+        X       = gen_gmm(w, mu, va, 1000)
+        Xe, Ye  = gmm_ellipses(mu, va)
+        pylab.plot(X[:,0], X[:, 1], '.')
+        for k in len(w):
+            pylab.plot(Xe[k], Ye[k], 'r')
+            
+        Will plot samples X draw from the mixture model, and
+        plot the ellipses of equi-probability from the mean with
+        fixed level of confidence 0.39. 
+        
+    TODO: be able to modify the confidence interval to arbitrary
+    value (to do in gauss_ell)"""
+    K   = mu.shape[0]
+    w   = N.ones(K, float) / K
+    
+    K, d, mode  = check_gmm_param(w, mu, va)
+
+    # TODO: adjustable level (to do in gauss_ell). 
+    # For now, a level of 0.39 means that we draw
+    # ellipses for 1 standard deviation. 
+    Xe  = []
+    Ye  = []   
+    if mode == 'diag':
+        for i in range(K):
+            xe, ye  = densities.gauss_ell(mu[i,:], va[i,:], dim = c, 
+                    npoints = npoints)
+            Xe.append(xe)
+            Ye.append(ye)
+    elif mode == 'full':
+        for i in range(K):
+            xe, ye  = densities.gauss_ell(mu[i,:], va[i*d:i*d+d,:], dim = c, 
+                    npoints = npoints)
+            Xe.append(xe)
+            Ye.append(ye)
+
+    return Xe, Ye
+
+import c_gmm
+def kmean(data, init, iter = 10):
+    """Simple kmean implementation for EM
+    
+    returns a tuple (code, label), where code are the final
+    centroids, and label are the class label indec for each
+    frame (ie row) of data"""
+
+    data    = N.atleast_2d(data)
+    init    = N.atleast_2d(init)
+
+    (n, d)  = data.shape
+    (k, d1) = init.shape
+
+    if not d == d1:
+        msg = "data and init centers do not have same dimensions..."
+        raise GmmParamError(msg)
+    
+    code    = N.asarray(init.copy(), float)
+    for i in range(iter):
+        # Compute the nearest neighbour for each obs
+        # using the current code book
+        label   = c_gmm._vq(data, code)
+        # Update the code by computing centroids using the new code book
+        for j in range(k):
+            code[j,:] = N.mean(data[N.where(label==j)]) 
+
+    return code, label
+
+def _vq(data, code):
+    """ Please do not use directly. Use kmean instead"""
+    # No attempt to be efficient has been made...
+    (n, d)  = data.shape
+    (k, d)  = code.shape
+
+    label   = N.zeros(n, int)
+    for i in range(n):
+        d           = N.sum((data[i, :] - code) ** 2, 1)
+        label[i]    = N.argmin(d)
+
+    return label
+    
+# Test functions usable for now
+def test_kmean():
+    X   = N.array([[3.0, 3], [4, 3], [4, 2],
+            [9, 2], [5, 1], [6, 2], [9, 4], 
+            [5, 2], [5, 4], [7, 4], [6, 5]])
+
+    initc   = N.concatenate(([[X[0]], [X[1]], [X[2]]])) 
+
+    codet1  = N.array([[3.0000, 3.0000],
+            [6.2000, 4.0000], 
+            [5.8000, 1.8000]])
+            
+    codet2  = N.array([[11.0/3, 8.0/3], 
+            [6.7500, 4.2500],
+            [6.2500, 1.7500]])
+
+    code    = initc.copy()
+
+    code1   = kmean(X, code, 1)[0]
+    code2   = kmean(X, code, 2)[0]
+
+    import numpy.testing as testing
+    try:
+        testing.assert_array_almost_equal(code1, codet1)
+        print "kmean test 1 succeded"
+    except AssertionError:
+        print "kmean test 1 failed"
+
+    try:
+        testing.assert_array_almost_equal(code2, codet2)
+        print "kmean test 2 succeded"
+    except AssertionError:
+        print "kmean test 2 failed"
+
+# Test functions (unusable by other people for now....)
+def test_gmm_den():
+    """"""
+    import tables
+    
+    filename    = 'gmm_test.h5'
+    
+    k   = 3
+    d   = 2
+    mode        = 'full'
+    w, mu, va   = gen_gmm_param(d, k, mode, 2)
+    X           = gen_gmm(w, mu, va, 1e3)
+
+    h5file      = tables.openFile(filename, "w")
+
+    h5file.createArray(h5file.root, 'mu', mu)
+    h5file.createArray(h5file.root, 'va', va)
+    h5file.createArray(h5file.root, 'X', X)
+
+    Y1 =  densities.gauss_den(X, mu[0,:], va[0:2,:])
+    Y2 =  densities.gauss_den(X, mu[1,:], va[2:4,:])
+    Y3 =  densities.gauss_den(X, mu[2,:], va[4:6,:])
+
+    Y  =  multiple_gauss_den(X, mu, va)
+
+    h5file.createArray(h5file.root, 'Y', Y)
+    h5file.createArray(h5file.root, 'Y2', Y2)
+    h5file.createArray(h5file.root, 'Y1', Y1)
+    h5file.createArray(h5file.root, 'Y3', Y3)
+
+    h5file.close()
+
+def test_gmm_em():
+    """"""
+    import numpy as N
+    import tables
+    import pylab as P
+    filename    = 'data.h5'
+    
+    k               = 3
+    d               = 2
+    mode            = 'full'
+    wr, mur, var    = gen_gmm_param(d, k, mode, 2)
+    X               = gen_gmm(wr, mur, var, 1e3)
+
+    h5file      = tables.openFile(filename, "w")
+
+    h5file.createArray(h5file.root, 'wr', wr)
+    h5file.createArray(h5file.root, 'mur', mur)
+    h5file.createArray(h5file.root, 'var', var)
+    h5file.createArray(h5file.root, 'X', X)
+
+    P.plot(X[:, 0], X[:, 1], '.')
+    Xe, Ye  = gmm_ellipses(mur, var, npoints = 100)
+    for i in range(k):
+        P.plot(Xe[i], Ye[i], 'g')
+
+    # init EM: simply use kmean
+    initc   = X[0:k, :]
+    (code, label)   = kmean(X, initc, 10)
+
+    h5file.createArray(h5file.root, 'initc', initc)
+
+    w0      = N.ones(k, float) / k
+    mu0     = code.copy()
+    if mode == 'diag':
+        va0     = N.zeros((k, d), float)
+        for i in range(k):
+            for j in range(d):
+                va0[i,j] = N.cov(X[N.where(label==i), j], rowvar = 0)
+    elif mode == 'full':
+        va0     = N.zeros((k*d, d), float)
+        for i in range(k):
+            va0[i*d:i*d+d,:] = N.cov(X[N.where(label==i)], rowvar = 0)
+
+    h5file.createArray(h5file.root, 'w0', w0)
+    h5file.createArray(h5file.root, 'mu0', mu0)
+    h5file.createArray(h5file.root, 'va0', va0)
+
+    
+    # # Use random values
+    # w0  = N.ones(k, float) / k
+    # mu0 = N.randn(k, d)
+    # va0 = N.fabs(N.randn(k, d))
+
+    w   = w0.copy()
+    mu  = mu0.copy()
+    va  = va0.copy()
+
+    X0e, Y0e  = gmm_ellipses(mu, va, npoints = 100)
+    for i in range(k):
+        P.plot(X0e[i], Y0e[i], 'k')
+
+    niter   = 10
+    like    = N.zeros(niter, float)
+
+    for i in range(niter):
+        g, tgd  = gmm_posterior(X, w, mu, va)
+        #h5file.createArray(h5file.root, 'g', g)
+        #h5file.createArray(h5file.root, 'tgd', tgd)
+        like[i] = N.sum(N.log(N.sum(tgd, 1)))
+        print like[i]
+        w, mu, va    = gmm_update(X, g, d, k, mode)
+
+    h5file.createArray(h5file.root, 'w', w)
+    h5file.createArray(h5file.root, 'mu', mu)
+    h5file.createArray(h5file.root, 'va', va)
+
+    X0e, Y0e  = gmm_ellipses(mu, va, npoints = 100)
+    for i in range(k):
+        P.plot(X0e[i], Y0e[i], 'r')
+    P.figure()
+    P.plot(like)
+    h5file.close()
+    P.show()
+
+def _bench1():
+    #===========================================
+    # Diag GMM with 5 components of dimension 20
+    #===========================================
+    k       = 5
+    d       = 20
+    mode    = 'diag'
+
+    print "Generating the mixture"
+    # Generate a model with k components, d dimensions
+    wr, mur, var    = gen_gmm_param(d, k, mode, 3)
+    X               = gen_gmm(wr, mur, var, 1e4)
+
+    print "Init the mixture"
+    # Init the mixture with kmean
+    w0, mu0, va0    = gmm_init_kmean(X, k, mode, niter = 5)
+    
+    # # Use random values instead of kmean
+    # w0  = N.ones(k, float) / k
+    # mu0 = N.randn(k, d)
+    # va0 = N.fabs(N.randn(k, d))
+
+    print "EM computing..."
+    # Copy the initial values because we want to draw them later...
+    w   = w0.copy()
+    mu  = mu0.copy()
+    va  = va0.copy()
+
+    # The actual EM, with likelihood computation
+    niter   = 10
+    like    = N.zeros(niter, float)
+
+    for i in range(niter):
+        print "post"
+        g, tgd  = gmm_posterior(X, w, mu, va)
+        like[i] = N.sum(N.log(N.sum(tgd, 1)))
+        print "update"
+        w, mu, va   = gmm_update(X, g, d, k, mode)
+
+
+def benchmark():
+    import profile
+    profile.run('_bench1()', 'bench1prof')
+
+if __name__ == "__main__":
+    #=============================
+    # Simple GMM with 3 components
+    #=============================
+    import pylab as P
+    k       = 4
+    d       = 2
+    mode    = 'diag'
+
+    print "Generating the mixture"
+    # Generate a model with k components, d dimensions
+    wr, mur, var    = gen_gmm_param(d, k, mode, 3)
+    X               = gen_gmm(wr, mur, var, 1e3)
+
+    print "Init the mixture"
+    # Init the mixture with kmean
+    w0, mu0, va0    = gmm_init_kmean(X, k, mode, niter = 5)
+    
+    # # Use random values instead of kmean
+    # w0  = N.ones(k, float) / k
+    # mu0 = N.randn(k, d)
+    # va0 = N.fabs(N.randn(k, d))
+
+    # Copy the initial values because we want to draw them later...
+    w   = w0.copy()
+    mu  = mu0.copy()
+    va  = va0.copy()
+
+    # The actual EM, with likelihood computation
+    niter   = 10
+    like    = N.zeros(niter, float)
+
+    print "computing..."
+    for i in range(niter):
+        print "post"
+        g, tgd  = gmm_posterior(X, w, mu, va)
+        like[i] = N.sum(N.log(N.sum(tgd, 1)))
+        print "update"
+        w, mu, va   = gmm_update(X, g, d, k, mode)
+
+    print "drawing..."
+    # Draw what is happening
+    P.subplot(2, 1, 1)
+    P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
+
+    # Real confidence ellipses
+    Xre, Yre  = gmm_ellipses(mur, var)
+    P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
+    for i in range(1,k):
+        P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+
+    # Initial confidence ellipses as found by kmean
+    X0e, Y0e  = gmm_ellipses(mu0, va0)
+    P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
+    for i in range(1,k):
+        P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
+
+    # Values found by EM
+    Xe, Ye  = gmm_ellipses(mu, va)
+    P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
+    for i in range(1,k):
+        P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
+    P.legend(loc = 0)
+    P.subplot(2, 1, 2)
+    P.plot(like)
+    P.title('log likelihood')
+
+    # # Export the figure
+    # F   = P.gcf()
+    # DPI = F.get_dpi()
+    # DefaultSize = F.get_size_inches()
+    # # the default is 100dpi for savefig:
+    # F.savefig("example1.png")
+
+    # # Now make the image twice as big, while keeping the fonts and all the
+    # # same size
+    # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
+    # Size = F.get_size_inches()
+    # print "Size in Inches", Size
+    # F.savefig("example2.png")
+    P.show()

Added: trunk/Lib/sandbox/pyem/pyem/src/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/Makefile	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/src/Makefile	2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,26 @@
+CC	= colorgcc
+LD	= gcc
+
+PYREX	= python2.4-pyrexc
+
+PYTHONINC	= -I/usr/include/python2.4
+NUMPYINC	= -I/usr/lib/python2.4/site-packages/numpy/core/include
+OPTIMS		= -O3 -ffast-math -march=pentium4
+WARN		= -W -Wall
+
+CFLAGS	= $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
+
+c_gmm.so: c_gmm.o
+	$(LD) -shared -o $@ $< -Wl,-soname,$@
+
+c_gmm.o: c_gmm.c
+	$(CC) -c $(CFLAGS) -fPIC $<
+
+c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
+	$(PYREX) $<
+
+clean:
+	rm -f c_gmm.c
+	rm -f *.o
+	rm -f *.so
+	rm -f *.pyc

Added: trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx	2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,66 @@
+# Last Change: Tue May 16 11:00 AM 2006 J
+
+cimport c_numpy
+cimport c_python
+import numpy
+
+c_numpy.import_array()
+
+# pyrex version of _vq. Much faster in high dimension/high number of K 
+# (ie more than 3-4)
+def _vq(data, init):
+    (n, d)  = data.shape
+    label   = numpy.zeros(n, int)
+    _imp_vq(data, init, label)
+
+    return label
+
+def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
+    cdef int n
+    cdef int d
+    cdef int nc
+    cdef int i
+    cdef int j
+    cdef int k
+    cdef int *labeld
+    cdef double *da, *code
+    cdef double dist
+    cdef double acc
+
+    n   = data.dimensions[0]
+    d   = data.dimensions[1]
+    nc  = init.dimensions[0]
+
+    if not data.dtype == numpy.dtype(numpy.float64):
+        print '_vq not (yet) implemented for dtype %s'%dtype.name
+        return
+    da  = (<double*>data.data)
+
+    if not init.dtype == numpy.dtype(numpy.float64):
+        print '_vq not (yet) implemented for dtype %s'%dtype.name
+        return
+    code    = (<double*>init.data)
+
+    if not label.dtype == numpy.dtype(numpy.int32):
+        print '_vq not (yet) implemented for dtype %s'%dtype.name
+        return
+    labeld  = (<int*>label.data)
+
+    for i from 0<=i<n:
+        acc = 0
+        lab = 0
+        for j from 0<=j<d:
+            acc = acc + (da[i * d + j] - code[j]) * \
+                (da[i * d + j] - code[j])
+        dist    = acc
+        for k from 1<=k<nc:
+            acc     = 0
+            for j from 0<=j<d:
+                acc = acc + (da[i * d + j] - code[k * d + j]) * \
+                    (da[i * d + j] - code[k * d + j])
+            if acc < dist:
+                dist    = acc
+                lab     = k
+        labeld[i]   = lab
+
+    return lab

Added: trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd	2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,59 @@
+# :Author:    Robert Kern
+# :Copyright: 2004, Enthought, Inc.
+# :License:   BSD Style
+
+
+cdef extern from "numpy/arrayobject.h":
+    ctypedef enum PyArray_TYPES:
+        PyArray_BOOL
+        PyArray_BYTE
+        PyArray_UBYTE
+        PyArray_SHORT
+        PyArray_USHORT 
+        PyArray_INT
+        PyArray_UINT 
+        PyArray_LONG
+        PyArray_ULONG
+        PyArray_LONGLONG
+        PyArray_ULONGLONG
+        PyArray_FLOAT
+        PyArray_DOUBLE 
+        PyArray_LONGDOUBLE
+        PyArray_CFLOAT
+        PyArray_CDOUBLE
+        PyArray_CLONGDOUBLE
+        PyArray_OBJECT
+        PyArray_STRING
+        PyArray_UNICODE
+        PyArray_VOID
+        PyArray_NTYPES
+        PyArray_NOTYPE
+
+    ctypedef int intp 
+
+    ctypedef extern class numpy.dtype [object PyArray_Descr]:
+        cdef int type_num, elsize, alignment
+        cdef char type, kind, byteorder, hasobject
+        cdef object fields, typeobj
+
+    ctypedef extern class numpy.ndarray [object PyArrayObject]:
+        cdef char *data
+        cdef int nd
+        cdef intp *dimensions
+        cdef intp *strides
+        cdef object base
+        cdef dtype descr
+        cdef int flags
+
+    ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
+    int PyArray_Check(object obj)
+    ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type, 
+        int mindim, int maxdim)
+    intp PyArray_SIZE(ndarray arr)
+    void *PyArray_DATA(ndarray arr)
+    ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
+		    int requirements, object context)
+    ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
+		    intp* strides, void* data, int flags, object parent)
+
+    void import_array()

Added: trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd	2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd	2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,20 @@
+# -*- Mode: Python -*-  Not really, but close enough
+
+# Expose as much of the Python C API as we need here
+
+cdef extern from "stdlib.h":
+    ctypedef int size_t
+
+cdef extern from "Python.h":
+    ctypedef int Py_intptr_t
+    void*  PyMem_Malloc(size_t)
+    void*  PyMem_Realloc(void *p, size_t n)
+    void   PyMem_Free(void *p)
+    char*  PyString_AsString(object string)
+    object PyString_FromString(char *v)
+    object PyString_InternFromString(char *v)
+    int    PyErr_CheckSignals()
+    object PyFloat_FromDouble(double v)
+    void   Py_XINCREF(object o)
+    void   Py_XDECREF(object o)
+    void   Py_CLEAR(object o) # use instead of decref




More information about the Scipy-svn mailing list