[Scipy-svn] r2267 - in trunk/Lib/sandbox/pyem: . pyem pyem/src
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Oct 12 09:45:23 EDT 2006
Author: cdavid
Date: 2006-10-12 08:45:14 -0500 (Thu, 12 Oct 2006)
New Revision: 2267
Added:
trunk/Lib/sandbox/pyem/TODO
trunk/Lib/sandbox/pyem/pyem/
trunk/Lib/sandbox/pyem/pyem/__init__.py
trunk/Lib/sandbox/pyem/pyem/densities.py
trunk/Lib/sandbox/pyem/pyem/gmm_em.py
trunk/Lib/sandbox/pyem/pyem/src/
trunk/Lib/sandbox/pyem/pyem/src/Makefile
trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd
trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd
Removed:
trunk/Lib/sandbox/pyem/Makefile
trunk/Lib/sandbox/pyem/c_gmm.pyx
trunk/Lib/sandbox/pyem/c_numpy.pxd
trunk/Lib/sandbox/pyem/c_python.pxd
trunk/Lib/sandbox/pyem/densities.py
trunk/Lib/sandbox/pyem/gmm_em.py
Log:
[pyem @ david at ar.media.kyoto-u.ac.jp-20060712045202-8dd4340c31b48cd7]
Put files into a package
David Cournapeau <david at ar.media.kyoto-u.ac.jp> | 2006-07-12 13:52:02 +0900 (Wed, 12 Jul 2006)
Deleted: trunk/Lib/sandbox/pyem/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/Makefile 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/Makefile 2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,26 +0,0 @@
-CC = colorgcc
-LD = gcc
-
-PYREX = python2.4-pyrexc
-
-PYTHONINC = -I/usr/include/python2.4
-NUMPYINC = -I/usr/lib/python2.4/site-packages/numpy/core/include
-OPTIMS = -O3 -ffast-math -march=pentium4
-WARN = -W -Wall
-
-CFLAGS = $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
-
-c_gmm.so: c_gmm.o
- $(LD) -shared -o $@ $< -Wl,-soname,$@
-
-c_gmm.o: c_gmm.c
- $(CC) -c $(CFLAGS) -fPIC $<
-
-c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
- $(PYREX) $<
-
-clean:
- rm -f c_gmm.c
- rm -f *.o
- rm -f *.so
- rm -f *.pyc
Added: trunk/Lib/sandbox/pyem/TODO
===================================================================
--- trunk/Lib/sandbox/pyem/TODO 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/TODO 2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,7 @@
+# Last Change: Fri Jun 30 07:00 PM 2006 J
+
+# In more urgent order:
+
+1: test and benchmarks for various length and model size
+2: refactoring for Mixture models, having online mode, etc...
+3: for gaussian densities: compute level <-> confidence ellipses relationship with Chi2 model
Deleted: trunk/Lib/sandbox/pyem/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/c_gmm.pyx 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/c_gmm.pyx 2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,66 +0,0 @@
-# Last Change: Tue May 16 11:00 AM 2006 J
-
-cimport c_numpy
-cimport c_python
-import numpy
-
-c_numpy.import_array()
-
-# pyrex version of _vq. Much faster in high dimension/high number of K
-# (ie more than 3-4)
-def _vq(data, init):
- (n, d) = data.shape
- label = numpy.zeros(n, int)
- _imp_vq(data, init, label)
-
- return label
-
-def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
- cdef int n
- cdef int d
- cdef int nc
- cdef int i
- cdef int j
- cdef int k
- cdef int *labeld
- cdef double *da, *code
- cdef double dist
- cdef double acc
-
- n = data.dimensions[0]
- d = data.dimensions[1]
- nc = init.dimensions[0]
-
- if not data.dtype == numpy.dtype(numpy.float64):
- print '_vq not (yet) implemented for dtype %s'%dtype.name
- return
- da = (<double*>data.data)
-
- if not init.dtype == numpy.dtype(numpy.float64):
- print '_vq not (yet) implemented for dtype %s'%dtype.name
- return
- code = (<double*>init.data)
-
- if not label.dtype == numpy.dtype(numpy.int32):
- print '_vq not (yet) implemented for dtype %s'%dtype.name
- return
- labeld = (<int*>label.data)
-
- for i from 0<=i<n:
- acc = 0
- lab = 0
- for j from 0<=j<d:
- acc = acc + (da[i * d + j] - code[j]) * \
- (da[i * d + j] - code[j])
- dist = acc
- for k from 1<=k<nc:
- acc = 0
- for j from 0<=j<d:
- acc = acc + (da[i * d + j] - code[k * d + j]) * \
- (da[i * d + j] - code[k * d + j])
- if acc < dist:
- dist = acc
- lab = k
- labeld[i] = lab
-
- return lab
Deleted: trunk/Lib/sandbox/pyem/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/c_numpy.pxd 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/c_numpy.pxd 2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,59 +0,0 @@
-# :Author: Robert Kern
-# :Copyright: 2004, Enthought, Inc.
-# :License: BSD Style
-
-
-cdef extern from "numpy/arrayobject.h":
- ctypedef enum PyArray_TYPES:
- PyArray_BOOL
- PyArray_BYTE
- PyArray_UBYTE
- PyArray_SHORT
- PyArray_USHORT
- PyArray_INT
- PyArray_UINT
- PyArray_LONG
- PyArray_ULONG
- PyArray_LONGLONG
- PyArray_ULONGLONG
- PyArray_FLOAT
- PyArray_DOUBLE
- PyArray_LONGDOUBLE
- PyArray_CFLOAT
- PyArray_CDOUBLE
- PyArray_CLONGDOUBLE
- PyArray_OBJECT
- PyArray_STRING
- PyArray_UNICODE
- PyArray_VOID
- PyArray_NTYPES
- PyArray_NOTYPE
-
- ctypedef int intp
-
- ctypedef extern class numpy.dtype [object PyArray_Descr]:
- cdef int type_num, elsize, alignment
- cdef char type, kind, byteorder, hasobject
- cdef object fields, typeobj
-
- ctypedef extern class numpy.ndarray [object PyArrayObject]:
- cdef char *data
- cdef int nd
- cdef intp *dimensions
- cdef intp *strides
- cdef object base
- cdef dtype descr
- cdef int flags
-
- ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
- int PyArray_Check(object obj)
- ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type,
- int mindim, int maxdim)
- intp PyArray_SIZE(ndarray arr)
- void *PyArray_DATA(ndarray arr)
- ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
- int requirements, object context)
- ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
- intp* strides, void* data, int flags, object parent)
-
- void import_array()
Deleted: trunk/Lib/sandbox/pyem/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/c_python.pxd 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/c_python.pxd 2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,20 +0,0 @@
-# -*- Mode: Python -*- Not really, but close enough
-
-# Expose as much of the Python C API as we need here
-
-cdef extern from "stdlib.h":
- ctypedef int size_t
-
-cdef extern from "Python.h":
- ctypedef int Py_intptr_t
- void* PyMem_Malloc(size_t)
- void* PyMem_Realloc(void *p, size_t n)
- void PyMem_Free(void *p)
- char* PyString_AsString(object string)
- object PyString_FromString(char *v)
- object PyString_InternFromString(char *v)
- int PyErr_CheckSignals()
- object PyFloat_FromDouble(double v)
- void Py_XINCREF(object o)
- void Py_XDECREF(object o)
- void Py_CLEAR(object o) # use instead of decref
Deleted: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/densities.py 2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,375 +0,0 @@
-#! /usr/bin/python
-#
-# Copyrighted David Cournapeau
-# Last Change: Mon May 29 01:00 PM 2006 J
-
-import numpy as N
-import numpy.linalg as lin
-
-# Error classes
-class DenError(Exception):
- """Base class for exceptions in this module.
-
- Attributes:
- expression -- input expression in which the error occurred
- message -- explanation of the error"""
- def __init__(self, message):
- self.message = message
-
- def __str__(self):
- return self.message
-
-# The following function do all the fancy stuff to check that parameters
-# are Ok, and call the right implementation if args are OK.
-def gauss_den(x, mu, va, log = False):
- """ Compute multivariate Gaussian density at points x for
- mean mu and variance va.
-
- Vector are row vectors, except va which can be a matrix
- (row vector variance for diagonal variance)
-
- If log is True, than the log density is returned
- (useful for underflow ?)"""
- mu = N.atleast_2d(mu)
- va = N.atleast_2d(va)
- x = N.atleast_2d(x)
-
- #=======================#
- # Checking parameters #
- #=======================#
- if len(N.shape(mu)) != 2:
- raise DenError("mu is not rank 2")
-
- if len(N.shape(va)) != 2:
- raise DenError("va is not rank 2")
-
- if len(N.shape(x)) != 2:
- raise DenError("x is not rank 2")
-
- (n, d) = x.shape
- (dm0, dm1) = mu.shape
- (dv0, dv1) = va.shape
-
- # Check x and mu same dimension
- if dm0 != 1:
- msg = "mean must be a row vector!"
- raise DenError(msg)
- if dm1 != d:
- msg = "x and mu not same dim"
- raise DenError(msg)
- # Check va and mu same size
- if dv1 != d:
- msg = "mu and va not same dim"
- raise DenError(msg)
- if dv0 != 1 and dv0 != d:
- msg = "va not square"
- raise DenError(msg)
-
- #===============#
- # Computation #
- #===============#
- data = x - mu
-
- if d == 1:
- # scalar case
- return _scalar_gauss_den(data[:, 0], mu[0, 0], va[0, 0], log)
- elif dv0 == 1:
- # Diagonal matrix case
- return _diag_gauss_den(data, mu, va, log)
- elif dv1 == dv0:
- # full case
- return _full_gauss_den(data, mu, va, log)
- else:
- raise DenError("variance mode not recognized, this is a bug")
-
-# Those 3 functions do almost all the actual computation
-def _scalar_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in scalar case. It assumes all args
- are conformant, so it should not be used directly
-
- ** Expect centered data (ie with mean removed) **
-
- Call gauss_den instead"""
- d = mu.size
- inva = 1/va
- fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
- y = (x ** 2) * -0.5 * inva
- if not log:
- y = fac * N.exp(y)
- else:
- y = y + log(fac)
-
- return y
-
-def _diag_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in scalar case. It assumes all args
- are conformant, so it should not be used directly
-
- ** Expect centered data (ie with mean removed) **
-
- Call gauss_den instead"""
- # Diagonal matrix case
- d = mu.size
- y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
- if not log:
- for i in range(1, d):
- y *= _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
- else:
- for i in range(1, d):
- y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
- return y
-
-def _full_gauss_den(x, mu, va, log):
- """ This function is the actual implementation
- of gaussian pdf in full matrix case.
-
- It assumes all args are conformant, so it should
- not be used directly Call gauss_den instead
-
- ** Expect centered data (ie with mean removed) **
-
- Does not check if va is definite positive (on inversible
- for that matter), so the inverse computation and/or determinant
- would throw an exception."""
- d = mu.size
- inva = lin.inv(va)
- fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
-
- # # Slow version
- # n = N.size(x, 0)
- # y = N.zeros(n, float)
- # for i in range(n):
- # y[i] = N.matrixmultiply(x[i,:],
- # N.matrixmultiply(inva, N.transpose(x[i,:])))
- # y *= -0.5
-
- # we are using a trick with sum to "emulate"
- # the matrix multiplication inva * x without any explicit loop
- y = N.matrixmultiply(x, inva)
- y = -0.5 * N.sum(y * x, 1)
-
- if not log:
- y = fac * N.exp(y)
- else:
- y = y + N.log(fac)
-
- return y
-
-# To plot a confidence ellipse from multi-variate gaussian pdf
-def gauss_ell(mu, va, dim = [0, 1], npoints = 100):
- """ Given a mean and covariance for multi-variate
- gaussian, returns npoints points for the ellipse
- of confidence 0.39
-
- Returns the coordinate x and y of the ellipse"""
-
- # TODO: Get a confidence interval using the Chi2 distribution
- # of points at a given mahalanobis distance...
- mu = N.atleast_1d(mu)
- va = N.atleast_1d(va)
- c = N.array(dim)
-
- if mu.size == va.size:
- mode = 'diag'
- else:
- if va.ndim == 2:
- if va.shape[0] == va.shape[1]:
- mode = 'full'
- else:
- raise DenError("variance not square")
- else:
- raise DenError("mean and variance are not dim conformant")
-
- level = 0.39
-
- # Generates a circle of npoints
- theta = N.linspace(0, 2 * N.pi, npoints)
- circle = N.array([N.cos(theta), N.sin(theta)])
-
- # Get the dimension which we are interested in:
- mu = mu[dim]
- if mode == 'diag':
- va = va[dim]
- elps = N.outerproduct(mu, N.ones(npoints, float))
- elps += N.matrixmultiply(N.diag(N.sqrt(va)), circle)
- elif mode == 'full':
- va = va[c,:][:,c]
- # Method: compute the cholesky decomp of each cov matrix, that is
- # compute cova such as va = cova * cova'
- # WARN: scipy is different than matlab here, as scipy computes a lower
- # triangular cholesky decomp:
- # - va = cova * cova' (scipy)
- # - va = cova' * cova (matlab)
- # So take care when comparing results with matlab !
- cova = lin.cholesky(va)
- elps = N.outerproduct(mu, N.ones(npoints, float))
- elps += N.matrixmultiply(cova, circle)
- else:
- raise DenParam("var mode not recognized")
-
- return elps[0, :], elps[1, :]
-
-def test_gauss_den():
- """"""
- # import tables
- # import numpy as N
- #
- # filename = 'dendata.h5'
-
- # # # Dimension 1
- # # d = 1
- # # mu = 1.0
- # # va = 2.0
-
- # # X = N.randn(1e3, 1)
-
- # # Y = gauss_den(X, mu, va)
-
- # # h5file = tables.openFile(filename, "w")
-
- # # h5file.createArray(h5file.root, 'X', X)
- # # h5file.createArray(h5file.root, 'mu', mu)
- # # h5file.createArray(h5file.root, 'va', va)
- # # h5file.createArray(h5file.root, 'Y', Y)
-
- # # h5file.close()
-
- # # # Dimension 2, diag
- # # d = 2
- # # mu = N.array([1.0, -2.0])
- # # va = N.array([1.0, 2.0])
-
- # # X = N.randn(1e3, 2)
-
- # # Y = gauss_den(X, mu, va)
-
- # # h5file = tables.openFile(filename, "w")
-
- # # h5file.createArray(h5file.root, 'X', X)
- # # h5file.createArray(h5file.root, 'mu', mu)
- # # h5file.createArray(h5file.root, 'va', va)
- # # h5file.createArray(h5file.root, 'Y', Y)
-
- # # Dimension 2, full
- # d = 2
- # mu = N.array([[0.2, -1.0]])
- # va = N.array([[1.2, 0.1], [0.1, 0.5]])
-
- # X = N.randn(1e3, 2)
-
- # Y = gauss_den(X, mu, va)
-
- # h5file = tables.openFile(filename, "w")
-
- # h5file.createArray(h5file.root, 'X', X)
- # h5file.createArray(h5file.root, 'mu', mu)
- # h5file.createArray(h5file.root, 'va', va)
- # h5file.createArray(h5file.root, 'Y', Y)
-
- # h5file.close()
-
- import numpy.testing as testing
- #=================
- # Small test in 1d
- #=================
- va = 2.0
- mu = 1.0
- X = N.linspace(-2, 2, 10)[:, N.NewAxis]
-
- Yt = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945,
- 0.14086453882683,
- 0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
- 0.26114678964743, 0.21969564473386])
-
- Y = gauss_den(X, mu, va)
- try:
- testing.assert_array_almost_equal(Y, Yt)
- print "1d test succeded"
- except AssertionError:
- print "test fails in 1d"
-
- #============================
- # Small test in 2d (diagonal)
- #============================
- mu = N.atleast_2d([-1.0, 2.0])
- va = N.atleast_2d([2.0, 3.0])
- X1 = N.linspace(-2, 2, 10)[:, N.NewAxis]
- X2 = N.linspace(-1, 3, 10)[:, N.NewAxis]
- X = N.concatenate(([X1, X2]), 1)
-
- Yt = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786,
- 0.03977576221540, 0.04354490552910, 0.04043592581117,
- 0.03184994053539, 0.02127948225225, 0.01205937178755,
- 0.00579694938623 ])
-
- Y = gauss_den(X, mu, va)
- try:
- testing.assert_array_almost_equal(Y, Yt)
- print "2d diag test succeded"
- except AssertionError:
- print "test fails in 2d diag"
-
- #============================
- # Small test in 2d (full mat)
- #============================
- mu = N.array([[0.2, -1.0]])
- va = N.array([[1.2, 0.1], [0.1, 0.5]])
- X1 = N.linspace(-2, 2, 10)[:, N.NewAxis]
- X2 = N.linspace(-3, 3, 10)[:, N.NewAxis]
- X = N.concatenate(([X1, X2]), 1)
-
- Yt = N.array([0.00096157109751, 0.01368908714856,
- 0.07380823191162, 0.15072050533842,
- 0.11656739937861, 0.03414436965525,
- 0.00378789836599, 0.00015915297541,
- 0.00000253261067, 0.00000001526368])
-
- Y = gauss_den(X, mu, va)
- try:
- testing.assert_array_almost_equal(Y, Yt)
- print "2d full test succeded"
- except AssertionError:
- print "test fails in 2d full"
-
-
-if __name__ == "__main__":
- import pylab
-
- #=========================================
- # Test plotting a simple diag 2d variance:
- #=========================================
- va = N.array([5, 3])
- mu = N.array([2, 3])
-
- # Generate a multivariate gaussian of mean mu and covariance va
- X = N.randn(1e3, 2)
- Yc = N.matrixmultiply(N.diag(N.sqrt(va)), X.transpose())
- Yc = Yc.transpose() + mu
-
- # Plotting
- Xe, Ye = gauss_ell(mu, va, npoints = 100)
- pylab.figure()
- pylab.plot(Yc[:, 0], Yc[:, 1], '.')
- pylab.plot(Xe, Ye, 'r')
-
- #=========================================
- # Test plotting a simple full 2d variance:
- #=========================================
- va = N.array([[0.2, 0.1],[0.1, 0.5]])
- mu = N.array([0, 3])
-
- # Generate a multivariate gaussian of mean mu and covariance va
- X = N.randn(1e3, 2)
- Yc = N.matrixmultiply(lin.cholesky(va), X.transpose())
- Yc = Yc.transpose() + mu
-
- # Plotting
- Xe, Ye = gauss_ell(mu, va, npoints = 100)
- pylab.figure()
- pylab.plot(Yc[:, 0], Yc[:, 1], '.')
- pylab.plot(Xe, Ye, 'r')
- pylab.show()
-
- savefig('example.png')
Deleted: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/gmm_em.py 2006-10-12 13:45:14 UTC (rev 2267)
@@ -1,734 +0,0 @@
-# /usr/bin/python
-# Last Change: Thu May 25 03:00 PM 2006 J
-
-# TODO:
-# - interface with libem
-# - implements E and M in batch mode for mixtures, with special case for gmm
-# ( general EM should be outside this module. This one should be GMM specific, maybe)
-# - implements E and M in online mode
-# - python has list. Does it make sense to keep the h2m format for mean, va, when
-# we can use list instead ?
-# - a Gaussian Mixture Model should be a class, really !
-
-import numpy as N
-import numpy.linalg as lin
-import densities
-
-MAX_DEV = 1e-10
-
-# Error classes
-class GmmError(Exception):
- """Base class for exceptions in this module."""
- pass
-
-class GmmParamError(GmmError):
- """Exception raised for errors in gmm params
-
- Attributes:
- expression -- input expression in which the error occurred
- message -- explanation of the error
- """
- def __init__(self, message):
- self.message = message
-
- def __str__(self):
- return self.message
-
-
-# Function to generate a GMM, or valid parameters for GMM
-def gen_rand_index(p, n):
- """Generate a N samples vector containing random index between 1
- and length(p), each index i with probability p(i)"""
- # TODO Check args here
-
- # TODO: check each value of inverse distribution is
- # different
- invcdf = N.cumsum(p)
- uni = N.rand(n)
- index = N.zeros(n)
-
- # This one should be a bit faster
- for k in range(len(p)-1, 0, -1):
- blop = N.where(N.logical_and(invcdf[k-1] <= uni,
- uni < invcdf[k]))
- index[blop] = k
-
- return index
-
-def gen_gmm(w, mu, va, n):
- """Generate a gaussiam mixture model with weights w,
- mean mu and variances va. Each column of the parameters
- are one component parameter.
- """
- # Check args
- K, d, mode = check_gmm_param(w, mu, va)
-
- # Generate the mixture
- S = gen_rand_index(w, n) # State index (ie hidden var)
- X = N.randn(n, d) # standard gaussian
-
- if mode == 'diag':
- X = mu[S, :] + X * N.sqrt(va[S,:])
- elif mode == 'full':
- # Naive implementation
- # cho = N.zeros(va.shape, float)
- # for k in range(K):
- # # Using cholesky is more stable than sqrtm; sqrtm is not
- # # available in numpy anyway, only in scipy...
- # cho[k*d:k*d+d,:] = lin.cholesky(va[k*d:k*d+d,:])
- # for i in range(n):
- # X[i,:] = mu[S[i],:] + N.matrixmultiply(cho[S[i]*d:S[i]*d+d,:], X[i,:])
-
- # Faster:
- cho = N.zeros((K, va.shape[1], va.shape[1]), float)
- for k in range(K):
- # Using cholesky is more stable than sqrtm; sqrtm is not
- # available in numpy anyway, only in scipy...
- cho[k] = lin.cholesky(va[k*d:k*d+d,:])
-
- for s in range(K):
- tmpind = N.where(S == s)[0]
- X[tmpind] = N.matrixmultiply(X[tmpind], cho[s].transpose()) + mu[s]
- else:
- raise GmmParamError('cov matrix mode not recognized')
-
- return X
-
-def gen_gmm_param(d, K, varmode = 'diag', spread = 1):
- """Generate valid parameters for a gaussian mixture model.
- d is the dimension, K the number of components, and varmode
- the mode for cov matrices.
-
- Returns:
- - w
- - mu
- - va
- """
- w = abs(N.randn(K))
- w = w / sum(w)
-
- mu = spread * N.randn(K, d)
- if varmode == 'diag':
- va = abs(N.randn(K, d))
- elif varmode == 'full':
- va = N.randn(K * d, d)
- for k in range(K):
- va[k*d:k*d+d] = N.matrixmultiply( va[k*d:k*d+d],
- va[k*d:k*d+d].transpose())
- else:
- raise GmmParamError('cov matrix mode not recognized')
-
- return w, mu, va
-
-def check_gmm_param(w, mu, va):
- """Check that w, mu and va are valid parameters for
- a mixture of gaussian: w should sum to 1, there should
- be the same number of component in each param, the variances
- should be positive definite, etc...
-
- Params:
- w = vector or list of weigths of the mixture (K elements)
- mu = matrix: K * d
- va = list of variances (vector K * d or square matrices Kd * d)
-
- returns:
- K = number of components
- d = dimension
- mode = 'diag' if diagonal covariance, 'full' of full matrices
- """
-
- # Check that w is valid
- if N.fabs(N.sum(w) - 1) > MAX_DEV:
- raise GmmParamError('weight does not sum to 1')
-
- if not len(w.shape) == 1:
- raise GmmParamError('weight is not a vector')
-
- # Check that mean and va have the same number of components
- K = len(w)
-
- if N.ndim(mu) < 2:
- msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
- raise GmmParamError(msg)
- if N.ndim(va) < 2:
- msg = """va should be a K,d / K *d, d matrix, and a row vector if
- only 1 diag comp"""
- raise GmmParamError(msg)
-
- (Km, d) = mu.shape
- (Ka, da) = va.shape
-
- if not K == Km:
- msg = "not same number of component in mean and weights"
- raise GmmParamError(msg)
-
- if not d == da:
- msg = "not same number of dimensions in mean and variances"
- raise GmmParamError(msg)
-
- if Km == Ka:
- mode = 'diag'
- else:
- mode = 'full'
- if not Ka == Km*d:
- msg = "not same number of dimensions in mean and variances"
- raise GmmParamError(msg)
-
- return K, d, mode
-
-# For EM on GMM
-def multiple_gauss_den(data, mu, va):
- """Helper function to generate several Gaussian
- pdf (different parameters) from the same data"""
- mu = N.atleast_2d(mu)
- va = N.atleast_2d(va)
-
- K = mu.shape[0]
- n = data.shape[0]
- d = data.shape[1]
-
- y = N.zeros((n, K), float)
- if mu.size == va.size:
- for i in range(K):
- y[:, i] = densities.gauss_den(data, mu[i, :], va[i, :])
- else:
- for i in range(K):
- y[:, i] = densities.gauss_den(data, mu[i, :],
- va[d*i:d*i+d, :])
-
- return y
-
-def gmm_init_kmean(data, k, mode, init = [], niter = 10):
- """gmm_init_kmean(data, k, mode, init = [], niter = 10)
-
- Init EM for GMM with kmean from data, for k components.
-
- Args:
- - data: Each row of data is one frame of dimension d.
- - k: Number of components to look for
- - mode: Diagonal or Full covariance matrices
- - init: The initial centroids. The value given for k is
- ignored, and the number of row in initc is used instead.
- If initc is not given, then the centroids are initialized
- with the k first values of data.
- - niter: Number of iterations in kmean.
-
- Returns:
- (w, mu, va), initial parameters for a GMM.
-
- Method:
- Each weight is equiprobable, each mean is one centroid returned by kmean, and
- covariances for component i is initialized with covariance of
- data corresponding with label i. Other strategies are possible, this one
- is an easy one"""
- if len(init) == 0:
- init = data[0:k, :]
- else:
- k = initc.shape[0]
-
- if data.ndim == 1:
- d = 1
- else:
- d = N.shape(data)[1]
-
- (code, label) = kmean(data, init, niter)
-
- w = N.ones(k, float) / k
- mu = code.copy()
- if mode == 'diag':
- va = N.zeros((k, d), float)
- for i in range(k):
- for j in range(d):
- va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
- elif mode == 'full':
- va = N.zeros((k*d, d), float)
- for i in range(k):
- va[i*d:i*d+d,:] = N.cov(data[N.where(label==i)], rowvar = 0)
- else:
- raise GmmParamError("mode " + str(mode) + " not recognized")
-
- return w, mu, va
-
-def gmm_posterior(data, w, mu, va):
- """ Computes the latent variable distribution (a
- posteriori probability) knowing the explicit data
- for the Gaussian model (w, mu, var): gamma(t, i) =
- P[state = i | observation = data(t); w, mu, va]
-
- This is basically the E step of EM for GMM.
-
- the second returned value is the non normalized version
- of gamma, and may be needed for some computation,
- like eg likelihood"""
- n = data.shape[0]
- K = len(w)
-
- # compute the gaussian pdf
- tgd = multiple_gauss_den(data, mu, va)
- # multiply by the weight
- tgd *= w
- # Normalize to get a pdf
- gd = tgd / N.sum(tgd, axis=1)[:, N.NewAxis]
-
- return gd, tgd
-
-# This function is just calling gmm_update and gmm_posterior, with
-# initialization. This is ugly, and we should have a class to model a GMM
-# instead of all this code to try to guess correct values and parameters...
-def gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
- """
- gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
-
- Compute the parameters of a Gaussian Mixture Model using EM algorithm,
- with initial values w, mu and va (overwritten by the function).
-
- Args:
- - data: contains the observed features, one row is one frame, ie one
- observation of dimension d
- - niter: number of iterations
- - mode: 'diag' or 'full', depending on the wanted model for cov
- matrices.
- - K: number of components
- - w, mu, va initial parameters for the GMM. All or none must be given.
- If no initial values are given, initialized by gmm_init_kmean; if they
- are given, mode and k are ignored, and guessed from the given parameters
- instead.
-
- Returns:
- w, mu, va, like as found by EM, where like is the likelihood for each
- iteration.
- """
- if len(w) == 0:
- w, mu, va = gmm_init_kmean(data, k, mode, 5)
- else:
- k, d, mode = check_gmm_parm(w, mu, va)
-
- like = N.zeros(niter, float)
- for i in range(niter):
- g, tgd = gmm_posterior(data, w, mu, va)
- like[i] = N.sum(N.log(N.sum(tgd, 1)))
- w, mu, va = gmm_update(X, g, d, k, mode)
-
- return w, mu, va, like
-
-def gmm_update(data, gamma, d, K, varmode):
- """Computes update of the Gaussian Mixture Model (M step)
- from the a posteriori pdf, computed by gmm_posterior
- (E step).
- """
- n = data.shape[0]
- invn = 1.0/n
- mGamma = N.sum(gamma)
-
- if varmode == 'diag':
- mu = N.zeros((K, d), float)
- va = N.zeros((K, d), float)
- for k in range(K):
- x = N.sum(N.outerproduct(gamma[:, k],
- N.ones((1, d))) * data)
- xx = N.sum(N.outerproduct(gamma[:, k],
- N.ones((1, d))) * (data ** 2))
-
- mu[k,:] = x / mGamma[k]
- va[k,:] = xx / mGamma[k] - mu[k,:] ** 2
- w = invn * mGamma
-
- elif varmode == 'full':
- mu = N.zeros((K, d), float)
- va = N.zeros((K*d, d), float)
-
- # This is really slow because of the loop on each frame
- # ways to improve that: either use 3d matrices and see
- # the memory usage increasing in n*d*d,
- # using C (or pyrex, or ...) or using the trick commented
- # which loops on the dimensions instead. If the number of dimensions
- # is high, this won't help, though...
- for k in range(K):
- x = N.sum(N.outerproduct(gamma[:, k],
- N.ones((1, d), float)) * data)
- xx = N.zeros((d, d), float)
- # for i in range(n):
- # xx += gamma[i, k] * N.outerproduct(data[i,:],
- # data[i,:])
-
- # This should be much faster
- for i in range(d):
- for j in range(d):
- xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[:,k])
-
- mu[k,:] = x / mGamma[k]
- va[k*d:k*d+d,:] = xx / mGamma[k] - \
- N.outerproduct(mu[k,:], mu[k,:])
- w = invn * mGamma
- else:
- raise GmmParamError("varmode not recognized")
-
- return w, mu, va
-
-# Misc functions
-def gmm_ellipses(mu, va, c = [0, 1], npoints = 100):
- """Returns a list of ellipses describing the Gmm
- defined by mu and va. c is the dimension we are projecting
- the variances on a 2d space.
-
- Returns:
- -Xe: a list of x coordinates for the ellipses
- -Ye: a list of y coordinates for the ellipses
-
- Example:
- Suppose we have w, mu and va as parameters for a mixture, then:
-
- X = gen_gmm(w, mu, va, 1000)
- Xe, Ye = gmm_ellipses(mu, va)
- pylab.plot(X[:,0], X[:, 1], '.')
- for k in len(w):
- pylab.plot(Xe[k], Ye[k], 'r')
-
- Will plot samples X draw from the mixture model, and
- plot the ellipses of equi-probability from the mean with
- fixed level of confidence 0.39.
-
- TODO: be able to modify the confidence interval to arbitrary
- value (to do in gauss_ell)"""
- K = mu.shape[0]
- w = N.ones(K, float) / K
-
- K, d, mode = check_gmm_param(w, mu, va)
-
- # TODO: adjustable level (to do in gauss_ell).
- # For now, a level of 0.39 means that we draw
- # ellipses for 1 standard deviation.
- Xe = []
- Ye = []
- if mode == 'diag':
- for i in range(K):
- xe, ye = densities.gauss_ell(mu[i,:], va[i,:], dim = c,
- npoints = npoints)
- Xe.append(xe)
- Ye.append(ye)
- elif mode == 'full':
- for i in range(K):
- xe, ye = densities.gauss_ell(mu[i,:], va[i*d:i*d+d,:], dim = c,
- npoints = npoints)
- Xe.append(xe)
- Ye.append(ye)
-
- return Xe, Ye
-
-import c_gmm
-def kmean(data, init, iter = 10):
- """Simple kmean implementation for EM
-
- returns a tuple (code, label), where code are the final
- centroids, and label are the class label indec for each
- frame (ie row) of data"""
-
- data = N.atleast_2d(data)
- init = N.atleast_2d(init)
-
- (n, d) = data.shape
- (k, d1) = init.shape
-
- if not d == d1:
- msg = "data and init centers do not have same dimensions..."
- raise GmmParamError(msg)
-
- code = N.asarray(init.copy(), float)
- for i in range(iter):
- # Compute the nearest neighbour for each obs
- # using the current code book
- label = c_gmm._vq(data, code)
- # Update the code by computing centroids using the new code book
- for j in range(k):
- code[j,:] = N.mean(data[N.where(label==j)])
-
- return code, label
-
-def _vq(data, code):
- """ Please do not use directly. Use kmean instead"""
- # No attempt to be efficient has been made...
- (n, d) = data.shape
- (k, d) = code.shape
-
- label = N.zeros(n, int)
- for i in range(n):
- d = N.sum((data[i, :] - code) ** 2, 1)
- label[i] = N.argmin(d)
-
- return label
-
-# Test functions usable for now
-def test_kmean():
- X = N.array([[3.0, 3], [4, 3], [4, 2],
- [9, 2], [5, 1], [6, 2], [9, 4],
- [5, 2], [5, 4], [7, 4], [6, 5]])
-
- initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
-
- codet1 = N.array([[3.0000, 3.0000],
- [6.2000, 4.0000],
- [5.8000, 1.8000]])
-
- codet2 = N.array([[11.0/3, 8.0/3],
- [6.7500, 4.2500],
- [6.2500, 1.7500]])
-
- code = initc.copy()
-
- code1 = kmean(X, code, 1)[0]
- code2 = kmean(X, code, 2)[0]
-
- import numpy.testing as testing
- try:
- testing.assert_array_almost_equal(code1, codet1)
- print "kmean test 1 succeded"
- except AssertionError:
- print "kmean test 1 failed"
-
- try:
- testing.assert_array_almost_equal(code2, codet2)
- print "kmean test 2 succeded"
- except AssertionError:
- print "kmean test 2 failed"
-
-# Test functions (unusable by other people for now....)
-def test_gmm_den():
- """"""
- import tables
-
- filename = 'gmm_test.h5'
-
- k = 3
- d = 2
- mode = 'full'
- w, mu, va = gen_gmm_param(d, k, mode, 2)
- X = gen_gmm(w, mu, va, 1e3)
-
- h5file = tables.openFile(filename, "w")
-
- h5file.createArray(h5file.root, 'mu', mu)
- h5file.createArray(h5file.root, 'va', va)
- h5file.createArray(h5file.root, 'X', X)
-
- Y1 = densities.gauss_den(X, mu[0,:], va[0:2,:])
- Y2 = densities.gauss_den(X, mu[1,:], va[2:4,:])
- Y3 = densities.gauss_den(X, mu[2,:], va[4:6,:])
-
- Y = multiple_gauss_den(X, mu, va)
-
- h5file.createArray(h5file.root, 'Y', Y)
- h5file.createArray(h5file.root, 'Y2', Y2)
- h5file.createArray(h5file.root, 'Y1', Y1)
- h5file.createArray(h5file.root, 'Y3', Y3)
-
- h5file.close()
-
-def test_gmm_em():
- """"""
- import numpy as N
- import tables
- import pylab as P
- filename = 'data.h5'
-
- k = 3
- d = 2
- mode = 'full'
- wr, mur, var = gen_gmm_param(d, k, mode, 2)
- X = gen_gmm(wr, mur, var, 1e3)
-
- h5file = tables.openFile(filename, "w")
-
- h5file.createArray(h5file.root, 'wr', wr)
- h5file.createArray(h5file.root, 'mur', mur)
- h5file.createArray(h5file.root, 'var', var)
- h5file.createArray(h5file.root, 'X', X)
-
- P.plot(X[:, 0], X[:, 1], '.')
- Xe, Ye = gmm_ellipses(mur, var, npoints = 100)
- for i in range(k):
- P.plot(Xe[i], Ye[i], 'g')
-
- # init EM: simply use kmean
- initc = X[0:k, :]
- (code, label) = kmean(X, initc, 10)
-
- h5file.createArray(h5file.root, 'initc', initc)
-
- w0 = N.ones(k, float) / k
- mu0 = code.copy()
- if mode == 'diag':
- va0 = N.zeros((k, d), float)
- for i in range(k):
- for j in range(d):
- va0[i,j] = N.cov(X[N.where(label==i), j], rowvar = 0)
- elif mode == 'full':
- va0 = N.zeros((k*d, d), float)
- for i in range(k):
- va0[i*d:i*d+d,:] = N.cov(X[N.where(label==i)], rowvar = 0)
-
- h5file.createArray(h5file.root, 'w0', w0)
- h5file.createArray(h5file.root, 'mu0', mu0)
- h5file.createArray(h5file.root, 'va0', va0)
-
-
- # # Use random values
- # w0 = N.ones(k, float) / k
- # mu0 = N.randn(k, d)
- # va0 = N.fabs(N.randn(k, d))
-
- w = w0.copy()
- mu = mu0.copy()
- va = va0.copy()
-
- X0e, Y0e = gmm_ellipses(mu, va, npoints = 100)
- for i in range(k):
- P.plot(X0e[i], Y0e[i], 'k')
-
- niter = 10
- like = N.zeros(niter, float)
-
- for i in range(niter):
- g, tgd = gmm_posterior(X, w, mu, va)
- #h5file.createArray(h5file.root, 'g', g)
- #h5file.createArray(h5file.root, 'tgd', tgd)
- like[i] = N.sum(N.log(N.sum(tgd, 1)))
- print like[i]
- w, mu, va = gmm_update(X, g, d, k, mode)
-
- h5file.createArray(h5file.root, 'w', w)
- h5file.createArray(h5file.root, 'mu', mu)
- h5file.createArray(h5file.root, 'va', va)
-
- X0e, Y0e = gmm_ellipses(mu, va, npoints = 100)
- for i in range(k):
- P.plot(X0e[i], Y0e[i], 'r')
- P.figure()
- P.plot(like)
- h5file.close()
- P.show()
-
-def _bench1():
- #===========================================
- # Diag GMM with 5 components of dimension 20
- #===========================================
- k = 5
- d = 20
- mode = 'diag'
-
- print "Generating the mixture"
- # Generate a model with k components, d dimensions
- wr, mur, var = gen_gmm_param(d, k, mode, 3)
- X = gen_gmm(wr, mur, var, 1e4)
-
- print "Init the mixture"
- # Init the mixture with kmean
- w0, mu0, va0 = gmm_init_kmean(X, k, mode, niter = 5)
-
- # # Use random values instead of kmean
- # w0 = N.ones(k, float) / k
- # mu0 = N.randn(k, d)
- # va0 = N.fabs(N.randn(k, d))
-
- print "EM computing..."
- # Copy the initial values because we want to draw them later...
- w = w0.copy()
- mu = mu0.copy()
- va = va0.copy()
-
- # The actual EM, with likelihood computation
- niter = 10
- like = N.zeros(niter, float)
-
- for i in range(niter):
- print "post"
- g, tgd = gmm_posterior(X, w, mu, va)
- like[i] = N.sum(N.log(N.sum(tgd, 1)))
- print "update"
- w, mu, va = gmm_update(X, g, d, k, mode)
-
-
-def benchmark():
- import profile
- profile.run('_bench1()', 'bench1prof')
-
-if __name__ == "__main__":
- #=============================
- # Simple GMM with 3 components
- #=============================
- import pylab as P
- k = 4
- d = 2
- mode = 'diag'
-
- print "Generating the mixture"
- # Generate a model with k components, d dimensions
- wr, mur, var = gen_gmm_param(d, k, mode, 3)
- X = gen_gmm(wr, mur, var, 1e3)
-
- print "Init the mixture"
- # Init the mixture with kmean
- w0, mu0, va0 = gmm_init_kmean(X, k, mode, niter = 5)
-
- # # Use random values instead of kmean
- # w0 = N.ones(k, float) / k
- # mu0 = N.randn(k, d)
- # va0 = N.fabs(N.randn(k, d))
-
- # Copy the initial values because we want to draw them later...
- w = w0.copy()
- mu = mu0.copy()
- va = va0.copy()
-
- # The actual EM, with likelihood computation
- niter = 10
- like = N.zeros(niter, float)
-
- print "computing..."
- for i in range(niter):
- print "post"
- g, tgd = gmm_posterior(X, w, mu, va)
- like[i] = N.sum(N.log(N.sum(tgd, 1)))
- print "update"
- w, mu, va = gmm_update(X, g, d, k, mode)
-
- print "drawing..."
- # Draw what is happening
- P.subplot(2, 1, 1)
- P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
-
- # Real confidence ellipses
- Xre, Yre = gmm_ellipses(mur, var)
- P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
- for i in range(1,k):
- P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
-
- # Initial confidence ellipses as found by kmean
- X0e, Y0e = gmm_ellipses(mu0, va0)
- P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
- for i in range(1,k):
- P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
-
- # Values found by EM
- Xe, Ye = gmm_ellipses(mu, va)
- P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
- for i in range(1,k):
- P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
- P.legend(loc = 0)
- P.subplot(2, 1, 2)
- P.plot(like)
- P.title('log likelihood')
-
- # # Export the figure
- # F = P.gcf()
- # DPI = F.get_dpi()
- # DefaultSize = F.get_size_inches()
- # # the default is 100dpi for savefig:
- # F.savefig("example1.png")
-
- # # Now make the image twice as big, while keeping the fonts and all the
- # # same size
- # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
- # Size = F.get_size_inches()
- # print "Size in Inches", Size
- # F.savefig("example2.png")
- P.show()
Added: trunk/Lib/sandbox/pyem/pyem/__init__.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/__init__.py 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/__init__.py 2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,4 @@
+#! /usr/bin/env python
+# Last Change: Wed Jul 12 12:00 PM 2006 J
+
+version = '0.2'
Added: trunk/Lib/sandbox/pyem/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/densities.py 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/densities.py 2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,373 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Fri Jun 30 06:00 PM 2006 J
+
+import numpy as N
+import numpy.linalg as lin
+
+# Error classes
+class DenError(Exception):
+ """Base class for exceptions in this module.
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error"""
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False):
+ """ Compute multivariate Gaussian density at points x for
+ mean mu and variance va.
+
+ Vector are row vectors, except va which can be a matrix
+ (row vector variance for diagonal variance)
+
+ If log is True, than the log density is returned
+ (useful for underflow ?)"""
+ mu = N.atleast_2d(mu)
+ va = N.atleast_2d(va)
+ x = N.atleast_2d(x)
+
+ #=======================#
+ # Checking parameters #
+ #=======================#
+ if len(N.shape(mu)) != 2:
+ raise DenError("mu is not rank 2")
+
+ if len(N.shape(va)) != 2:
+ raise DenError("va is not rank 2")
+
+ if len(N.shape(x)) != 2:
+ raise DenError("x is not rank 2")
+
+ (n, d) = x.shape
+ (dm0, dm1) = mu.shape
+ (dv0, dv1) = va.shape
+
+ # Check x and mu same dimension
+ if dm0 != 1:
+ msg = "mean must be a row vector!"
+ raise DenError(msg)
+ if dm1 != d:
+ msg = "x and mu not same dim"
+ raise DenError(msg)
+ # Check va and mu same size
+ if dv1 != d:
+ msg = "mu and va not same dim"
+ raise DenError(msg)
+ if dv0 != 1 and dv0 != d:
+ msg = "va not square"
+ raise DenError(msg)
+
+ #===============#
+ # Computation #
+ #===============#
+ data = x - mu
+
+ if d == 1:
+ # scalar case
+ return _scalar_gauss_den(data[:, 0], mu[0, 0], va[0, 0], log)
+ elif dv0 == 1:
+ # Diagonal matrix case
+ return _diag_gauss_den(data, mu, va, log)
+ elif dv1 == dv0:
+ # full case
+ return _full_gauss_den(data, mu, va, log)
+ else:
+ raise DenError("variance mode not recognized, this is a bug")
+
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ ** Expect centered data (ie with mean removed) **
+
+ Call gauss_den instead"""
+ d = mu.size
+ inva = 1/va
+ fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+ y = (x ** 2) * -0.5 * inva
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + log(fac)
+
+ return y
+
+def _diag_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ ** Expect centered data (ie with mean removed) **
+
+ Call gauss_den instead"""
+ # Diagonal matrix case
+ d = mu.size
+ y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
+ if not log:
+ for i in range(1, d):
+ y *= _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+ else:
+ for i in range(1, d):
+ y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+ return y
+
+def _full_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in full matrix case.
+
+ It assumes all args are conformant, so it should
+ not be used directly Call gauss_den instead
+
+ ** Expect centered data (ie with mean removed) **
+
+ Does not check if va is definite positive (on inversible
+ for that matter), so the inverse computation and/or determinant
+ would throw an exception."""
+ d = mu.size
+ inva = lin.inv(va)
+ fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+ # # Slow version
+ # n = N.size(x, 0)
+ # y = N.zeros(n, float)
+ # for i in range(n):
+ # y[i] = N.matrixmultiply(x[i,:],
+ # N.matrixmultiply(inva, N.transpose(x[i,:])))
+ # y *= -0.5
+
+ # we are using a trick with sum to "emulate"
+ # the matrix multiplication inva * x without any explicit loop
+ y = N.matrixmultiply(x, inva)
+ y = -0.5 * N.sum(y * x, 1)
+
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + N.log(fac)
+
+ return y
+
+# To plot a confidence ellipse from multi-variate gaussian pdf
+def gauss_ell(mu, va, dim = [0, 1], npoints = 100):
+ """ Given a mean and covariance for multi-variate
+ gaussian, returns npoints points for the ellipse
+ of confidence 0.39
+
+ Returns the coordinate x and y of the ellipse"""
+
+ # TODO: Get a confidence interval using the Chi2 distribution
+ # of points at a given mahalanobis distance...
+ mu = N.atleast_1d(mu)
+ va = N.atleast_1d(va)
+ c = N.array(dim)
+
+ if mu.size == va.size:
+ mode = 'diag'
+ else:
+ if va.ndim == 2:
+ if va.shape[0] == va.shape[1]:
+ mode = 'full'
+ else:
+ raise DenError("variance not square")
+ else:
+ raise DenError("mean and variance are not dim conformant")
+
+ level = 0.39
+
+ # Generates a circle of npoints
+ theta = N.linspace(0, 2 * N.pi, npoints)
+ circle = N.array([N.cos(theta), N.sin(theta)])
+
+ # Get the dimension which we are interested in:
+ mu = mu[dim]
+ if mode == 'diag':
+ va = va[dim]
+ elps = N.outerproduct(mu, N.ones(npoints, float))
+ elps += N.matrixmultiply(N.diag(N.sqrt(va)), circle)
+ elif mode == 'full':
+ va = va[c,:][:,c]
+ # Method: compute the cholesky decomp of each cov matrix, that is
+ # compute cova such as va = cova * cova'
+ # WARN: scipy is different than matlab here, as scipy computes a lower
+ # triangular cholesky decomp:
+ # - va = cova * cova' (scipy)
+ # - va = cova' * cova (matlab)
+ # So take care when comparing results with matlab !
+ cova = lin.cholesky(va)
+ elps = N.outerproduct(mu, N.ones(npoints, float))
+ elps += N.matrixmultiply(cova, circle)
+ else:
+ raise DenParam("var mode not recognized")
+
+ return elps[0, :], elps[1, :]
+
+def test_gauss_den():
+ """"""
+ # import tables
+ # import numpy as N
+ #
+ # filename = 'dendata.h5'
+
+ # # # Dimension 1
+ # # d = 1
+ # # mu = 1.0
+ # # va = 2.0
+
+ # # X = N.randn(1e3, 1)
+
+ # # Y = gauss_den(X, mu, va)
+
+ # # h5file = tables.openFile(filename, "w")
+
+ # # h5file.createArray(h5file.root, 'X', X)
+ # # h5file.createArray(h5file.root, 'mu', mu)
+ # # h5file.createArray(h5file.root, 'va', va)
+ # # h5file.createArray(h5file.root, 'Y', Y)
+
+ # # h5file.close()
+
+ # # # Dimension 2, diag
+ # # d = 2
+ # # mu = N.array([1.0, -2.0])
+ # # va = N.array([1.0, 2.0])
+
+ # # X = N.randn(1e3, 2)
+
+ # # Y = gauss_den(X, mu, va)
+
+ # # h5file = tables.openFile(filename, "w")
+
+ # # h5file.createArray(h5file.root, 'X', X)
+ # # h5file.createArray(h5file.root, 'mu', mu)
+ # # h5file.createArray(h5file.root, 'va', va)
+ # # h5file.createArray(h5file.root, 'Y', Y)
+
+ # # Dimension 2, full
+ # d = 2
+ # mu = N.array([[0.2, -1.0]])
+ # va = N.array([[1.2, 0.1], [0.1, 0.5]])
+
+ # X = N.randn(1e3, 2)
+
+ # Y = gauss_den(X, mu, va)
+
+ # h5file = tables.openFile(filename, "w")
+
+ # h5file.createArray(h5file.root, 'X', X)
+ # h5file.createArray(h5file.root, 'mu', mu)
+ # h5file.createArray(h5file.root, 'va', va)
+ # h5file.createArray(h5file.root, 'Y', Y)
+
+ # h5file.close()
+
+ import numpy.testing as testing
+ #=================
+ # Small test in 1d
+ #=================
+ va = 2.0
+ mu = 1.0
+ X = N.linspace(-2, 2, 10)[:, N.NewAxis]
+
+ Yt = N.array([0.02973257230591, 0.05512079811082, 0.09257745306945,
+ 0.14086453882683,
+ 0.19418015562214, 0.24250166773127, 0.27436665745048, 0.28122547107069,
+ 0.26114678964743, 0.21969564473386])
+
+ Y = gauss_den(X, mu, va)
+ try:
+ testing.assert_array_almost_equal(Y, Yt)
+ print "1d test succeded"
+ except AssertionError:
+ print "test fails in 1d"
+
+ #============================
+ # Small test in 2d (diagonal)
+ #============================
+ mu = N.atleast_2d([-1.0, 2.0])
+ va = N.atleast_2d([2.0, 3.0])
+ X1 = N.linspace(-2, 2, 10)[:, N.NewAxis]
+ X2 = N.linspace(-1, 3, 10)[:, N.NewAxis]
+ X = N.concatenate(([X1, X2]), 1)
+
+ Yt = N.array([0.01129091565384, 0.02025416837152, 0.03081845516786,
+ 0.03977576221540, 0.04354490552910, 0.04043592581117,
+ 0.03184994053539, 0.02127948225225, 0.01205937178755,
+ 0.00579694938623 ])
+
+ Y = gauss_den(X, mu, va)
+ try:
+ testing.assert_array_almost_equal(Y, Yt)
+ print "2d diag test succeded"
+ except AssertionError:
+ print "test fails in 2d diag"
+
+ #============================
+ # Small test in 2d (full mat)
+ #============================
+ mu = N.array([[0.2, -1.0]])
+ va = N.array([[1.2, 0.1], [0.1, 0.5]])
+ X1 = N.linspace(-2, 2, 10)[:, N.NewAxis]
+ X2 = N.linspace(-3, 3, 10)[:, N.NewAxis]
+ X = N.concatenate(([X1, X2]), 1)
+
+ Yt = N.array([0.00096157109751, 0.01368908714856,
+ 0.07380823191162, 0.15072050533842,
+ 0.11656739937861, 0.03414436965525,
+ 0.00378789836599, 0.00015915297541,
+ 0.00000253261067, 0.00000001526368])
+
+ Y = gauss_den(X, mu, va)
+ try:
+ testing.assert_array_almost_equal(Y, Yt)
+ print "2d full test succeded"
+ except AssertionError:
+ print "test fails in 2d full"
+
+
+if __name__ == "__main__":
+ import pylab
+
+ #=========================================
+ # Test plotting a simple diag 2d variance:
+ #=========================================
+ va = N.array([5, 3])
+ mu = N.array([2, 3])
+
+ # Generate a multivariate gaussian of mean mu and covariance va
+ X = N.randn(1e3, 2)
+ Yc = N.matrixmultiply(N.diag(N.sqrt(va)), X.transpose())
+ Yc = Yc.transpose() + mu
+
+ # Plotting
+ Xe, Ye = gauss_ell(mu, va, npoints = 100)
+ pylab.figure()
+ pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+ pylab.plot(Xe, Ye, 'r')
+
+ #=========================================
+ # Test plotting a simple full 2d variance:
+ #=========================================
+ va = N.array([[0.2, 0.1],[0.1, 0.5]])
+ mu = N.array([0, 3])
+
+ # Generate a multivariate gaussian of mean mu and covariance va
+ X = N.randn(1e3, 2)
+ Yc = N.matrixmultiply(lin.cholesky(va), X.transpose())
+ Yc = Yc.transpose() + mu
+
+ # Plotting
+ Xe, Ye = gauss_ell(mu, va, npoints = 100)
+ pylab.figure()
+ pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+ pylab.plot(Xe, Ye, 'r')
+ pylab.show()
Added: trunk/Lib/sandbox/pyem/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gmm_em.py 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/gmm_em.py 2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,759 @@
+# /usr/bin/python
+# Last Change: Fri Jun 30 07:00 PM 2006 J
+
+# TODO:
+# - interface with libem
+# - implements E and M in batch mode for mixtures, with special case for gmm
+# ( general EM should be outside this module. This one should be GMM specific, maybe)
+# - implements E and M in online mode
+# - python has list. Does it make sense to keep the h2m format for mean, va, when
+# we can use list instead ?
+# - a Gaussian Mixture Model should be a class, really !
+
+import numpy as N
+import numpy.linalg as lin
+import densities
+
+MAX_DEV = 1e-10
+
+# Error classes
+class GmmError(Exception):
+ """Base class for exceptions in this module."""
+ pass
+
+class GmmParamError(GmmError):
+ """Exception raised for errors in gmm params
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error
+ """
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+# Base class for mixture models: a mixture model can be initialized,
+# have parameters, have its parameters changed, have function to returns sufficient
+# statistics, generate data from the model, have the functions relative to EM
+class Mixture:
+ def __init__(self, np, prior):
+ self.np = np
+ self.prior = prior
+
+ def em_post(self):
+ pass
+
+ def em_update(self):
+ pass
+
+ # Check that the current state is consistent (such as priori weighting to 1, etc...)
+ def check_state(self):
+ pass
+
+ # Return npoints points generated from the model
+ def generate(self, npoints):
+ pass
+
+# The Gaussian Mixture
+class GMM(Mixture):
+ pass
+
+# Function to generate a GMM, or valid parameters for GMM
+def gen_rand_index(p, n):
+ """Generate a N samples vector containing random index between 1
+ and length(p), each index i with probability p(i)"""
+ # TODO Check args here
+
+ # TODO: check each value of inverse distribution is
+ # different
+ invcdf = N.cumsum(p)
+ uni = N.rand(n)
+ index = N.zeros(n)
+
+ # This one should be a bit faster
+ for k in range(len(p)-1, 0, -1):
+ blop = N.where(N.logical_and(invcdf[k-1] <= uni,
+ uni < invcdf[k]))
+ index[blop] = k
+
+ return index
+
+def gen_gmm(w, mu, va, n):
+ """Generate a gaussiam mixture model with weights w,
+ mean mu and variances va. Each column of the parameters
+ are one component parameter.
+ """
+ # Check args
+ K, d, mode = check_gmm_param(w, mu, va)
+
+ # Generate the mixture
+ S = gen_rand_index(w, n) # State index (ie hidden var)
+ X = N.randn(n, d) # standard gaussian
+
+ if mode == 'diag':
+ X = mu[S, :] + X * N.sqrt(va[S,:])
+ elif mode == 'full':
+ # Naive implementation
+ # cho = N.zeros(va.shape, float)
+ # for k in range(K):
+ # # Using cholesky is more stable than sqrtm; sqrtm is not
+ # # available in numpy anyway, only in scipy...
+ # cho[k*d:k*d+d,:] = lin.cholesky(va[k*d:k*d+d,:])
+ # for i in range(n):
+ # X[i,:] = mu[S[i],:] + N.matrixmultiply(cho[S[i]*d:S[i]*d+d,:], X[i,:])
+
+ # Faster:
+ cho = N.zeros((K, va.shape[1], va.shape[1]), float)
+ for k in range(K):
+ # Using cholesky is more stable than sqrtm; sqrtm is not
+ # available in numpy anyway, only in scipy...
+ cho[k] = lin.cholesky(va[k*d:k*d+d,:])
+
+ for s in range(K):
+ tmpind = N.where(S == s)[0]
+ X[tmpind] = N.matrixmultiply(X[tmpind], cho[s].transpose()) + mu[s]
+ else:
+ raise GmmParamError('cov matrix mode not recognized')
+
+ return X
+
+def gen_gmm_param(d, K, varmode = 'diag', spread = 1):
+ """Generate valid parameters for a gaussian mixture model.
+ d is the dimension, K the number of components, and varmode
+ the mode for cov matrices.
+
+ Returns:
+ - w
+ - mu
+ - va
+ """
+ w = abs(N.randn(K))
+ w = w / sum(w)
+
+ mu = spread * N.randn(K, d)
+ if varmode == 'diag':
+ va = abs(N.randn(K, d))
+ elif varmode == 'full':
+ va = N.randn(K * d, d)
+ for k in range(K):
+ va[k*d:k*d+d] = N.matrixmultiply( va[k*d:k*d+d],
+ va[k*d:k*d+d].transpose())
+ else:
+ raise GmmParamError('cov matrix mode not recognized')
+
+ return w, mu, va
+
+def check_gmm_param(w, mu, va):
+ """Check that w, mu and va are valid parameters for
+ a mixture of gaussian: w should sum to 1, there should
+ be the same number of component in each param, the variances
+ should be positive definite, etc...
+
+ Params:
+ w = vector or list of weigths of the mixture (K elements)
+ mu = matrix: K * d
+ va = list of variances (vector K * d or square matrices Kd * d)
+
+ returns:
+ K = number of components
+ d = dimension
+ mode = 'diag' if diagonal covariance, 'full' of full matrices
+ """
+
+ # Check that w is valid
+ if N.fabs(N.sum(w) - 1) > MAX_DEV:
+ raise GmmParamError('weight does not sum to 1')
+
+ if not len(w.shape) == 1:
+ raise GmmParamError('weight is not a vector')
+
+ # Check that mean and va have the same number of components
+ K = len(w)
+
+ if N.ndim(mu) < 2:
+ msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
+ raise GmmParamError(msg)
+ if N.ndim(va) < 2:
+ msg = """va should be a K,d / K *d, d matrix, and a row vector if
+ only 1 diag comp"""
+ raise GmmParamError(msg)
+
+ (Km, d) = mu.shape
+ (Ka, da) = va.shape
+
+ if not K == Km:
+ msg = "not same number of component in mean and weights"
+ raise GmmParamError(msg)
+
+ if not d == da:
+ msg = "not same number of dimensions in mean and variances"
+ raise GmmParamError(msg)
+
+ if Km == Ka:
+ mode = 'diag'
+ else:
+ mode = 'full'
+ if not Ka == Km*d:
+ msg = "not same number of dimensions in mean and variances"
+ raise GmmParamError(msg)
+
+ return K, d, mode
+
+# For EM on GMM
+def multiple_gauss_den(data, mu, va):
+ """Helper function to generate several Gaussian
+ pdf (different parameters) from the same data"""
+ mu = N.atleast_2d(mu)
+ va = N.atleast_2d(va)
+
+ K = mu.shape[0]
+ n = data.shape[0]
+ d = data.shape[1]
+
+ y = N.zeros((n, K), float)
+ if mu.size == va.size:
+ for i in range(K):
+ y[:, i] = densities.gauss_den(data, mu[i, :], va[i, :])
+ else:
+ for i in range(K):
+ y[:, i] = densities.gauss_den(data, mu[i, :],
+ va[d*i:d*i+d, :])
+
+ return y
+
+def gmm_init_kmean(data, k, mode, init = [], niter = 10):
+ """gmm_init_kmean(data, k, mode, init = [], niter = 10)
+
+ Init EM for GMM with kmean from data, for k components.
+
+ Args:
+ - data: Each row of data is one frame of dimension d.
+ - k: Number of components to look for
+ - mode: Diagonal or Full covariance matrices
+ - init: The initial centroids. The value given for k is
+ ignored, and the number of row in initc is used instead.
+ If initc is not given, then the centroids are initialized
+ with the k first values of data.
+ - niter: Number of iterations in kmean.
+
+ Returns:
+ (w, mu, va), initial parameters for a GMM.
+
+ Method:
+ Each weight is equiprobable, each mean is one centroid returned by kmean, and
+ covariances for component i is initialized with covariance of
+ data corresponding with label i. Other strategies are possible, this one
+ is an easy one"""
+ if len(init) == 0:
+ init = data[0:k, :]
+ else:
+ k = initc.shape[0]
+
+ if data.ndim == 1:
+ d = 1
+ else:
+ d = N.shape(data)[1]
+
+ (code, label) = kmean(data, init, niter)
+
+ w = N.ones(k, float) / k
+ mu = code.copy()
+ if mode == 'diag':
+ va = N.zeros((k, d), float)
+ for i in range(k):
+ for j in range(d):
+ va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
+ elif mode == 'full':
+ va = N.zeros((k*d, d), float)
+ for i in range(k):
+ va[i*d:i*d+d,:] = N.cov(data[N.where(label==i)], rowvar = 0)
+ else:
+ raise GmmParamError("mode " + str(mode) + " not recognized")
+
+ return w, mu, va
+
+def gmm_posterior(data, w, mu, va):
+ """ Computes the latent variable distribution (a
+ posteriori probability) knowing the explicit data
+ for the Gaussian model (w, mu, var): gamma(t, i) =
+ P[state = i | observation = data(t); w, mu, va]
+
+ This is basically the E step of EM for GMM.
+
+ the second returned value is the non normalized version
+ of gamma, and may be needed for some computation,
+ like eg likelihood"""
+ n = data.shape[0]
+ K = len(w)
+
+ # compute the gaussian pdf
+ tgd = multiple_gauss_den(data, mu, va)
+ # multiply by the weight
+ tgd *= w
+ # Normalize to get a pdf
+ gd = tgd / N.sum(tgd, axis=1)[:, N.NewAxis]
+
+ return gd, tgd
+
+# This function is just calling gmm_update and gmm_posterior, with
+# initialization. This is ugly, and we should have a class to model a GMM
+# instead of all this code to try to guess correct values and parameters...
+def gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
+ """
+ gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
+
+ Compute the parameters of a Gaussian Mixture Model using EM algorithm,
+ with initial values w, mu and va (overwritten by the function).
+
+ Args:
+ - data: contains the observed features, one row is one frame, ie one
+ observation of dimension d
+ - niter: number of iterations
+ - mode: 'diag' or 'full', depending on the wanted model for cov
+ matrices.
+ - K: number of components
+ - w, mu, va initial parameters for the GMM. All or none must be given.
+ If no initial values are given, initialized by gmm_init_kmean; if they
+ are given, mode and k are ignored, and guessed from the given parameters
+ instead.
+
+ Returns:
+ w, mu, va, like as found by EM, where like is the likelihood for each
+ iteration.
+ """
+ if len(w) == 0:
+ w, mu, va = gmm_init_kmean(data, k, mode, 5)
+ else:
+ k, d, mode = check_gmm_parm(w, mu, va)
+
+ like = N.zeros(niter, float)
+ for i in range(niter):
+ g, tgd = gmm_posterior(data, w, mu, va)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ w, mu, va = gmm_update(X, g, d, k, mode)
+
+ return w, mu, va, like
+
+def gmm_update(data, gamma, d, K, varmode):
+ """Computes update of the Gaussian Mixture Model (M step)
+ from the a posteriori pdf, computed by gmm_posterior
+ (E step).
+ """
+ n = data.shape[0]
+ invn = 1.0/n
+ mGamma = N.sum(gamma)
+
+ if varmode == 'diag':
+ mu = N.zeros((K, d), float)
+ va = N.zeros((K, d), float)
+ for k in range(K):
+ x = N.sum(N.outerproduct(gamma[:, k],
+ N.ones((1, d))) * data)
+ xx = N.sum(N.outerproduct(gamma[:, k],
+ N.ones((1, d))) * (data ** 2))
+
+ mu[k,:] = x / mGamma[k]
+ va[k,:] = xx / mGamma[k] - mu[k,:] ** 2
+ w = invn * mGamma
+
+ elif varmode == 'full':
+ mu = N.zeros((K, d), float)
+ va = N.zeros((K*d, d), float)
+
+ # This is really slow because of the loop on each frame
+ # ways to improve that: either use 3d matrices and see
+ # the memory usage increasing in n*d*d,
+ # using C (or pyrex, or ...) or using the trick commented
+ # which loops on the dimensions instead. If the number of dimensions
+ # is high, this won't help, though...
+ for k in range(K):
+ x = N.sum(N.outerproduct(gamma[:, k],
+ N.ones((1, d), float)) * data)
+ xx = N.zeros((d, d), float)
+ # for i in range(n):
+ # xx += gamma[i, k] * N.outerproduct(data[i,:],
+ # data[i,:])
+
+ # This should be much faster
+ for i in range(d):
+ for j in range(d):
+ xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[:,k])
+
+ mu[k,:] = x / mGamma[k]
+ va[k*d:k*d+d,:] = xx / mGamma[k] - \
+ N.outerproduct(mu[k,:], mu[k,:])
+ w = invn * mGamma
+ else:
+ raise GmmParamError("varmode not recognized")
+
+ return w, mu, va
+
+# Misc functions
+def gmm_ellipses(mu, va, c = [0, 1], npoints = 100):
+ """Returns a list of ellipses describing the Gmm
+ defined by mu and va. c is the dimension we are projecting
+ the variances on a 2d space.
+
+ Returns:
+ -Xe: a list of x coordinates for the ellipses
+ -Ye: a list of y coordinates for the ellipses
+
+ Example:
+ Suppose we have w, mu and va as parameters for a mixture, then:
+
+ X = gen_gmm(w, mu, va, 1000)
+ Xe, Ye = gmm_ellipses(mu, va)
+ pylab.plot(X[:,0], X[:, 1], '.')
+ for k in len(w):
+ pylab.plot(Xe[k], Ye[k], 'r')
+
+ Will plot samples X draw from the mixture model, and
+ plot the ellipses of equi-probability from the mean with
+ fixed level of confidence 0.39.
+
+ TODO: be able to modify the confidence interval to arbitrary
+ value (to do in gauss_ell)"""
+ K = mu.shape[0]
+ w = N.ones(K, float) / K
+
+ K, d, mode = check_gmm_param(w, mu, va)
+
+ # TODO: adjustable level (to do in gauss_ell).
+ # For now, a level of 0.39 means that we draw
+ # ellipses for 1 standard deviation.
+ Xe = []
+ Ye = []
+ if mode == 'diag':
+ for i in range(K):
+ xe, ye = densities.gauss_ell(mu[i,:], va[i,:], dim = c,
+ npoints = npoints)
+ Xe.append(xe)
+ Ye.append(ye)
+ elif mode == 'full':
+ for i in range(K):
+ xe, ye = densities.gauss_ell(mu[i,:], va[i*d:i*d+d,:], dim = c,
+ npoints = npoints)
+ Xe.append(xe)
+ Ye.append(ye)
+
+ return Xe, Ye
+
+import c_gmm
+def kmean(data, init, iter = 10):
+ """Simple kmean implementation for EM
+
+ returns a tuple (code, label), where code are the final
+ centroids, and label are the class label indec for each
+ frame (ie row) of data"""
+
+ data = N.atleast_2d(data)
+ init = N.atleast_2d(init)
+
+ (n, d) = data.shape
+ (k, d1) = init.shape
+
+ if not d == d1:
+ msg = "data and init centers do not have same dimensions..."
+ raise GmmParamError(msg)
+
+ code = N.asarray(init.copy(), float)
+ for i in range(iter):
+ # Compute the nearest neighbour for each obs
+ # using the current code book
+ label = c_gmm._vq(data, code)
+ # Update the code by computing centroids using the new code book
+ for j in range(k):
+ code[j,:] = N.mean(data[N.where(label==j)])
+
+ return code, label
+
+def _vq(data, code):
+ """ Please do not use directly. Use kmean instead"""
+ # No attempt to be efficient has been made...
+ (n, d) = data.shape
+ (k, d) = code.shape
+
+ label = N.zeros(n, int)
+ for i in range(n):
+ d = N.sum((data[i, :] - code) ** 2, 1)
+ label[i] = N.argmin(d)
+
+ return label
+
+# Test functions usable for now
+def test_kmean():
+ X = N.array([[3.0, 3], [4, 3], [4, 2],
+ [9, 2], [5, 1], [6, 2], [9, 4],
+ [5, 2], [5, 4], [7, 4], [6, 5]])
+
+ initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
+
+ codet1 = N.array([[3.0000, 3.0000],
+ [6.2000, 4.0000],
+ [5.8000, 1.8000]])
+
+ codet2 = N.array([[11.0/3, 8.0/3],
+ [6.7500, 4.2500],
+ [6.2500, 1.7500]])
+
+ code = initc.copy()
+
+ code1 = kmean(X, code, 1)[0]
+ code2 = kmean(X, code, 2)[0]
+
+ import numpy.testing as testing
+ try:
+ testing.assert_array_almost_equal(code1, codet1)
+ print "kmean test 1 succeded"
+ except AssertionError:
+ print "kmean test 1 failed"
+
+ try:
+ testing.assert_array_almost_equal(code2, codet2)
+ print "kmean test 2 succeded"
+ except AssertionError:
+ print "kmean test 2 failed"
+
+# Test functions (unusable by other people for now....)
+def test_gmm_den():
+ """"""
+ import tables
+
+ filename = 'gmm_test.h5'
+
+ k = 3
+ d = 2
+ mode = 'full'
+ w, mu, va = gen_gmm_param(d, k, mode, 2)
+ X = gen_gmm(w, mu, va, 1e3)
+
+ h5file = tables.openFile(filename, "w")
+
+ h5file.createArray(h5file.root, 'mu', mu)
+ h5file.createArray(h5file.root, 'va', va)
+ h5file.createArray(h5file.root, 'X', X)
+
+ Y1 = densities.gauss_den(X, mu[0,:], va[0:2,:])
+ Y2 = densities.gauss_den(X, mu[1,:], va[2:4,:])
+ Y3 = densities.gauss_den(X, mu[2,:], va[4:6,:])
+
+ Y = multiple_gauss_den(X, mu, va)
+
+ h5file.createArray(h5file.root, 'Y', Y)
+ h5file.createArray(h5file.root, 'Y2', Y2)
+ h5file.createArray(h5file.root, 'Y1', Y1)
+ h5file.createArray(h5file.root, 'Y3', Y3)
+
+ h5file.close()
+
+def test_gmm_em():
+ """"""
+ import numpy as N
+ import tables
+ import pylab as P
+ filename = 'data.h5'
+
+ k = 3
+ d = 2
+ mode = 'full'
+ wr, mur, var = gen_gmm_param(d, k, mode, 2)
+ X = gen_gmm(wr, mur, var, 1e3)
+
+ h5file = tables.openFile(filename, "w")
+
+ h5file.createArray(h5file.root, 'wr', wr)
+ h5file.createArray(h5file.root, 'mur', mur)
+ h5file.createArray(h5file.root, 'var', var)
+ h5file.createArray(h5file.root, 'X', X)
+
+ P.plot(X[:, 0], X[:, 1], '.')
+ Xe, Ye = gmm_ellipses(mur, var, npoints = 100)
+ for i in range(k):
+ P.plot(Xe[i], Ye[i], 'g')
+
+ # init EM: simply use kmean
+ initc = X[0:k, :]
+ (code, label) = kmean(X, initc, 10)
+
+ h5file.createArray(h5file.root, 'initc', initc)
+
+ w0 = N.ones(k, float) / k
+ mu0 = code.copy()
+ if mode == 'diag':
+ va0 = N.zeros((k, d), float)
+ for i in range(k):
+ for j in range(d):
+ va0[i,j] = N.cov(X[N.where(label==i), j], rowvar = 0)
+ elif mode == 'full':
+ va0 = N.zeros((k*d, d), float)
+ for i in range(k):
+ va0[i*d:i*d+d,:] = N.cov(X[N.where(label==i)], rowvar = 0)
+
+ h5file.createArray(h5file.root, 'w0', w0)
+ h5file.createArray(h5file.root, 'mu0', mu0)
+ h5file.createArray(h5file.root, 'va0', va0)
+
+
+ # # Use random values
+ # w0 = N.ones(k, float) / k
+ # mu0 = N.randn(k, d)
+ # va0 = N.fabs(N.randn(k, d))
+
+ w = w0.copy()
+ mu = mu0.copy()
+ va = va0.copy()
+
+ X0e, Y0e = gmm_ellipses(mu, va, npoints = 100)
+ for i in range(k):
+ P.plot(X0e[i], Y0e[i], 'k')
+
+ niter = 10
+ like = N.zeros(niter, float)
+
+ for i in range(niter):
+ g, tgd = gmm_posterior(X, w, mu, va)
+ #h5file.createArray(h5file.root, 'g', g)
+ #h5file.createArray(h5file.root, 'tgd', tgd)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ print like[i]
+ w, mu, va = gmm_update(X, g, d, k, mode)
+
+ h5file.createArray(h5file.root, 'w', w)
+ h5file.createArray(h5file.root, 'mu', mu)
+ h5file.createArray(h5file.root, 'va', va)
+
+ X0e, Y0e = gmm_ellipses(mu, va, npoints = 100)
+ for i in range(k):
+ P.plot(X0e[i], Y0e[i], 'r')
+ P.figure()
+ P.plot(like)
+ h5file.close()
+ P.show()
+
+def _bench1():
+ #===========================================
+ # Diag GMM with 5 components of dimension 20
+ #===========================================
+ k = 5
+ d = 20
+ mode = 'diag'
+
+ print "Generating the mixture"
+ # Generate a model with k components, d dimensions
+ wr, mur, var = gen_gmm_param(d, k, mode, 3)
+ X = gen_gmm(wr, mur, var, 1e4)
+
+ print "Init the mixture"
+ # Init the mixture with kmean
+ w0, mu0, va0 = gmm_init_kmean(X, k, mode, niter = 5)
+
+ # # Use random values instead of kmean
+ # w0 = N.ones(k, float) / k
+ # mu0 = N.randn(k, d)
+ # va0 = N.fabs(N.randn(k, d))
+
+ print "EM computing..."
+ # Copy the initial values because we want to draw them later...
+ w = w0.copy()
+ mu = mu0.copy()
+ va = va0.copy()
+
+ # The actual EM, with likelihood computation
+ niter = 10
+ like = N.zeros(niter, float)
+
+ for i in range(niter):
+ print "post"
+ g, tgd = gmm_posterior(X, w, mu, va)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ print "update"
+ w, mu, va = gmm_update(X, g, d, k, mode)
+
+
+def benchmark():
+ import profile
+ profile.run('_bench1()', 'bench1prof')
+
+if __name__ == "__main__":
+ #=============================
+ # Simple GMM with 3 components
+ #=============================
+ import pylab as P
+ k = 4
+ d = 2
+ mode = 'diag'
+
+ print "Generating the mixture"
+ # Generate a model with k components, d dimensions
+ wr, mur, var = gen_gmm_param(d, k, mode, 3)
+ X = gen_gmm(wr, mur, var, 1e3)
+
+ print "Init the mixture"
+ # Init the mixture with kmean
+ w0, mu0, va0 = gmm_init_kmean(X, k, mode, niter = 5)
+
+ # # Use random values instead of kmean
+ # w0 = N.ones(k, float) / k
+ # mu0 = N.randn(k, d)
+ # va0 = N.fabs(N.randn(k, d))
+
+ # Copy the initial values because we want to draw them later...
+ w = w0.copy()
+ mu = mu0.copy()
+ va = va0.copy()
+
+ # The actual EM, with likelihood computation
+ niter = 10
+ like = N.zeros(niter, float)
+
+ print "computing..."
+ for i in range(niter):
+ print "post"
+ g, tgd = gmm_posterior(X, w, mu, va)
+ like[i] = N.sum(N.log(N.sum(tgd, 1)))
+ print "update"
+ w, mu, va = gmm_update(X, g, d, k, mode)
+
+ print "drawing..."
+ # Draw what is happening
+ P.subplot(2, 1, 1)
+ P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
+
+ # Real confidence ellipses
+ Xre, Yre = gmm_ellipses(mur, var)
+ P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
+ for i in range(1,k):
+ P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+
+ # Initial confidence ellipses as found by kmean
+ X0e, Y0e = gmm_ellipses(mu0, va0)
+ P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
+ for i in range(1,k):
+ P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
+
+ # Values found by EM
+ Xe, Ye = gmm_ellipses(mu, va)
+ P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
+ for i in range(1,k):
+ P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
+ P.legend(loc = 0)
+ P.subplot(2, 1, 2)
+ P.plot(like)
+ P.title('log likelihood')
+
+ # # Export the figure
+ # F = P.gcf()
+ # DPI = F.get_dpi()
+ # DefaultSize = F.get_size_inches()
+ # # the default is 100dpi for savefig:
+ # F.savefig("example1.png")
+
+ # # Now make the image twice as big, while keeping the fonts and all the
+ # # same size
+ # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) )
+ # Size = F.get_size_inches()
+ # print "Size in Inches", Size
+ # F.savefig("example2.png")
+ P.show()
Added: trunk/Lib/sandbox/pyem/pyem/src/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/Makefile 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/src/Makefile 2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,26 @@
+CC = colorgcc
+LD = gcc
+
+PYREX = python2.4-pyrexc
+
+PYTHONINC = -I/usr/include/python2.4
+NUMPYINC = -I/usr/lib/python2.4/site-packages/numpy/core/include
+OPTIMS = -O3 -ffast-math -march=pentium4
+WARN = -W -Wall
+
+CFLAGS = $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
+
+c_gmm.so: c_gmm.o
+ $(LD) -shared -o $@ $< -Wl,-soname,$@
+
+c_gmm.o: c_gmm.c
+ $(CC) -c $(CFLAGS) -fPIC $<
+
+c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd
+ $(PYREX) $<
+
+clean:
+ rm -f c_gmm.c
+ rm -f *.o
+ rm -f *.so
+ rm -f *.pyc
Added: trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx 2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,66 @@
+# Last Change: Tue May 16 11:00 AM 2006 J
+
+cimport c_numpy
+cimport c_python
+import numpy
+
+c_numpy.import_array()
+
+# pyrex version of _vq. Much faster in high dimension/high number of K
+# (ie more than 3-4)
+def _vq(data, init):
+ (n, d) = data.shape
+ label = numpy.zeros(n, int)
+ _imp_vq(data, init, label)
+
+ return label
+
+def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label):
+ cdef int n
+ cdef int d
+ cdef int nc
+ cdef int i
+ cdef int j
+ cdef int k
+ cdef int *labeld
+ cdef double *da, *code
+ cdef double dist
+ cdef double acc
+
+ n = data.dimensions[0]
+ d = data.dimensions[1]
+ nc = init.dimensions[0]
+
+ if not data.dtype == numpy.dtype(numpy.float64):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ da = (<double*>data.data)
+
+ if not init.dtype == numpy.dtype(numpy.float64):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ code = (<double*>init.data)
+
+ if not label.dtype == numpy.dtype(numpy.int32):
+ print '_vq not (yet) implemented for dtype %s'%dtype.name
+ return
+ labeld = (<int*>label.data)
+
+ for i from 0<=i<n:
+ acc = 0
+ lab = 0
+ for j from 0<=j<d:
+ acc = acc + (da[i * d + j] - code[j]) * \
+ (da[i * d + j] - code[j])
+ dist = acc
+ for k from 1<=k<nc:
+ acc = 0
+ for j from 0<=j<d:
+ acc = acc + (da[i * d + j] - code[k * d + j]) * \
+ (da[i * d + j] - code[k * d + j])
+ if acc < dist:
+ dist = acc
+ lab = k
+ labeld[i] = lab
+
+ return lab
Added: trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_numpy.pxd 2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,59 @@
+# :Author: Robert Kern
+# :Copyright: 2004, Enthought, Inc.
+# :License: BSD Style
+
+
+cdef extern from "numpy/arrayobject.h":
+ ctypedef enum PyArray_TYPES:
+ PyArray_BOOL
+ PyArray_BYTE
+ PyArray_UBYTE
+ PyArray_SHORT
+ PyArray_USHORT
+ PyArray_INT
+ PyArray_UINT
+ PyArray_LONG
+ PyArray_ULONG
+ PyArray_LONGLONG
+ PyArray_ULONGLONG
+ PyArray_FLOAT
+ PyArray_DOUBLE
+ PyArray_LONGDOUBLE
+ PyArray_CFLOAT
+ PyArray_CDOUBLE
+ PyArray_CLONGDOUBLE
+ PyArray_OBJECT
+ PyArray_STRING
+ PyArray_UNICODE
+ PyArray_VOID
+ PyArray_NTYPES
+ PyArray_NOTYPE
+
+ ctypedef int intp
+
+ ctypedef extern class numpy.dtype [object PyArray_Descr]:
+ cdef int type_num, elsize, alignment
+ cdef char type, kind, byteorder, hasobject
+ cdef object fields, typeobj
+
+ ctypedef extern class numpy.ndarray [object PyArrayObject]:
+ cdef char *data
+ cdef int nd
+ cdef intp *dimensions
+ cdef intp *strides
+ cdef object base
+ cdef dtype descr
+ cdef int flags
+
+ ndarray PyArray_SimpleNew(int ndims, intp* dims, int item_type)
+ int PyArray_Check(object obj)
+ ndarray PyArray_ContiguousFromObject(object obj, PyArray_TYPES type,
+ int mindim, int maxdim)
+ intp PyArray_SIZE(ndarray arr)
+ void *PyArray_DATA(ndarray arr)
+ ndarray PyArray_FromAny(object obj, dtype newtype, int mindim, int maxdim,
+ int requirements, object context)
+ ndarray PyArray_NewFromDescr(object subtype, dtype newtype, int nd, intp* dims,
+ intp* strides, void* data, int flags, object parent)
+
+ void import_array()
Added: trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd 2006-10-12 13:45:05 UTC (rev 2266)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_python.pxd 2006-10-12 13:45:14 UTC (rev 2267)
@@ -0,0 +1,20 @@
+# -*- Mode: Python -*- Not really, but close enough
+
+# Expose as much of the Python C API as we need here
+
+cdef extern from "stdlib.h":
+ ctypedef int size_t
+
+cdef extern from "Python.h":
+ ctypedef int Py_intptr_t
+ void* PyMem_Malloc(size_t)
+ void* PyMem_Realloc(void *p, size_t n)
+ void PyMem_Free(void *p)
+ char* PyString_AsString(object string)
+ object PyString_FromString(char *v)
+ object PyString_InternFromString(char *v)
+ int PyErr_CheckSignals()
+ object PyFloat_FromDouble(double v)
+ void Py_XINCREF(object o)
+ void Py_XDECREF(object o)
+ void Py_CLEAR(object o) # use instead of decref
More information about the Scipy-svn
mailing list