[Scipy-svn] r2278 - in trunk/Lib/sandbox/pyem: . pyem pyem/src
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Oct 12 09:47:44 EDT 2006
Author: cdavid
Date: 2006-10-12 08:47:30 -0500 (Thu, 12 Oct 2006)
New Revision: 2278
Added:
trunk/Lib/sandbox/pyem/pyem/_c_densities.py
trunk/Lib/sandbox/pyem/pyem/src/c_gden.c
Modified:
trunk/Lib/sandbox/pyem/.bzrignore
trunk/Lib/sandbox/pyem/Changelog
trunk/Lib/sandbox/pyem/MANIFEST.in
trunk/Lib/sandbox/pyem/TODO
trunk/Lib/sandbox/pyem/pyem/densities.py
trunk/Lib/sandbox/pyem/pyem/gauss_mix.py
trunk/Lib/sandbox/pyem/pyem/gmm_em.py
trunk/Lib/sandbox/pyem/pyem/kmean.py
trunk/Lib/sandbox/pyem/pyem/profile_densities.py
trunk/Lib/sandbox/pyem/pyem/profile_gmm.py
trunk/Lib/sandbox/pyem/pyem/src/Makefile
Log:
[pyem @ david at ar.media.kyoto-u.ac.jp-20060828130328-12bff151dfc6cefc]
Various cleanup, include ctypes version of diag gden; update profile
function to include ctypes version
David Cournapeau <david at ar.media.kyoto-u.ac.jp> | 2006-08-28 22:03:28 +0900 (Mon, 28 Aug 2006)
Modified: trunk/Lib/sandbox/pyem/.bzrignore
===================================================================
--- trunk/Lib/sandbox/pyem/.bzrignore 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/.bzrignore 2006-10-12 13:47:30 UTC (rev 2278)
@@ -12,3 +12,10 @@
gmmprof
valgrind-python.supp
valgrind-python.supp
+pyem/
+pyem/matcode/
+pyem/tmp/
+pyem/tmp/kmean.py
+pyem/tmp/blop.py
+pyem/tmp/
+pyem/tmp
Modified: trunk/Lib/sandbox/pyem/Changelog
===================================================================
--- trunk/Lib/sandbox/pyem/Changelog 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/Changelog 2006-10-12 13:47:30 UTC (rev 2278)
@@ -1,3 +1,10 @@
+pyem (0.5.2) Mon, 28 Aug 2006 13:20:13 +0900
+
+ * Add a plot method to Gm class, so that it is easier
+ to plot a GM model interactively (needs matplotlib)
+
+-- David Cournapeau <david at ar.media.kyoto-u.ac.jp>
+
pyem (0.5.2) Thu, 24 Aug 2006 19:42:28 +0900
* put version to 0.5.2
Modified: trunk/Lib/sandbox/pyem/MANIFEST.in
===================================================================
--- trunk/Lib/sandbox/pyem/MANIFEST.in 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/MANIFEST.in 2006-10-12 13:47:30 UTC (rev 2278)
@@ -3,3 +3,4 @@
include Changelog
include TODO
include LICENSE.txt
+exclude pyem/_c_densities.py
Modified: trunk/Lib/sandbox/pyem/TODO
===================================================================
--- trunk/Lib/sandbox/pyem/TODO 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/TODO 2006-10-12 13:47:30 UTC (rev 2278)
@@ -1,15 +1,18 @@
-# Last Change: Tue Aug 22 10:00 PM 2006 J
+# Last Change: Mon Aug 28 10:00 PM 2006 J
-Things which must be implemented for a 1.0 version:
- - test for various length and model size.
+Things which must be implemented for a 1.0 version (in importante order)
+ - test for various length and model size
- a small help/tutorial
+ - be complient with scipy module dev guidelines (DEVELOPERS.TXT)
+ - On-line EM
- use scipy.cluster.vq instead of custom kmeans algo (find
- a way to get label from vq.kmeans)
+ a way to get label from vq.kmeans: depends on scipy changes)
+ - C implementation of Gaussian densities at least (partially done,
+ but need integration into distutils + portable way of detecting/
+ loading shared library with ctypes)
Things which would be nice:
- - C implementation of Gaussian densities at least.
- Integrate libem (libem should be modified so
that it would be easy to package and distribute)
- - On-line EM
- Other initialization schemes
- Other mixture models
Added: trunk/Lib/sandbox/pyem/pyem/_c_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/_c_densities.py 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/_c_densities.py 2006-10-12 13:47:30 UTC (rev 2278)
@@ -0,0 +1,186 @@
+#! /usr/bin/python
+#
+# Copyrighted David Cournapeau
+# Last Change: Mon Aug 28 06:00 PM 2006 J
+
+# This module uses a C implementation through ctypes, for diagonal cases
+# TODO:
+# - portable way to find/open the shared library
+# - full cov matrice
+
+import numpy as N
+import numpy.linalg as lin
+from numpy.random import randn
+from scipy.stats import chi2
+import densities as D
+
+# Error classes
+class DenError(Exception):
+ """Base class for exceptions in this module.
+
+ Attributes:
+ expression -- input expression in which the error occurred
+ message -- explanation of the error"""
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ return self.message
+
+# The following function do all the fancy stuff to check that parameters
+# are Ok, and call the right implementation if args are OK.
+def gauss_den(x, mu, va, log = False):
+ """ Compute multivariate Gaussian density at points x for
+ mean mu and variance va.
+
+ Vector are row vectors, except va which can be a matrix
+ (row vector variance for diagonal variance)
+
+ If log is True, than the log density is returned
+ (useful for underflow ?)"""
+ mu = N.atleast_2d(mu)
+ va = N.atleast_2d(va)
+ x = N.atleast_2d(x)
+
+ #=======================#
+ # Checking parameters #
+ #=======================#
+ if len(N.shape(mu)) != 2:
+ raise DenError("mu is not rank 2")
+
+ if len(N.shape(va)) != 2:
+ raise DenError("va is not rank 2")
+
+ if len(N.shape(x)) != 2:
+ raise DenError("x is not rank 2")
+
+ (n, d) = x.shape
+ (dm0, dm1) = mu.shape
+ (dv0, dv1) = va.shape
+
+ # Check x and mu same dimension
+ if dm0 != 1:
+ msg = "mean must be a row vector!"
+ raise DenError(msg)
+ if dm1 != d:
+ msg = "x and mu not same dim"
+ raise DenError(msg)
+ # Check va and mu same size
+ if dv1 != d:
+ msg = "mu and va not same dim"
+ raise DenError(msg)
+ if dv0 != 1 and dv0 != d:
+ msg = "va not square"
+ raise DenError(msg)
+
+ #===============#
+ # Computation #
+ #===============#
+ if d == 1:
+ # scalar case
+ return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log)
+ elif dv0 == 1:
+ # Diagonal matrix case
+ return _diag_gauss_den(x, mu, va, log)
+ elif dv1 == dv0:
+ # full case
+ return _full_gauss_den(x, mu, va, log)
+ else:
+ raise DenError("variance mode not recognized, this is a bug")
+
+# Those 3 functions do almost all the actual computation
+def _scalar_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ ** Expect centered data (ie with mean removed) **
+
+ Call gauss_den instead"""
+ d = mu.size
+ inva = 1/va
+ fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+ y = ((x-mu) ** 2) * -0.5 * inva
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + log(fac)
+
+ return y
+
+def _diag_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in scalar case. It assumes all args
+ are conformant, so it should not be used directly
+
+ ** Expect centered data (ie with mean removed) **
+
+ Call gauss_den instead"""
+ # Diagonal matrix case
+ d = mu.size
+ n = x.shape[0]
+ if not log:
+ from ctypes import cdll, c_uint, c_int, c_double, POINTER
+ _gden = cdll.LoadLibrary('src/libgden.so')
+ _gden.gden_diag.restype = c_int
+ _gden.gden_diag.argtypes = [POINTER(c_double), c_uint, c_uint,
+ POINTER(c_double), POINTER(c_double), POINTER(c_double)]
+
+ y = N.zeros(n)
+ inva= 1/va
+ _gden.gden_diag(x.ctypes.data_as(POINTER(c_double)),
+ n, d,
+ mu.ctypes.data_as(POINTER(c_double)),
+ inva.ctypes.data_as(POINTER(c_double)),
+ y.ctypes.data_as(POINTER(c_double)))
+ else:
+ y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log)
+ for i in range(1, d):
+ y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log)
+ return y
+
+def _full_gauss_den(x, mu, va, log):
+ """ This function is the actual implementation
+ of gaussian pdf in full matrix case.
+
+ It assumes all args are conformant, so it should
+ not be used directly Call gauss_den instead
+
+ ** Expect centered data (ie with mean removed) **
+
+ Does not check if va is definite positive (on inversible
+ for that matter), so the inverse computation and/or determinant
+ would throw an exception."""
+ d = mu.size
+ inva = lin.inv(va)
+ fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))
+
+ # we are using a trick with sum to "emulate"
+ # the matrix multiplication inva * x without any explicit loop
+ y = N.dot((x-mu), inva)
+ y = -0.5 * N.sum(y * (x-mu), 1)
+
+ if not log:
+ y = fac * N.exp(y)
+ else:
+ y = y + N.log(fac)
+
+ return y
+
+if __name__ == "__main__":
+ #=========================================
+ # Test accuracy between pure and C python
+ #=========================================
+ mu = N.array([2.0, 3])
+ va = N.array([5.0, 3])
+
+ # Generate a multivariate gaussian of mean mu and covariance va
+ nframes = 1e4
+ X = randn(nframes, 2)
+ Yc = N.dot(N.diag(N.sqrt(va)), X.transpose())
+ Yc = Yc.transpose() + mu
+
+ Y = D.gauss_den(Yc, mu, va)
+ Yt = gauss_den(Yc, mu, va)
+
+ print "Diff is " + str(N.sqrt(N.sum((Y-Yt) ** 2))/nframes/2)
Modified: trunk/Lib/sandbox/pyem/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/densities.py 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/densities.py 2006-10-12 13:47:30 UTC (rev 2278)
@@ -1,7 +1,7 @@
#! /usr/bin/python
#
# Copyrighted David Cournapeau
-# Last Change: Tue Aug 22 08:00 PM 2006 J
+# Last Change: Mon Aug 28 12:00 PM 2006 J
import numpy as N
import numpy.linalg as lin
Modified: trunk/Lib/sandbox/pyem/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gauss_mix.py 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/gauss_mix.py 2006-10-12 13:47:30 UTC (rev 2278)
@@ -1,5 +1,5 @@
# /usr/bin/python
-# Last Change: Tue Aug 22 09:00 PM 2006 J
+# Last Change: Mon Aug 28 01:00 PM 2006 J
# Module to implement GaussianMixture class.
@@ -18,6 +18,12 @@
#
# For now, we have to init with meta-parameters, and set
# the parameters afterward. There should be a better way ?
+
+# TODO:
+# - change bounds methods of GM class instanciations so that it cannot
+# be used as long as w, mu and va are not set
+# - We have to use scipy now for chisquare pdf, so there may be other
+# methods to be used, ie for implementing random index.
class GmParamError:
"""Exception raised for errors in gmm params
@@ -178,6 +184,20 @@
gen_param = classmethod(gen_param)
+ def plot(self, *args, **kargs):
+ """Plot the ellipsoides directly for the model
+
+ Returns a list of lines, so that their style can be modified. By default,
+ the style is red color, and nolegend for all of them"""
+ k = self.k
+ Xe, Ye = self.conf_ellipses(*args, **kargs)
+ try:
+ import pylab as P
+ return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in range(k)]
+ #for i in range(k):
+ # P.plot(Xe[i], Ye[i], 'r')
+ except ImportError:
+ raise GmParamError("matplotlib not found, cannot plot...")
# Function to generate a random index: this is kept outside any class,
# as the function can be useful for other
@@ -274,17 +294,16 @@
# Sample nframes frames from the model
X = gm.sample(nframes)
- # Plot
+ # Plot the data
import pylab as P
-
P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
# Real confidence ellipses with confidence level
- level = 0.39
- Xre, Yre = gm.conf_ellipses(level = level)
- P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides at level %.2f' % level)
- for i in range(1,k):
- P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+ level = 0.50
+ h = gm.plot(level=level)
+ # set the first ellipse label, which will appear in the legend
+ h[0].set_label('confidence ell at level ' + str(level))
+
P.legend(loc = 0)
P.show()
Modified: trunk/Lib/sandbox/pyem/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gmm_em.py 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/gmm_em.py 2006-10-12 13:47:30 UTC (rev 2278)
@@ -1,9 +1,14 @@
# /usr/bin/python
-# Last Change: Thu Aug 24 02:00 PM 2006 J
+# Last Change: Mon Aug 28 05:00 PM 2006 J
+# TODO:
+# - which methods to avoid va shrinking to 0 ?
+# - online EM
+
import numpy as N
import numpy.linalg as lin
from numpy.random import randn
+#import _c_densities as densities
import densities
from kmean import kmean
from gauss_mix import GM
@@ -42,7 +47,7 @@
class ExpMixtureModel(MixtureModel):
"""Class to model mixture of exponential pdf (eg Gaussian, exponential, Laplace,
etc..). This is a special case because some parts of EM are common for those
- modelsi..."""
+ models..."""
pass
class GMM(ExpMixtureModel):
@@ -150,10 +155,6 @@
va = N.zeros((k, d))
gamma = gamma.transpose()
for c in range(k):
- # x = N.sum(N.outerproduct(gamma[:, k],
- # N.ones((1, d))) * data)
- # xx = N.sum(N.outerproduct(gamma[:, k],
- # N.ones((1, d))) * (data ** 2))
x = N.dot(gamma[c:c+1,:], data)[0,:]
xx = N.dot(gamma[c:c+1,:], data ** 2)[0,:]
@@ -222,6 +223,11 @@
return like
+class OnlineEM:
+ "An online EM trainer. "
+ def __init__(self):
+ raise GmmError("not implemented yet")
+
# Misc functions
def multiple_gauss_den(data, mu, va):
"""Helper function to generate several Gaussian
@@ -282,13 +288,10 @@
print "Init a model for learning, with kmean for initialization"
lgm = GM(d, k, mode)
gmm = GMM(lgm, 'kmean')
-
gmm.init(data)
+
# Keep the initialized model for drawing
gm0 = copy.copy(lgm)
- print gm0.w
- print gm0.mu
- print gm0.va
# The actual EM, with likelihood computation
niter = 10
Modified: trunk/Lib/sandbox/pyem/pyem/kmean.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/kmean.py 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/kmean.py 2006-10-12 13:47:30 UTC (rev 2278)
@@ -1,8 +1,46 @@
# /usr/bin/python
-# Last Change: Thu Aug 24 08:00 PM 2006 J
+# Last Change: Mon Aug 28 09:00 PM 2006 J
import numpy as N
+def _py_vq(data, code):
+ """ Please do not use directly. Use kmean instead"""
+ # No attempt to be efficient has been made...
+ (n, d) = data.shape
+ (k, d) = code.shape
+
+ label = N.zeros(n, int)
+ for i in range(n):
+ d = N.sum((data[i, :] - code) ** 2, 1)
+ label[i] = N.argmin(d)
+
+ return label
+
+# Try to import pyrex function for vector quantization. If not available,
+# falls back on pure python implementation.
+#%KMEANIMPORT%
+#try:
+# from scipy.cluster.vq import kmeans as kmean
+#except ImportError:
+# try:
+# from c_gmm import _vq
+# except:
+# print """c_gmm._vq not found, using pure python implementation instead.
+# Kmean will be REALLY slow"""
+# _vq = _py_vq
+try:
+ from sccipy.cluster.vq import vq
+ print "using scipy.cluster.vq"
+ def _vq(*args, **kw): return vq(*args, **kw)[0]
+except ImportError:
+ try:
+ from c_gmm import _vq
+ print "using pyrex vq"
+ except ImportError:
+ print """c_gmm._vq not found, using pure python implementation instead.
+ Kmean will be REALLY slow"""
+ _vq = _py_vq
+
def kmean(data, init, iter = 10):
"""Simple kmean implementation for EM
@@ -31,38 +69,6 @@
return code, label
-def _py_vq(data, code):
- """ Please do not use directly. Use kmean instead"""
- # No attempt to be efficient has been made...
- (n, d) = data.shape
- (k, d) = code.shape
-
- label = N.zeros(n, int)
- for i in range(n):
- d = N.sum((data[i, :] - code) ** 2, 1)
- label[i] = N.argmin(d)
-
- return label
-
-# Try to import pyrex function for vector quantization. If not available,
-# falls back on pure python implementation.
-#%KMEANIMPORT%
-#try:
-# from scipy.cluster.vq import kmeans as kmean
-#except ImportError:
-# try:
-# from c_gmm import _vq
-# except:
-# print """c_gmm._vq not found, using pure python implementation instead.
-# Kmean will be REALLY slow"""
-# _vq = _py_vq
-try:
- from c_gmm import _vq
-except:
- print """c_gmm._vq not found, using pure python implementation instead.
- Kmean will be REALLY slow"""
- _vq = _py_vq
-
# Test functions usable for now
def test_kmean():
X = N.array([[3.0, 3], [4, 3], [4, 2],
Modified: trunk/Lib/sandbox/pyem/pyem/profile_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/profile_densities.py 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/profile_densities.py 2006-10-12 13:47:30 UTC (rev 2278)
@@ -1,16 +1,16 @@
import numpy as N
from numpy.random import randn
import densities as D
+import _c_densities as DC
import tables
-def bench1(mode = 'diag'):
+def bench(func, mode = 'diag'):
#===========================================
# Diag Gaussian of dimension 20
#===========================================
- d = 20
- n = 1e4
- niter = 20
- mode = 'diag'
+ d = 30
+ n = 1e5
+ niter = 100
# Generate a model with k components, d dimensions
mu = randn(1, d)
@@ -23,7 +23,7 @@
X = randn(n, d)
print "Compute %d times densities, %d dimension, %d frames" % (niter, d, n)
for i in range(niter):
- Y = D.gauss_den(X, mu, va)
+ Y = func(X, mu, va)
# Check values
h5file = tables.openFile('diag.dat', "r")
@@ -31,17 +31,27 @@
mu = h5file.root.mu.read()
va = h5file.root.va.read()
Yt = h5file.root.output.read()
- Y = D.gauss_den(X, mu, va)
+ Y = func(X, mu, va)
try:
N.testing.assert_array_almost_equal(Y, Yt)
except AssertionError:
+ print N.sum(Y)
print N.sqrt(N.sum((Y-Yt) **2)) / n
raise "Not accurate !!!!"
+def benchpy():
+ bench(D.gauss_den)
+
+def benchc():
+ bench(DC.gauss_den)
+
if __name__ == "__main__":
import profile
- profile.run('bench1()', 'gdenprof')
import pstats
+ profile.run('benchpy()', 'gdenprof')
p = pstats.Stats('gdenprof')
print p.sort_stats('cumulative').print_stats(20)
+ profile.run('benchc()', 'gdenprof')
+ p = pstats.Stats('gdenprof')
+ print p.sort_stats('cumulative').print_stats(20)
Modified: trunk/Lib/sandbox/pyem/pyem/profile_gmm.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/profile_gmm.py 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/profile_gmm.py 2006-10-12 13:47:30 UTC (rev 2278)
@@ -42,6 +42,7 @@
print "computing..."
for i in range(niter):
+ print "iter %d" % i
g, tgd = gmm.sufficient_statistics(data)
like[i] = N.sum(N.log(N.sum(tgd, 1)))
gmm.update_em(data, g)
Modified: trunk/Lib/sandbox/pyem/pyem/src/Makefile
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/Makefile 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/src/Makefile 2006-10-12 13:47:30 UTC (rev 2278)
@@ -5,7 +5,7 @@
PYTHONINC = -I/usr/include/python2.4
NUMPYINC = -I/usr/lib/python2.4/site-packages/numpy/core/include
-OPTIMS = -O2 -ffast-math -march=pentium4
+OPTIMS = -O3 -funroll-all-loops -ffast-math -march=pentium4
WARN = -W -Wall
CFLAGS = $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN)
Added: trunk/Lib/sandbox/pyem/pyem/src/c_gden.c
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_gden.c 2006-10-12 13:47:25 UTC (rev 2277)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_gden.c 2006-10-12 13:47:30 UTC (rev 2278)
@@ -0,0 +1,33 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <math.h>
+
+int gden_diag(const double *in, const size_t n, const size_t d,
+ const double* mu, const double* inva, double* out)
+{
+ size_t j, i;
+ double fac = 1.0;
+ double acc, tmp;
+
+ /*
+ * Cache some precomputing
+ */
+ for(j = 0; j < d; ++j) {
+ fac *= sqrt(inva[j] / (2 * M_PI) );
+ //printf("inva[%d] is %f, fac %f\n", j, inva[j], fac);
+ }
+
+ /*
+ * actual computing
+ */
+ for(i = 0; i < n; ++i) {
+ acc = 0;
+ for(j = 0; j < d; ++j) {
+ tmp = (in[d * i + j] - mu[j]);
+ acc += tmp * tmp * -0.5 * inva[j];
+ }
+ out[i] = exp(acc) * fac;
+ }
+
+ return 0;
+}
More information about the Scipy-svn
mailing list