[Scipy-svn] r2294 - trunk/Lib/sandbox/pyem

scipy-svn at scipy.org
Mon Oct 23 05:52:11 EDT 2006


Author: cdavid
Date: 2006-10-23 04:52:04 -0500 (Mon, 23 Oct 2006)
New Revision: 2294

Added:
   trunk/Lib/sandbox/pyem/example2.py
Modified:
   trunk/Lib/sandbox/pyem/Changelog
   trunk/Lib/sandbox/pyem/example.py
   trunk/Lib/sandbox/pyem/gmm_em.py
   trunk/Lib/sandbox/pyem/info.py
Log:
 * pyem: added GMM.bic method to compute the Bayesian Information Criterion
 for automatic model selection + various docstring fixes
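
In outline, the new method is used like this (a sketch stitched together from
the GM/GMM/EM calls exercised in example2.py below; the parameter values are
illustrative):

    import numpy as N
    from scipy.sandbox.pyem import GM, GMM, EM

    # Sample from a known 2-dimensional, 3-component diagonal mixture
    w, mu, va = GM.gen_param(2, 3, 'diag', spread = 1.5)
    data      = GM.fromvalues(w, mu, va).sample(1000)

    # Fit candidate models with 1 to 6 components, keeping the BIC of each
    bics = N.zeros(6)
    for i in range(6):
        gmm = GMM(GM(2, i + 1, 'diag'), 'kmean')
        EM().train(data, gmm, maxiter = 30, thresh = 1e-10)
        bics[i] = gmm.bic(data)

    # With BIC defined as 2*ln(L) - k*ln(n), the best model maximizes it
    print "BIC picks %d components" % (N.argmax(bics) + 1)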



Modified: trunk/Lib/sandbox/pyem/Changelog
===================================================================
--- trunk/Lib/sandbox/pyem/Changelog	2006-10-23 07:39:07 UTC (rev 2293)
+++ trunk/Lib/sandbox/pyem/Changelog	2006-10-23 09:52:04 UTC (rev 2294)
@@ -1,3 +1,10 @@
+pyem (0.5.5) Mon, 23 Oct 2006 18:48:15 +0900
+
+	* Bump to 0.5.5
+	* Added bic method to GMM.
+	* A few fixes for docstrings
+	* Added bibliography to the doc
+
 pyem (0.5.4) Fri, 20 Oct 2006 12:52:01 +0900
 
 	* Bump to 0.5.4

Modified: trunk/Lib/sandbox/pyem/example.py
===================================================================
--- trunk/Lib/sandbox/pyem/example.py	2006-10-23 07:39:07 UTC (rev 2293)
+++ trunk/Lib/sandbox/pyem/example.py	2006-10-23 09:52:04 UTC (rev 2294)
@@ -29,7 +29,7 @@
 nframes = 1e3
 
 #+++++++++++++++++++++++++++++++++++++++++++
-# Create an artificial GMM model, samples it
+# Create an artificial GM model and sample it
 #+++++++++++++++++++++++++++++++++++++++++++
 w, mu, va   = GM.gen_param(d, k, mode, spread = 1.5)
 gm          = GM.fromvalues(w, mu, va)
@@ -80,7 +80,7 @@
     # Values found by EM
     h   = lgm.plot(level = level)
     [i.set_color('r') for i in h]
-    h[0].set_label('kmean confidence ellipsoides')
+    h[0].set_label('EM confidence ellipsoids')
 
     P.legend(loc = 0)
 else:

Added: trunk/Lib/sandbox/pyem/example2.py
===================================================================
--- trunk/Lib/sandbox/pyem/example2.py	2006-10-23 07:39:07 UTC (rev 2293)
+++ trunk/Lib/sandbox/pyem/example2.py	2006-10-23 09:52:04 UTC (rev 2294)
@@ -0,0 +1,104 @@
+#! /usr/bin/env python
+
+# Example of use of the pyem toolbox. Feel free to change parameters
+# such as the dimension, the number of components or the covariance mode.
+#
+# You can also try less trivial things such as adding outliers, or sampling
+# a mixture with full covariances and estimating it with a mixture of diagonal
+# gaussians (replace the mode of the learned model lgm)
+#
+# This example uses the Bayesian Information Criterion (BIC) to select the
+# number of components automatically.
+
+import numpy as N
+from numpy.random import seed
+
+from scipy.sandbox.pyem import GM, GMM, EM
+import copy
+
+seed(2)
+#+++++++++++++++++++++++++++++
+# Meta parameters of the model
+#   - k: Number of components
+#   - d: dimension of each Gaussian
+#   - mode: Mode of covariance matrix: full or diag (string)
+#   - nframes: number of frames (frame = one data point = one
+#   row of d elements)
+k       = 4 
+d       = 2
+mode    = 'diag'
+nframes = 1e3
+
+#+++++++++++++++++++++++++++++++++++++++++++
+# Create an artificial GM model and sample it
+#+++++++++++++++++++++++++++++++++++++++++++
+w, mu, va   = GM.gen_param(d, k, mode, spread = 1.0)
+gm          = GM.fromvalues(w, mu, va)
+
+# Sample nframes frames from the model
+data    = gm.sample(nframes)
+
+#++++++++++++++++++++++++
+# Learn the model with EM
+#++++++++++++++++++++++++
+
+lgm     = []
+kmax    = 6
+bics    = N.zeros(kmax)
+for i in range(kmax):
+    # Init the model with an empty Gaussian Mixture, and create a Gaussian 
+    # Mixture Model from it
+    lgm.append(GM(d, i+1, mode))
+    gmm = GMM(lgm[i], 'kmean')
+
+    # The actual EM, with likelihood computation. The threshold
+    # is compared to the (linearly approximated) derivative of the likelihood
+    em      = EM()
+    em.train(data, gmm, maxiter = 30, thresh = 1e-10)
+    bics[i] = gmm.bic(data)
+
+print "Original model has %d clusters, bics says %d" % (k, N.argmax(bics)+1) 
+
+#+++++++++++++++
+# Draw the model
+#+++++++++++++++
+import pylab as P
+P.subplot(3, 2, 1)
+
+for k in range(kmax):
+    P.subplot(3, 2, k+1)
+    # Level is the confidence level for confidence ellipsoids: 1.0 means that
+    # all points will be (almost surely) inside the ellipsoid
+    level   = 0.8
+    if not d == 1:
+        P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+
+        # h keeps the handles of the plot, so that you can modify 
+        # its parameters like label or color
+        h   = lgm[k].plot(level = level)
+        [i.set_color('r') for i in h]
+        h[0].set_label('EM confidence ellipsoids')
+
+        h   = gm.plot(level = level)
+        [i.set_color('g') for i in h]
+        h[0].set_label('Real confidence ellipsoids')
+    else:
+        # The 1d plotting function is quite elaborate: the confidence
+        # intervals are represented by filled areas, and the pdf of the
+        # mixture and of each component are drawn (optional)
+        h   = gm.plot1d(level = level)
+        [i.set_color('g') for i in h['pdf']]
+        h['pdf'][0].set_label('true pdf')
+
+        hl  = lgm[k].plot1d(fill = 1, level = level)
+        [i.set_color('r') for i in hl['pdf']]
+        hl['pdf'][0].set_label('pdf found by EM')
+
+        P.legend(loc = 0)
+
+P.legend(loc = 0)
+P.show()
+# P.save('2d diag.png')

Modified: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py	2006-10-23 07:39:07 UTC (rev 2293)
+++ trunk/Lib/sandbox/pyem/gmm_em.py	2006-10-23 09:52:04 UTC (rev 2294)
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Fri Oct 06 05:00 PM 2006 J
+# Last Change: Mon Oct 23 06:00 PM 2006 J
 
 # TODO:
 #   - which methods to avoid va shrinking to 0 ? There are several options, 
@@ -84,6 +84,8 @@
 
         self.gm.set_param(w, mu, va)
 
+        self.isinit = True
+
     def init_random(self, data):
         """ Init the model at random."""
         k   = self.gm.k
@@ -97,6 +99,8 @@
                     mode %s yet""", mode)
 
         self.gm.set_param(w, mu, va)
+        
+        self.isinit = True
 
     # TODO: 
     #   - format of parameters ? For variances, list of variances matrix,
@@ -115,6 +119,7 @@
             raise GmmParamError('init method %s not recognized' % str(init))
 
         self.init   = init_methods[init]
+        self.isinit = False
 
     def sufficient_statistics(self, data):
         """ Return normalized and non-normalized sufficient statistics
@@ -188,6 +193,53 @@
 
         self.gm.set_param(w, mu, va)
 
+    def likelihood(self, data):
+        """ Returns the current log likelihood of the model given
+        the data """
+        assert(self.isinit)
+        # compute the gaussian pdf
+        tgd	= multiple_gauss_den(data, self.gm.mu, self.gm.va)
+        # multiply by the weight
+        tgd	*= self.gm.w
+
+        return N.sum(N.log(N.sum(tgd, axis = 1)), axis = 0)
+
+    def bic(self, data):
+        """ Returns the BIC (Bayesian Information Criterion), 
+        also called Schwarz information criterion. Can be used 
+        to choose between different models which have different
+        number of clusters. The BIC is defined as:
+
+        BIC = 2 * ln(L) - k * ln(n)
+
+        where:
+            * ln(L) is the log-likelihood of the estimated model
+            * k is the number of degrees of freedom
+            * n is the number of frames
+        
+        Note that, depending on the literature, BIC may be defined as the
+        opposite of the definition given here. """
+
+        if self.gm.mode == 'diag':
+            """ for a diagonal model, we have
+            k - 1 (k weights, minus one for the normalization constraint)
+            + k * d (means) + k * d (variances) """
+            free_deg    = self.gm.k * (self.gm.d * 2 + 1) - 1
+        elif self.gm.mode == 'full':
+            """ for a full model, we have
+            k - 1 (k weigths, but one constraint of normality)
+            + k * d (means) + k * d * d / 2 (each covariance matrice
+            has d **2 params, but with positivity constraint) """
+            if self.gm.d == 1:
+                free_deg    = self.gm.k * 3 - 1
+            else:
+                free_deg    = self.gm.k * (self.gm.d + 1 + self.gm.d ** 2 / 2) - 1
+
+        lk  = self.likelihood(data)
+        n   = N.shape(data)[0]
+        return bic(lk, free_deg, n)
+
+
 class EM:
     """An EM trainer. An EM trainer
     trains from data, with a model
@@ -234,6 +286,10 @@
         return like
     
 # Misc functions
+def bic(lk, deg, n):
+    """ Expects lk to be log likelihood """
+    return 2 * lk - deg * N.log(n)
+
 def has_em_converged(like, plike, thresh):
     """ given likelihood of current iteration like and previous
     iteration plike, returns true if converged: based on comparison
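
For reference, the parameter counting behind bic amounts to this standalone
sketch (the helper names are illustrative, not part of the module):

    import math

    def n_free_params(k, d, mode):
        """ Free parameters of a k-component, d-dimensional Gaussian mixture. """
        if mode == 'diag':
            # (k - 1) weights + k*d means + k*d diagonal variances
            return k * (2 * d + 1) - 1
        elif mode == 'full':
            # (k - 1) weights + k*d means + k*d*(d+1)/2 covariance entries
            # (a symmetric d x d matrix has d*(d+1)/2 free entries)
            return k * (d + d * (d + 1) // 2 + 1) - 1

    def bic_value(log_lk, k, d, n, mode):
        # Same sign convention as the bic function above: higher means better
        return 2 * log_lk - n_free_params(k, d, mode) * math.log(n)

    # e.g. the diag, d = 2, k = 3 model of the examples has
    # 3 * (2*2 + 1) - 1 = 14 free parameters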

Modified: trunk/Lib/sandbox/pyem/info.py
===================================================================
--- trunk/Lib/sandbox/pyem/info.py	2006-10-23 07:39:07 UTC (rev 2293)
+++ trunk/Lib/sandbox/pyem/info.py	2006-10-23 09:52:04 UTC (rev 2294)
@@ -7,30 +7,60 @@
 (diagonal and full covariance matrices), Gaussian mixtures, Gaussian mixtures models
 and an Em trainer.
 
-More specifically, the module contains the following file:
+More specifically, the module defines the following classes and functions:
 
-- densities.py: functions to compute multivariate Gaussian pdf and ellipsoides of
-confidence (gauss_den)
-- gauss_mix.py: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can be
+- densities.gauss_den: function to compute multivariate Gaussian pdf 
+- gauss_mix.GM: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can be
 created from its parameters weights, mean and variances, or from its meta parameters
 d (dimension of the Gaussian) and k (number of components in the mixture). A Gaussian
 Model can then be sampled or plotted (if d>1, plot confidence ellipsoids projected on 
 2 chosen dimensions, if d == 1, plot the pdf of each component and fill the zone
 of confidence for a given level)
-- gmm_em.py: defines a class GMM (Gaussian Mixture Model). This class is constructed
+- gmm_em.GMM: defines a class GMM (Gaussian Mixture Model). This class is constructed
 from a GM model gm, and can be used to train gm. The GMM can be initialized by
 kmean or at random, and can compute sufficient statistics and update its parameters
 from the sufficient statistics.
-- kmean.py: implements a kmean algorithm. We cannot use scipy.cluster.vq kmeans, since
+- kmean.kmean: implements a kmean algorithm. We cannot use scipy.cluster.vq kmeans, since
 it does not give membership of observations.
 
-Example of use: simply execute gmm_em.py for an example of training a GM and plotting
-the results compared to true values
+Example of use: 
+    #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    # Create an artificial 2-dimensional, 3-cluster GM model and sample it
+    #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    w, mu, va   = GM.gen_param(2, 3, 'diag', spread = 1.5)
+    gm          = GM.fromvalues(w, mu, va)
 
+    # Sample 1000 frames from the model
+    data    = gm.sample(1000)
+
+    #++++++++++++++++++++++++
+    # Learn the model with EM
+    #++++++++++++++++++++++++
+    # Init the model
+    lgm = GM(2, 3, 'diag')
+    gmm = GMM(lgm, 'kmean')
+
+    # The actual EM, with likelihood computation. The threshold
+    # is compared to the (linearly approximated) derivative of the likelihood
+    em      = EM()
+    like    = em.train(data, gmm, maxiter = 30, thresh = 1e-8)
+
+The files example.py and example2.py show more capabilities of the toolbox,
+including plotting (using matplotlib) and model selection using the Bayesian
+Information Criterion (BIC).
+
+Bibliography:
+    * A. P. Dempster, N. M. Laird and D. B. Rubin, "Maximum likelihood from
+    incomplete data via the EM algorithm", Journal of the Royal Statistical
+    Society, Series B, 39(1):1--38, 1977
+    * S. J. Roberts, D. Husmeier, I. Rezek and W. Penny, "Bayesian Approaches
+    to Gaussian Mixture Modelling", IEEE Transactions on Pattern Analysis and
+    Machine Intelligence, 1998
+
 Copyright: David Cournapeau 2006
 License: BSD-style (see LICENSE.txt in main source directory)
 """
-version = '0.5.4'
+version = '0.5.5'
 
 depends = ['linalg', 'stats']
 ignore  = False



