[Scipy-svn] r2944 - trunk/Lib/cluster

Thu Apr 26 06:28:30 EDT 2007

Author: cdavid
Date: 2007-04-26 05:28:26 -0500 (Thu, 26 Apr 2007)
New Revision: 2944

Modified:
   trunk/Lib/cluster/vq.py
Log:
Some minor cosmetic changes

Modified: trunk/Lib/cluster/vq.py
===================================================================

--- trunk/Lib/cluster/vq.py	2007-04-26 10:01:36 UTC (rev 2943)
+++ trunk/Lib/cluster/vq.py	2007-04-26 10:28:26 UTC (rev 2944)
@@ -211,11 +211,16 @@
             min_dist[i] gives the distance between the ith observation and its
             corresponding code.
     """
-    No, Nf = shape(obs) #No = observation count, Nf = feature count
+    d = shape(obs)[1]
+
     # code books and observations should have same number of features
-    assert(Nf == code_book.shape[1])
-    diff = obs[newaxis,:,:]-code_book[:,newaxis,:]
-    dist = sqrt(N.sum(diff*diff, -1))
+    if not d == code_book.shape[1]:
+        raise ValueError("""
+            code book(%d) and obs(%d) should have the same 
+            number of features (eg columns)""" % (code_book.shape[1], d))
+    
+    diff = obs[newaxis, :, :] - code_book[:, newaxis, :]
+    dist = sqrt(N.sum(diff * diff, -1))
     code = argmin(dist, 0)
     min_dist = minimum.reduce(dist, 0) #the next line I think is equivalent
                                       #  - and should be faster
@@ -223,7 +228,7 @@
                                   # much difference.
     return code, min_dist
 
-def kmeans_(obs, guess, thresh=1e-5):
+def _kmeans(obs, guess, thresh=1e-5):
     """ "raw" version of kmeans.
 
     :Returns:
@@ -244,37 +249,37 @@
     Note: not whitened in this example.
 
     >>> from numpy import array
-    >>> from scipy.cluster.vq import kmeans_
+    >>> from scipy.cluster.vq import _kmeans
     >>> features  = array([[ 1.9,2.3],
     ...                    [ 1.5,2.5],
     ...                    [ 0.8,0.6],
     ...                    [ 0.4,1.8],
     ...                    [ 1.0,1.0]])
     >>> book = array((features[0],features[2]))
-    >>> kmeans_(features,book)
+    >>> _kmeans(features,book)
     (array([[ 1.7       ,  2.4       ],
            [ 0.73333333,  1.13333333]]), 0.40563916697728591)
 
     """
 
-    code_book = array(guess,copy=True)
+    code_book = array(guess, copy = True)
     Nc = code_book.shape[0]
-    avg_dist=[]
+    avg_dist = []
     diff = thresh+1.
-    while diff>thresh:
+    while diff > thresh:
         #compute membership and distances between obs and code_book
-        obs_code, distort = vq(obs,code_book)
-        avg_dist.append(mean(distort,axis=-1))
+        obs_code, distort = vq(obs, code_book)
+        avg_dist.append(mean(distort, axis=-1))
         #recalc code_book as centroids of associated obs
         if(diff > thresh):
             has_members = []
             for i in arange(Nc):
-                cell_members = compress(equal(obs_code,i),obs,0)
+                cell_members = compress(equal(obs_code, i), obs, 0)
                 if cell_members.shape[0] > 0:
-                    code_book[i] = mean(cell_members,0)
+                    code_book[i] = mean(cell_members, 0)
                     has_members.append(i)
             #remove code_books that didn't have any members
-            code_book = take(code_book,has_members,0)
+            code_book = take(code_book, has_members, 0)
         if len(avg_dist) > 1:
             diff = avg_dist[-2] - avg_dist[-1]
     #print avg_dist
@@ -306,6 +311,10 @@
         distortion : float
             The distortion between the observations and the codes.
 
+    :SeeAlso:
+        - kmeans2: a similar function that offers more options for
+          initialization and also returns the label of each observation
+
     Examples
     --------
 
@@ -340,21 +349,20 @@
     if int(iter) < 1:
         raise ValueError, 'iter must be >= to 1.'
     if type(k_or_guess) == type(array([])):
-        guess = k_or_guess
-        result = kmeans_(obs,guess,thresh=thresh)
+        guess   = k_or_guess
+        result  = _kmeans(obs, guess, thresh = thresh)
     else:
-        best_dist = 100000 #initialize best distance value to a large value
+        #initialize best distance value to a large value
+        best_dist = 100000
         No = obs.shape[0]
         k = k_or_guess
         #print 'kmeans iter: ',
         for i in range(iter):
-            #print i,
             #the initial code book is randomly selected from observations
-            guess = take(obs,randint(0,No,k),0)
-            book,dist = kmeans_(obs,guess,thresh=thresh)
+            guess       = take(obs, randint(0, No, k), 0)
+            book, dist  = _kmeans(obs, guess, thresh = thresh)
             if dist < best_dist:
                 best_book = book
                 best_dist = dist
-        #print
-        result = best_book,best_dist
+        result = best_book, best_dist
     return result