[Scipy-svn] r2944 - trunk/Lib/cluster
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Apr 26 06:28:30 EDT 2007
Author: cdavid
Date: 2007-04-26 05:28:26 -0500 (Thu, 26 Apr 2007)
New Revision: 2944
Modified:
trunk/Lib/cluster/vq.py
Log:
Some minor cosmetic changes
Modified: trunk/Lib/cluster/vq.py
===================================================================
--- trunk/Lib/cluster/vq.py 2007-04-26 10:01:36 UTC (rev 2943)
+++ trunk/Lib/cluster/vq.py 2007-04-26 10:28:26 UTC (rev 2944)
@@ -211,11 +211,16 @@
min_dist[i] gives the distance between the ith observation and its
corresponding code.
"""
- No, Nf = shape(obs) #No = observation count, Nf = feature count
+ d = shape(obs)[1]
+
# code books and observations should have same number of features
- assert(Nf == code_book.shape[1])
- diff = obs[newaxis,:,:]-code_book[:,newaxis,:]
- dist = sqrt(N.sum(diff*diff, -1))
+ if not d == code_book.shape[1]:
+ raise ValueError("""
+ code book(%d) and obs(%d) should have the same
+ number of features (eg columns)""" % (code_book.shape[1], d))
+
+ diff = obs[newaxis, :, :] - code_book[:, newaxis, :]
+ dist = sqrt(N.sum(diff * diff, -1))
code = argmin(dist, 0)
min_dist = minimum.reduce(dist, 0) #the next line I think is equivalent
# - and should be faster
@@ -223,7 +228,7 @@
# much difference.
return code, min_dist
-def kmeans_(obs, guess, thresh=1e-5):
+def _kmeans(obs, guess, thresh=1e-5):
""" "raw" version of kmeans.
:Returns:
@@ -244,37 +249,37 @@
Note: not whitened in this example.
>>> from numpy import array
- >>> from scipy.cluster.vq import kmeans_
+ >>> from scipy.cluster.vq import _kmeans
>>> features = array([[ 1.9,2.3],
... [ 1.5,2.5],
... [ 0.8,0.6],
... [ 0.4,1.8],
... [ 1.0,1.0]])
>>> book = array((features[0],features[2]))
- >>> kmeans_(features,book)
+ >>> _kmeans(features,book)
(array([[ 1.7 , 2.4 ],
[ 0.73333333, 1.13333333]]), 0.40563916697728591)
"""
- code_book = array(guess,copy=True)
+ code_book = array(guess, copy = True)
Nc = code_book.shape[0]
- avg_dist=[]
+ avg_dist = []
diff = thresh+1.
- while diff>thresh:
+ while diff > thresh:
#compute membership and distances between obs and code_book
- obs_code, distort = vq(obs,code_book)
- avg_dist.append(mean(distort,axis=-1))
+ obs_code, distort = vq(obs, code_book)
+ avg_dist.append(mean(distort, axis=-1))
#recalc code_book as centroids of associated obs
if(diff > thresh):
has_members = []
for i in arange(Nc):
- cell_members = compress(equal(obs_code,i),obs,0)
+ cell_members = compress(equal(obs_code, i), obs, 0)
if cell_members.shape[0] > 0:
- code_book[i] = mean(cell_members,0)
+ code_book[i] = mean(cell_members, 0)
has_members.append(i)
#remove code_books that didn't have any members
- code_book = take(code_book,has_members,0)
+ code_book = take(code_book, has_members, 0)
if len(avg_dist) > 1:
diff = avg_dist[-2] - avg_dist[-1]
#print avg_dist
@@ -306,6 +311,10 @@
distortion : float
The distortion between the observations and the codes.
+ :SeeAlso:
+ - kmeans2: similar function, but with more options for initialization,
+ and returns label of each observation
+
Examples
--------
@@ -340,21 +349,20 @@
if int(iter) < 1:
raise ValueError, 'iter must be >= to 1.'
if type(k_or_guess) == type(array([])):
- guess = k_or_guess
- result = kmeans_(obs,guess,thresh=thresh)
+ guess = k_or_guess
+ result = _kmeans(obs, guess, thresh = thresh)
else:
- best_dist = 100000 #initialize best distance value to a large value
+ #initialize best distance value to a large value
+ best_dist = 100000
No = obs.shape[0]
k = k_or_guess
#print 'kmeans iter: ',
for i in range(iter):
- #print i,
#the intial code book is randomly selected from observations
- guess = take(obs,randint(0,No,k),0)
- book,dist = kmeans_(obs,guess,thresh=thresh)
+ guess = take(obs, randint(0, No, k), 0)
+ book, dist = _kmeans(obs, guess, thresh = thresh)
if dist < best_dist:
best_book = book
best_dist = dist
- #print
- result = best_book,best_dist
+ result = best_book, best_dist
return result
More information about the Scipy-svn
mailing list