[Scipy-svn] r2945 - in trunk/Lib/cluster: . tests
scipy-svn at scipy.org
scipy-svn at scipy.org
Thu Apr 26 08:24:26 EDT 2007
Author: cdavid
Date: 2007-04-26 07:24:16 -0500 (Thu, 26 Apr 2007)
New Revision: 2945
Modified:
trunk/Lib/cluster/tests/test_vq.py
trunk/Lib/cluster/vq.py
Log:
Add kmeans2, a more sophisticated kmeans implementation (different initialization methods available)
Modified: trunk/Lib/cluster/tests/test_vq.py
===================================================================
--- trunk/Lib/cluster/tests/test_vq.py 2007-04-26 10:28:26 UTC (rev 2944)
+++ trunk/Lib/cluster/tests/test_vq.py 2007-04-26 12:24:16 UTC (rev 2945)
@@ -1,7 +1,7 @@
#! /usr/bin/env python
# David Cournapeau
-# Last Change: Thu Apr 26 05:00 PM 2007 J
+# Last Change: Thu Apr 26 09:00 PM 2007 J
# For now, just copy the tests from sandbox.pyem, so we can check that
# kmeans works OK for trivial examples.
@@ -12,7 +12,7 @@
import numpy as N
set_package_path()
-from cluster.vq import kmeans, kmeans_, py_vq, py_vq2
+from cluster.vq import kmeans, kmeans2, py_vq, py_vq2, _py_vq_1d
restore_path()
#Optional:
@@ -60,22 +60,60 @@
except ImportError:
print "== Error while importing _vq, not testing C imp of vq =="
+ #def check_vq_1d(self, level=1):
+ # data = X[:, 0]
+ # initc = data[:3]
+ # code = initc.copy()
+ # print _py_vq_1d(data, initc)
+
class test_kmean(NumpyTestCase):
- def check_kmeans(self, level=1):
+ def check_kmeans_simple(self, level=1):
initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
code = initc.copy()
- #code1 = kmeans(X, code, iter = 1)[0]
+ code1 = kmeans(X, code, iter = 1)[0]
- #assert_array_almost_equal(code1, CODET2)
+ assert_array_almost_equal(code1, CODET2)
def check_kmeans_lost_cluster(self, level=1):
"""This will cause kmean to have a cluster with no points."""
data = N.fromfile(open(DATAFILE1), sep = ", ")
data = data.reshape((200, 2))
- initk = N.array([[-1.8127404, -0.67128041], [ 2.04621601, 0.07401111],
+ initk = N.array([[-1.8127404, -0.67128041],
+ [ 2.04621601, 0.07401111],
[-2.31149087,-0.05160469]])
res = kmeans(data, initk)
+ def check_kmeans2_simple(self, level=1):
+ """Testing simple call to kmeans2 and its results."""
+ initc = N.concatenate(([[X[0]], [X[1]], [X[2]]]))
+ code = initc.copy()
+ code1 = kmeans2(X, code, niter = 1)[0]
+ code2 = kmeans2(X, code, niter = 2)[0]
+
+ assert_array_almost_equal(code1, CODET1)
+ assert_array_almost_equal(code2, CODET2)
+
+ #def check_kmeans2_rank1(self, level=1):
+ # """Testing simple call to kmeans2 with rank 1 data."""
+ # data = N.fromfile(open(DATAFILE1), sep = ", ")
+ # data = data.reshape((200, 2))
+ # data1 = data[:, 0]
+ # data2 = data[:, 1]
+
+ # initc = data1[:3]
+ # code = initc.copy()
+ # print _py_vq_1d(data1, code)
+ # code1 = kmeans2(data1, code, niter = 1)[0]
+ # code2 = kmeans2(data1, code, niter = 2)[0]
+
+ def check_kmeans2_init(self, level = 1):
+ """Testing that kmeans2 init methods work."""
+ data = N.fromfile(open(DATAFILE1), sep = ", ")
+ data = data.reshape((200, 2))
+
+ kmeans2(data, 3, minit = 'random')
+ kmeans2(data, 3, minit = 'points')
+
if __name__ == "__main__":
NumpyTest().run()
Modified: trunk/Lib/cluster/vq.py
===================================================================
--- trunk/Lib/cluster/vq.py 2007-04-26 10:28:26 UTC (rev 2944)
+++ trunk/Lib/cluster/vq.py 2007-04-26 12:24:16 UTC (rev 2945)
@@ -19,6 +19,7 @@
__all__ = ['whiten', 'vq', 'kmeans']
+import warnings
from numpy.random import randint
from numpy import shape, zeros, sqrt, argmin, minimum, array, \
@@ -146,7 +147,7 @@
:Parameters:
obs : ndarray
- Expect a rank 2 array. Each row is one observation.
+ Expects a rank 2 array. Each row is one observation.
code_book : ndarray
Code book to use. Same format as obs. Should have the same number of
features (eg columns) as obs.
@@ -168,10 +169,18 @@
"""
# n = number of observations
# d = number of features
- (n, d) = shape(obs)
+ if N.ndim(obs) == 1:
+ if not N.ndim(obs) == N.ndim(code_book):
+ raise ValueError("Observation and code_book should have the same rank")
+ else:
+ return _py_vq_1d(obs, code_book)
+ else:
+ (n, d) = shape(obs)
- # code books and observations should have same number of features
- if not d == code_book.shape[1]:
+ # code books and observations should have same number of features and same shape
+ if not N.ndim(obs) == N.ndim(code_book):
+ raise ValueError("Observation and code_book should have the same rank")
+ elif not d == code_book.shape[1]:
raise ValueError("""
code book(%d) and obs(%d) should have the same
number of features (eg columns)""" % (code_book.shape[1], d))
@@ -185,6 +194,35 @@
return code, sqrt(min_dist)
+def _py_vq_1d(obs, code_book):
+ """ Python version of vq algorithm for rank 1 only.
+
+ :Parameters:
+ obs : ndarray
+ Expects a rank 1 array. Each item is one observation.
+ code_book : ndarray
+ Code book to use. Same format as obs. Should be rank 1 too.
+
+ :Returns:
+ code : ndarray
+ code[i] gives the label of the ith observation; that is, its code is
+ code_book[code[i]].
+ min_dist : ndarray
+ min_dist[i] gives the distance between the ith observation and its
+ corresponding code.
+ """
+ raise RuntimeError("_py_vq_1d buggy, do not use rank 1 arrays for now")
+ n = obs.size
+ nc = code_book.size
+ dist = N.zeros((n, nc))
+ for i in range(nc):
+ dist[:, i] = N.sum(obs - code_book[i])
+ print dist
+ code = argmin(dist)
+ min_dist= dist[code]
+
+ return code, sqrt(min_dist)
+
def py_vq2(obs, code_book):
"""2nd Python version of vq algorithm.
@@ -366,3 +404,147 @@
best_dist = dist
result = best_book, best_dist
return result
+
+def _kpoints(data, k):
+ """Pick k points at random in data (one row = one observation).
+
+ This is done by taking the first k values of a random permutation of 1..N
+ where N is the number of observations.
+
+ :Parameters:
+ data : ndarray
+ Expect a rank 1 or 2 array. Rank 1 are assumed to describe one
+ dimensional data, rank 2 multidimensional data, in which case one
+ row is one observation.
+ k : int
+ Number of samples to generate.
+ """
+ if data.ndim > 1:
+ n = data.shape[0]
+ else:
+ n = data.size
+
+ p = N.random.permutation(n)
+ x = data[p[:k], :].copy()
+
+ return x
+
+def _krandinit(data, k):
+ """Returns k samples of a random variable which parameters depend on data.
+
+ More precisely, it returns k observations sampled from a Gaussian random
+ variable whose mean and covariance are estimated from the data.
+
+ :Parameters:
+ data : ndarray
+ Expect a rank 1 or 2 array. Rank 1 are assumed to describe one
+ dimensional data, rank 2 multidimensional data, in which case one
+ row is one observation.
+ k : int
+ Number of samples to generate.
+ """
+ mu = N.mean(data, 0)
+ cov = N.cov(data, rowvar = 0)
+
+ # k rows, d cols (one row = one obs)
+ # Generate k sample of a random variable ~ Gaussian(mu, cov)
+ x = N.random.randn(k, mu.size)
+ x = N.dot(x, N.linalg.cholesky(cov).T) + mu
+
+ return x
+
+_valid_init_meth = {'random': _krandinit, 'points': _kpoints}
+
+def kmeans2(data, k, minit = 'random', niter = 10):
+ """Classify a set of points into k clusters using kmean algorithm.
+
+ The algorithm works by minimizing the euclidian distance between data points
+ of cluster means. This version is more complete than kmean (has several
+ initialisation methods).
+
+ :Parameters:
+ data : ndarray
+ Expect a rank 1 or 2 array. Rank 1 are assumed to describe one
+ dimensional data, rank 2 multidimensional data, in which case one
+ row is one observation.
+ k : int or ndarray
+ Number of clusters. If a ndarray is given instead, it is
+ interpreted as initial cluster to use instead.
+ minit : string
+ Method for initialization. Available methods are random, points and
+ uniform:
+
+ random uses k points drawn from a Gaussian random generator whose
+ mean and variances are estimated from the data.
+
+ points chooses k points at random from the points in data.
+
+ uniform chooses k points from the data such that they form a uniform
+ grid of the dataset.
+
+ niter : int
+ Number of iterations to run.
+
+ :Returns:
+ clusters : ndarray
+ the found clusters (one cluster per row).
+ label : ndarray
+ label[i] gives the label of the ith observation; that is, its centroid is
+ cluster[label[i]].
+
+ """
+ # If data is rank 1, then we have 1 dimension problem.
+ nd = N.ndim(data)
+ if nd == 1:
+ d = 1
+ elif nd == 2:
+ d = data.shape[1]
+ else:
+ raise ValueError("Input of rank > 2 not supported")
+
+ # If k is not a single value, then it should be compatible with data's
+ # shape
+ if N.size(k) > 1:
+ if not nd == N.ndim(k):
+ raise ValueError("k is not an int and has not same rank than data")
+ if d == 1:
+ nc = len(k)
+ else:
+ (nc, dc) = k.shape
+ if not dc == d:
+ raise ValueError("k is not an int and has not same rank than\
+ data")
+ clusters = k.copy()
+ else:
+ nc = k
+ try:
+ init = _valid_init_meth[minit]
+ except KeyError:
+ raise ValueError("unknown init method %s" % str(minit))
+ clusters = init(data, k)
+
+ assert not niter == 0
+ return _kmeans2(data, clusters, niter, nc)
+
+def _kmeans2(data, code, niter, nc):
+ """ "raw" version of kmeans2. Do not use directly.
+
+ Run kmeans with a given initial codebook.
+
+ :undocumented
+ """
+ for i in range(niter):
+ # Compute the nearest neighbour for each obs
+ # using the current code book
+ label = vq(data, code)[0]
+ # Update the code by computing centroids using the new code book
+ for j in range(nc):
+ mbs = N.where(label==j)
+ if mbs[0].size > 0:
+ code[j, :] = N.mean(data[mbs], axis=0)
+ else:
+ warnings.warn("one cluster has no member anymore ! You should"\
+ " rerun kmean with different initialization !")
+
+ return code, label
+
More information about the Scipy-svn
mailing list