[Scipy-svn] r4780 - in branches/spatial/scipy/spatial: . tests

scipy-svn at scipy.org scipy-svn at scipy.org
Sun Oct 5 04:55:43 EDT 2008


Author: peridot
Date: 2008-10-05 03:55:42 -0500 (Sun, 05 Oct 2008)
New Revision: 4780

Modified:
   branches/spatial/scipy/spatial/kdtree.py
   branches/spatial/scipy/spatial/tests/test_kdtree.py
Log:
Added distance matrix implementation, sparse distance matrix, and a few comments.


Modified: branches/spatial/scipy/spatial/kdtree.py
===================================================================
--- branches/spatial/scipy/spatial/kdtree.py	2008-10-05 08:01:17 UTC (rev 4779)
+++ branches/spatial/scipy/spatial/kdtree.py	2008-10-05 08:55:42 UTC (rev 4780)
@@ -2,6 +2,7 @@
 # Released under the scipy license
 import numpy as np
 from heapq import heappush, heappop
+import scipy.sparse
 
 def distance_p(x,y,p=2):
     """Compute the pth power of the L**p distance between x and y
@@ -46,7 +47,12 @@
         return np.prod(self.maxes-self.mins)
     
     def split(self, d, split):
-        """Produce two hyperrectangles by splitting along axis d."""
+        """Produce two hyperrectangles by splitting along axis d.
+        
+        In general, if you need to compute maximum and minimum
+        distances to the children, it can be done more efficiently
+        by updating the maximum and minimum distances to the parent.
+        """ # FIXME: do this
         mid = np.copy(self.maxes)
         mid[d] = split
         less = Rectangle(self.mins, mid)
@@ -572,3 +578,98 @@
         else:
             raise ValueError("r must be either a single value or a one-dimensional array of values")
         
+    def sparse_distance_matrix(self, other, max_distance, p=2.):
+        """Compute a sparse distance matrix
+
+        Computes a distance matrix between two KDTrees, leaving as zero
+        any distance greater than max_distance.
+
+        Parameters
+        ==========
+
+        other : KDTree
+
+        max_distance : positive float
+
+        Returns
+        =======
+
+        result : dok_matrix
+            Sparse matrix representing the results in "dictionary of keys" format.
+        """
+        result = scipy.sparse.dok_matrix((self.n,other.n))
+
+        def traverse(node1, rect1, node2, rect2):
+            if rect1.min_distance_rectangle(rect2, p)>max_distance:
+                return
+            elif isinstance(node1, KDTree.leafnode):
+                if isinstance(node2, KDTree.leafnode):
+                    for i in node1.idx:
+                        for j in node2.idx:
+                            d = distance(self.data[i],other.data[j],p)
+                            if d<=max_distance:
+                                result[i,j] = d
+                else:
+                    less, greater = rect2.split(node2.split_dim, node2.split)
+                    traverse(node1,rect1,node2.less,less)
+                    traverse(node1,rect1,node2.greater,greater)
+            elif isinstance(node2, KDTree.leafnode):
+                less, greater = rect1.split(node1.split_dim, node1.split)
+                traverse(node1.less,less,node2,rect2)
+                traverse(node1.greater,greater,node2,rect2)
+            else:
+                less1, greater1 = rect1.split(node1.split_dim, node1.split)
+                less2, greater2 = rect2.split(node2.split_dim, node2.split)
+                traverse(node1.less,less1,node2.less,less2)
+                traverse(node1.less,less1,node2.greater,greater2)
+                traverse(node1.greater,greater1,node2.less,less2)
+                traverse(node1.greater,greater1,node2.greater,greater2)
+        traverse(self.tree, Rectangle(self.maxes, self.mins),
+                 other.tree, Rectangle(other.maxes, other.mins))
+
+        return result
+
+
+def distance_matrix(x,y,p=2,threshold=1000000):
+    """Compute the distance matrix.
+
+    Computes the matrix of all pairwise distances.
+
+    Parameters
+    ==========
+
+    x : array-like, m by k
+    y : array-like, n by k
+    p : float 1<=p<=infinity
+        Which Minkowski p-norm to use.
+    threshold : positive integer
+        If m*n*k>threshold use a python loop instead of creating
+        a very large temporary.
+
+    Returns
+    =======
+
+    result : array-like, m by n
+
+
+    """
+
+    x = np.asarray(x)
+    m, k = x.shape
+    y = np.asarray(y)
+    n, kk = y.shape
+
+    if k != kk:
+        raise ValueError("x contains %d-dimensional vectors but y contains %d-dimensional vectors" % (k, kk))
+
+    if m*n*k <= threshold:
+        return distance(x[:,np.newaxis,:],y[np.newaxis,:,:],p)
+    else:
+        result = np.empty((m,n),dtype=np.float) #FIXME: figure out the best dtype
+        if m<n:
+            for i in range(m):
+                result[i,:] = distance(x[i],y,p)
+        else:
+            for j in range(n):
+                result[:,j] = distance(x,y[j],p)
+        return result

Modified: branches/spatial/scipy/spatial/tests/test_kdtree.py
===================================================================
--- branches/spatial/scipy/spatial/tests/test_kdtree.py	2008-10-05 08:01:17 UTC (rev 4779)
+++ branches/spatial/scipy/spatial/tests/test_kdtree.py	2008-10-05 08:55:42 UTC (rev 4780)
@@ -3,7 +3,7 @@
 from numpy.testing import *
 
 import numpy as np
-from scipy.spatial import KDTree, distance, Rectangle
+from scipy.spatial import KDTree, distance, Rectangle, distance_matrix
 
 class ConsistencyTests:
     def test_nearest(self):
@@ -337,6 +337,43 @@
         for r,result in zip(rs, results):
             assert_equal(self.T1.count_neighbors(self.T2, r), result)
 
+class test_sparse_distance_matrix:
+    def setUp(self):
+        n = 100
+        k = 4
+        self.T1 = KDTree(np.random.randn(n,k),leafsize=2)
+        self.T2 = KDTree(np.random.randn(n,k),leafsize=2)
+        self.r = 0.3
 
+    def test_consistency_with_neighbors(self):
+        M = self.T1.sparse_distance_matrix(self.T2, self.r)
+        r = self.T1.query_ball_tree(self.T2, self.r)
+        for i,l in enumerate(r):
+            for j in l:
+                assert_equal(M[i,j],distance(self.T1.data[i],self.T2.data[j]))
+        for ((i,j),d) in M.items():
+            assert j in r[i]
+
+def test_distance_matrix():
+    m = 10
+    n = 11
+    k = 4
+    xs = np.random.randn(m,k)
+    ys = np.random.randn(n,k)
+    ds = distance_matrix(xs,ys)
+    assert_equal(ds.shape, (m,n))
+    for i in range(m):
+        for j in range(n):
+            assert_equal(distance(xs[i],ys[j]),ds[i,j])
+def test_distance_matrix_looping():
+    m = 10
+    n = 11
+    k = 4
+    xs = np.random.randn(m,k)
+    ys = np.random.randn(n,k)
+    ds = distance_matrix(xs,ys)
+    dsl = distance_matrix(xs,ys,threshold=1)
+    assert_equal(ds,dsl)
+
 if __name__=="__main__":
     run_module_suite()




More information about the Scipy-svn mailing list