[pypy-svn] r23430 - in pypy/dist/pypy/tool/algo: . test

arigo at codespeak.net arigo at codespeak.net
Fri Feb 17 00:20:05 CET 2006


Author: arigo
Date: Fri Feb 17 00:20:02 2006
New Revision: 23430

Added:
   pypy/dist/pypy/tool/algo/BB.sml
   pypy/dist/pypy/tool/algo/fset.py   (contents, props changed)
   pypy/dist/pypy/tool/algo/test/test_fset.py   (contents, props changed)
Log:
Check-in for reference.  We'll see if using this really makes
pypy.jit.hintannotator faster on large inputs, or if we need
to be more clever.


Added: pypy/dist/pypy/tool/algo/BB.sml
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/tool/algo/BB.sml	Fri Feb 17 00:20:02 2006
@@ -0,0 +1,287 @@
+(*
+    Copyright 1992-1996 Stephen Adams.
+
+    This software may be used freely provided that:
+      1. This copyright notice is attached to any copy, derived work,
+         or work including all or part of this software.
+      2. Any derived work must contain a prominent notice stating that
+         it has been altered from the original.
+
+*)
+
+(* Address:  Electronics & Computer Science
+             University of Southampton
+	     Southampton  SO9 5NH
+	     Great Britain
+   E-mail:   sra at ecs.soton.ac.uk
+
+   Comments:
+
+     1.  The implementation is based on Binary search trees of Bounded
+         Balance, similar to Nievergelt & Reingold, SIAM J. Computing
+         2(1), March 1973.  The main advantage of these trees is that
+         they keep the size of the tree in the node, giving a constant
+         time size operation.
+
+     2.  The bounded balance criterion is simpler than N&R's alpha.
+         Simply, one subtree must not have more than `weight' times as
+         many elements as the opposite subtree.  Rebalancing is
+         guaranteed to reinstate the criterion for weight>2.23, but
+         the occasional incorrect behaviour for weight=2 is not
+         detrimental to performance.
+
+     3.  There are two implementations of union.  The default,
+         hedge_union, is much more complex and usually 20% faster.  I
+         am not sure that the performance increase warrants the
+         complexity (and time it took to write), but I am leaving it
+         in for the competition.  It is derived from the original
+         union by replacing the split_lt(gt) operations with a lazy
+         version. The `obvious' version is called old_union.
+*)
+
+structure B (*: INTSET*) =
+    struct
+
+	local
+
+	    type T = int
+	    val lt : T*T->bool = op <
+
+	    (* weight is a parameter to the rebalancing process. *)
+	    val weight:int = 3
+
+	    datatype  Set = E | T of T * int * Set * Set
+
+	    (* size: number of elements in a set.  The empty set E has size 0;
+	       for a node the count stored in its second field is returned, so
+	       no traversal is needed. *)
+	    fun size E = 0
+	      | size (T(_,n,_,_)) = n
+	    
+	    (* N(v,l,r): build a node with root v and subtrees l and r, filling
+	       in the element count.  No rebalancing is performed.  The
+	       commented-out one-liner below is the equivalent definition using
+	       size; the four-case version reads the counts directly from the
+	       patterns instead. *)
+	    (*fun N(v,l,r) = T(v,1+size(l)+size(r),l,r)*)
+	    fun N(v,E,              E)               = T(v,1,E,E)
+	      | N(v,E,              r as T(_,n,_,_)) = T(v,n+1,E,r)
+	      | N(v,l as T(_,n,_,_),E)               = T(v,n+1,l,E)
+	      | N(v,l as T(_,n,_,_),r as T(_,m,_,_)) = T(v,n+m+1,l,r)
+
+	    (* single_L: single left rotation.  The root b of the right
+	       subtree becomes the new root; the old root a keeps x and adopts
+	       b's left subtree y.  Raises Match if the right subtree is E. *)
+	    fun single_L (a,x,T(b,_,y,z)) = N(b,N(a,x,y),z)
+	      | single_L _ = raise Match
+	    (* single_R: single right rotation, the mirror image of single_L.
+	       The root a of the left subtree becomes the new root.  Raises
+	       Match if the left subtree is E. *)
+	    fun single_R (b,T(a,_,x,y),z) = N(a,x,N(b,y,z))
+	      | single_R _ = raise Match
+	    (* double_L: double left rotation.  b, the root of the left child
+	       of the right subtree, becomes the new root with a and c as its
+	       children.  Raises Match unless the right subtree and its left
+	       child are both non-empty. *)
+	    fun double_L (a,w,T(c,_,T(b,_,x,y),z)) = N(b,N(a,w,x),N(c,y,z))
+	      | double_L _ = raise Match
+	    (* double_R: double right rotation, the mirror image of double_L.
+	       b, the root of the right child of the left subtree, becomes the
+	       new root.  Raises Match unless the left subtree and its right
+	       child are both non-empty. *)
+	    fun double_R (c,T(a,_,w,T(b,_,x,y)),z) = N(b,N(a,w,x),N(c,y,z))
+	      | double_R _ = raise Match
+
+	    (* T'(v,l,r): rebalancing constructor.  Builds a tree with root v
+	       and subtrees l and r, applying a single or double rotation when
+	       one side is too heavy relative to the other.  The clauses are
+	       tried in order: first the cases where at least one subtree is
+	       E, then the general case where both are non-empty.
+	       NOTE(review): presumably intended for inputs that are at most
+	       slightly out of balance (as produced by add, delete, delmin) -
+	       confirm against the reference paper. *)
+	    fun T' (v,E,E) = T(v,1,E,E)
+	      | T' (v,E,r as T(_,_,E,E))     = T(v,2,E,r)
+	      | T' (v,l as T(_,_,E,E),E)     = T(v,2,l,E)
+
+	      (* one side empty, the other a two-element chain that bends
+	         inwards: a double rotation yields a three-node tree *)
+	      | T' (p as (_,E,T(_,_,T(_,_,_,_),E))) = double_L p
+	      | T' (p as (_,T(_,_,E,T(_,_,_,_)),E)) = double_R p
+
+	      (* these cases almost never happen with small weight*)
+	      | T' (p as (_,E,T(_,_,T(_,ln,_,_),T(_,rn,_,_)))) =
+		if ln<rn then single_L p else double_L p
+	      | T' (p as (_,T(_,_,T(_,ln,_,_),T(_,rn,_,_)),E)) =
+		if ln>rn then single_R p else double_R p
+
+	      (* one side empty, the other has only an outer child *)
+	      | T' (p as (_,E,T(_,_,E,_)))  = single_L p
+	      | T' (p as (_,T(_,_,_,E),E))  = single_R p
+
+	      (* general case: both subtrees non-empty; compare their sizes
+	         against the weight parameter *)
+	      | T' (p as (v,l as T(lv,ln,ll,lr),r as T(rv,rn,rl,rr))) =
+		if rn>=weight*ln then (*right is too big*)
+		    let val rln = size rl
+			val rrn = size rr
+		    in
+			if rln < rrn then  single_L p  else  double_L p
+		    end
+		    
+		else if ln>=weight*rn then  (*left is too big*)
+		    let val lln = size ll
+			val lrn = size lr
+		    in
+			if lrn < lln then  single_R p  else  double_R p
+		    end
+
+		else
+	             T(v,ln+rn+1,l,r)
+
+	    (* add(set,x): return a set containing x and every element of set.
+	       Descends left or right according to lt and rebuilds the path
+	       with T'.  If x is already present the original set is returned
+	       unchanged. *)
+	    fun add (E,x) = T(x,1,E,E)
+	      | add (set as T(v,_,l,r),x) =
+	        if lt(x,v) then T'(v,add(l,x),r)
+		else if lt(v,x) then T'(v,l,add(r,x))
+		     else set
+
+	    (* concat3(l,v,r): join the tree l, the element v and the tree r
+	       into one tree, whatever the relative sizes of l and r.  When
+	       one tree is more than weight times larger than the other, v
+	       and the smaller tree are pushed down into the larger one and
+	       the path is rebuilt with T'; otherwise a plain node is built.
+	       NOTE(review): assumes every element of l is less than v and
+	       every element of r is greater than v - confirm at call sites. *)
+	    fun concat3 (E,v,r) = add(r,v)
+	      | concat3 (l,v,E) = add(l,v)
+	      | concat3 (l as T(v1,n1,l1,r1), v, r as T(v2,n2,l2,r2)) =
+		if weight*n1 < n2 then T'(v2,concat3(l,v,l2),r2)
+		else if weight*n2 < n1 then T'(v1,l1,concat3(r1,v,r))
+		     else N(v,l,r)
+
+	    (* split_lt(t,x): the set of elements of t that are strictly less
+	       than x.  x itself need not be a member of t. *)
+	    fun split_lt (E,x) = E
+	      | split_lt (t as T(v,_,l,r),x) =
+		if lt(x,v) then split_lt(l,x)
+		else if lt(v,x) then concat3(l,v,split_lt(r,x))
+		     else l
+
+	    (* split_gt(t,x): the set of elements of t that are strictly
+	       greater than x.  Mirror image of split_lt. *)
+	    fun split_gt (E,x) = E
+	      | split_gt (t as T(v,_,l,r),x) =
+		if lt(v,x) then split_gt(r,x)
+		else if lt(x,v) then concat3(split_gt(l,x),v,r)
+		     else r
+
+	    (* min t: the value stored in the leftmost node of t.
+	       Raises Match on the empty set. *)
+	    fun min (T(v,_,E,_)) = v
+	      | min (T(v,_,l,_)) = min l
+	      | min _            = raise Match
+		
+	    (* delete'(l,r): combine two subtrees that have lost their common
+	       root.  When both are non-empty, the minimum of r is promoted
+	       to be the new root and removed from r. *)
+	    and delete' (E,r) = r
+	      | delete' (l,E) = l
+	      | delete' (l,r) = let val min_elt = min r in
+                          		T'(min_elt,l,delmin r)
+				end
+	    (* delmin t: t without its leftmost node, rebuilt with T'.
+	       Raises Match on the empty set. *)
+	    and delmin (T(_,_,E,r)) = r
+	      | delmin (T(v,_,l,r)) = T'(v,delmin l,r)
+	      | delmin _ = raise Match
+
+	    (* concat(s1,s2): join two trees without a separating element.
+	       Like concat3, the smaller tree is pushed down into the larger
+	       one when the sizes differ by more than the weight factor;
+	       otherwise the minimum of the second tree becomes the root.
+	       NOTE(review): assumes every element of the first tree is less
+	       than every element of the second - confirm at call sites. *)
+	    fun concat (E,  s2) = s2
+	      | concat (s1, E)  = s1
+	      | concat (t1 as T(v1,n1,l1,r1), t2 as T(v2,n2,l2,r2)) =
+		if weight*n1 < n2 then T'(v2,concat(t1,l2),r2)
+		else if weight*n2 < n1 then T'(v1,l1,concat(r1,t2))
+		     else T'(min t2,t1, delmin t2)
+
+	    (* fold(f,base,set): combine all elements of set with f, starting
+	       from base.  The right subtree is folded first, then the root,
+	       then the left subtree, so f sees the elements in right-to-left
+	       tree order. *)
+	    fun fold(f,base,set) =
+		let fun fold'(base,E) = base
+		      | fold'(base,T(v,_,l,r)) = fold'(f(v,fold'(base,r)),l)
+		in 
+		    fold'(base,set)
+		end
+
+	in
+
+	    val empty = E
+		
+	    (* singleton x: the one-element set containing only x. *)
+	    fun singleton x = T(x,1,E,E)
+
+
+	    (* Hedge union.  uni is the entry point (exported as hedge_union);
+	       the helpers carry an open interval (lo,hi) - the "hedge" - down
+	       the recursion, and the second tree is only trimmed lazily with
+	       trim instead of being fully split at every step. *)
+	    local
+		(* trim(lo,hi,s): descend from the root of s and return the
+		   first subtree whose root v satisfies lo < v < hi, or E if
+		   there is none.  The returned subtree is not filtered any
+		   further. *)
+		fun trim (lo,hi,E) = E
+		  | trim (lo,hi,s as T(v,_,l,r)) =
+		    if  lt(lo,v)  then
+			if  lt(v,hi)  then  s
+			else  trim(lo,hi,l)
+		    else trim(lo,hi,r)
+
+			    
+		(* uni_bd(s1,s2,lo,hi): union of s1 and s2 with both bounds
+		   present.  When s1 is empty, the part of s2 outside
+		   (lo,hi) is cut off with split_gt/split_lt.
+		   NOTE(review): assumes s2 has been trimmed so that its
+		   root lies inside (lo,hi) - confirm. *)
+		fun uni_bd (s,E,lo,hi) = s
+		  | uni_bd (E,T(v,_,l,r),lo,hi) = 
+		     concat3(split_gt(l,lo),v,split_lt(r,hi))
+		  | uni_bd (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),lo,hi) =
+			concat3(uni_bd(l1,trim(lo,v,s2),lo,v),
+				v, 
+				uni_bd(r1,trim(v,hi,s2),v,hi))
+	          (* inv:  lo < v < hi *)
+
+               (*all the other versions of uni and trim are
+               specializations of the above two functions with
+               lo=-infinity and/or hi=+infinity *)
+
+		(* trim_lo(lo,s): trim with only a lower bound. *)
+		fun trim_lo (_ ,E) = E
+		  | trim_lo (lo,s as T(v,_,_,r)) =
+		        if lt(lo,v) then s else trim_lo(lo,r)
+		(* trim_hi(hi,s): trim with only an upper bound. *)
+		fun trim_hi (_ ,E) = E
+		  | trim_hi (hi,s as T(v,_,l,_)) =
+		        if lt(v,hi) then s else trim_hi(hi,l)
+			    
+		(* uni_hi(s1,s2,hi): uni_bd with only an upper bound. *)
+		fun uni_hi (s,E,hi) = s
+		  | uni_hi (E,T(v,_,l,r),hi) = 
+		     concat3(l,v,split_lt(r,hi))
+		  | uni_hi (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),hi) =
+			concat3(uni_hi(l1,trim_hi(v,s2),v),
+				v, 
+				uni_bd(r1,trim(v,hi,s2),v,hi))
+
+		(* uni_lo(s1,s2,lo): uni_bd with only a lower bound. *)
+		fun uni_lo (s,E,lo) = s
+		  | uni_lo (E,T(v,_,l,r),lo) = 
+		     concat3(split_gt(l,lo),v,r)
+		  | uni_lo (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),lo) =
+			concat3(uni_bd(l1,trim(lo,v,s2),lo,v),
+				v, 
+				uni_lo(r1,trim_lo(v,s2),v))
+
+		(* uni(s1,s2): union with no bounds; recursion is driven by
+		   the root v of the first tree. *)
+		fun uni (s,E) = s
+		  | uni (E,s as T(v,_,l,r)) = s
+		  | uni (T(v,_,l1,r1), s2 as T(v2,_,l2,r2)) =
+			concat3(uni_hi(l1,trim_hi(v,s2),v),
+				v, 
+				uni_lo(r1,trim_lo(v,s2),v))
+
+	    in
+		val hedge_union = uni
+	    end
+
+
+	    (* old_union(s1,s2): the straightforward divide-and-conquer union.
+	       s2 is split around the root v of s1 into the elements below and
+	       above v, the two halves are united recursively with the
+	       subtrees of s1, and the results are joined with concat3. *)
+	    fun old_union (E,s2)  = s2
+	      | old_union (s1,E)  = s1
+	      | old_union (s1 as T(v,_,l,r),s2) = 
+		let val l2 = split_lt(s2,v)
+		    val r2 = split_gt(s2,v)
+		in
+		    concat3(old_union(l,l2),v,old_union(r,r2))
+		end
+
+            (* The old_union version is about 20% slower than
+               hedge_union in most cases *)
+
+	    val union = hedge_union
+	    (*val union = old_union*)
+
+	    val add = add
+
+	    (* difference(s1,s2): the elements of s1 that are not in s2.
+	       s1 is split around the root v of s2, which drops v itself if it
+	       was present; the two halves are processed recursively against
+	       the subtrees of s2 and joined with concat. *)
+	    fun difference (E,s)  = E
+	      | difference (s,E)  = s
+	      | difference (s, T(v,_,l,r)) =
+		let val l2 = split_lt(s,v)
+		    val r2 = split_gt(s,v)
+		in
+		    concat(difference(l2,l),difference(r2,r))
+		end
+
+	    (* member(x,set): true iff x is an element of set.  Ordinary
+	       binary-search descent using lt; x is found when it is neither
+	       less than nor greater than the value at a node. *)
+	    fun member (x,set) =
+		let fun mem E = false
+		      | mem (T(v,_,l,r)) =
+			if lt(x,v) then mem l else if lt(v,x) then mem r else true
+		in mem set end
+
+	    (*fun intersection (a,b) = difference(a,difference(a,b))*)
+
+	    (* intersection(s1,s2): the elements common to both sets.
+	       s1 is split around the root v of s2; the halves are intersected
+	       recursively with the subtrees of s2.  v is kept in the result
+	       (via concat3) only if it is a member of s1, otherwise the two
+	       partial results are joined with concat. *)
+	    fun intersection (E,_) = E
+	      | intersection (_,E) = E
+	      | intersection (s, T(v,_,l,r)) =
+		let val l2 = split_lt(s,v)
+		    val r2 = split_gt(s,v)
+		in
+		    if member(v,s) then
+			concat3(intersection(l2,l),v,intersection(r2,r))
+		    else
+			concat(intersection(l2,l),intersection(r2,r))
+		end
+
+	    (* members set: the elements of set as a list.  fold visits the
+	       tree right-to-left and conses each element onto the front, so
+	       the list is in left-to-right tree order. *)
+	    fun members set = fold(op::,[],set)
+
+	    (* cardinality set: number of elements, read from the count stored
+	       in the root node (same computation as size). *)
+	    fun cardinality E = 0
+	      | cardinality (T(_,n,_,_)) = n
+	    
+	    (* delete(set,x): set without the element x.  Descends to the node
+	       holding x, rebuilding the path with T'; the two subtrees of the
+	       removed node are merged by delete'.  Deleting from the empty
+	       set yields the empty set. *)
+	    fun delete (E,x) = E
+	      | delete (set as T(v,_,l,r),x) =
+		if lt(x,v) then T'(v,delete(l,x),r)
+		else if lt(v,x) then T'(v,l,delete(r,x))
+		     else delete'(l,r)
+
+	    (* fromList l: build a set by adding the elements of the list l
+	       one by one, starting from the empty set.
+	       NOTE(review): List.fold with this argument order looks like the
+	       old SML/NJ library function rather than the current Basis
+	       Library List.foldl/foldr - confirm against the target compiler. *)
+	    fun fromList l = List.fold (fn(x,y)=>add(y,x)) l E
+
+	    type  intset = Set
+
+	end
+    end
+
+structure IntSet : INTSET =B;

Added: pypy/dist/pypy/tool/algo/fset.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/tool/algo/fset.py	Fri Feb 17 00:20:02 2006
@@ -0,0 +1,244 @@
+__all__ = ['FSet', 'emptyset']
+
+# Reference:
+#   "Implementing sets efficiently in a functional language"
+#   http://swiss.csail.mit.edu/~adams/BB/
+#   See BB.sml in the current directory.
+
+
+class FSet(object):
+    """Functional Set.
+    Behaves like a frozenset from Python 2.4 (incomplete, though).
+    This version is meant to have a better complexity than frozenset for
+    operations involving a lot of single-element adds and unions.
+    For example, a long chain of 'set.union([x]).union([y]).union([z])...'
+    takes quadratic time with frozensets, but only n*log(n) with FSets.
+    """
+    __slots__ = ['_left', '_value', '_right', '_count']
+
+    def __new__(cls, items=()):
+        if isinstance(items, FSet):
+            return items
+        items = list(items)
+        if len(items) == 1:
+            return node(emptyset, items[0], emptyset)
+        if not items:
+            return emptyset
+        items.sort()
+        any = items[0]
+        items = [x for i, x in enumerate(items) if x != items[i-1]]
+        if not items:
+            items.append(any)
+        def maketree(start, stop):
+            if start == stop:
+                return emptyset
+            else:
+                mid = (start+stop)//2
+                return node(maketree(start, mid), items[mid],
+                            maketree(mid+1, stop))
+        return maketree(0, len(items))
+
+    def __len__(self):
+        return self._count
+
+    def __repr__(self):
+        return '{%s}' % (', '.join([repr(n) for n in self]),)
+
+    def __iter__(self):
+        return treeiter(self)
+
+    def union(self, other):
+        return uniontree(self, FSet(other))
+
+    def __or__(self, other):
+        if not isinstance(other, FSet):
+            return NotImplemented
+        return uniontree(self, other)
+
+    def __eq__(self, other):
+        if not isinstance(other, FSet):
+            return NotImplemented
+        if self is other:
+            return True
+        if eqtree(self, other):
+            other._left = self._left
+            other._value = self._value
+            other._right = self._right
+            return True
+        return False
+
+    def __ne__(self, other):
+        res = self.__eq__(other)
+        if res is NotImplemented:
+            return NotImplemented
+        return not res
+
+    def __hash__(self):
+        return hash(tuple(self)) ^ 1043498183
+
+    def __contains__(self, value):
+        return contains(self, value)
+
+# The unique empty set.  It is created with object.__new__ (bypassing
+# FSet.__new__) and only gets a _count; the rest of this module recognises
+# it by identity ('is emptyset') and never reads its other slots.
+emptyset = object.__new__(FSet)
+emptyset._count = 0
+
+# ____________________________________________________________
+# creation and balancing stuff
+
+# Balance factor: the balancing functions below restructure a tree when
+# one subtree holds more than WEIGHT times as many elements as the other.
+WEIGHT = 3
+
+def node(left, value, right):
+    result = object.__new__(FSet)
+    result._left = left
+    result._value = value
+    result._right = right
+    result._count = left._count + right._count + 1
+    return result
+
+def node_balance_fast(left, value, right):
+    # used when an original tree was balanced, and changed by at most
+    # one element (as in adding or deleting one item).
+    ln = left._count
+    rn = right._count
+    if ln <= 1 and rn <= 1:
+        return node(left, value, right)
+    elif rn > WEIGHT * ln:   # right too big
+        if right._left._count < right._right._count:
+            return single_L(left, value, right)
+        else:
+            return double_L(left, value, right)
+    elif ln > WEIGHT * rn:   # left too big
+        if left._right._count < left._left._count:
+            return single_R(left, value, right)
+        else:
+            return double_R(left, value, right)
+    else:
+        return node(left, value, right)
+
+def node_balance(left, value, right):
+    if left is emptyset:
+        return add(right, value)
+    elif right is emptyset:
+        return add(left, value)
+    elif WEIGHT * left._count < right._count:
+        t = node_balance(left, value, right._left)
+        return node_balance_fast(t, right._value, right._right)
+    elif WEIGHT * right._count < left._count:
+        t = node_balance(left._right, value, right)
+        return node_balance_fast(left._left, left._value, t)
+    else:
+        return node(left, value, right)
+
+def add(tree, value):
+    if tree is emptyset:
+        return node(emptyset, value, emptyset)
+    elif value < tree._value:
+        t = add(tree._left, value)
+        return node_balance_fast(t, tree._value, tree._right)
+    elif value == tree._value:
+        return tree
+    else:
+        t = add(tree._right, value)
+        return node_balance_fast(tree._left, tree._value, t)
+
+def single_L(left, value, right):
+    return node(node(left, value, right._left), right._value, right._right)
+
+def single_R(left, value, right):
+    return node(left._left, left._value, node(left._right, value, right))
+
+def double_L(left, value, right):
+    rl = right._left
+    n1 = node(left, value, rl._left)
+    n2 = node(rl._right, right._value, right._right)
+    return node(n1, rl._value, n2)
+
+def double_R(left, value, right):
+    lr = left._right
+    n1 = node(left._left, left._value, lr._left)
+    n2 = node(lr._right, value, right)
+    return node(n1, lr._value, n2)
+
+# ____________________________________________________________
+# union
+
+def uniontree(tree1, tree2):
+    if tree2._count <= 1:
+        if tree2 is emptyset:
+            return tree1
+        else:
+            return add(tree1, tree2._value)
+    elif tree1._count <= 1:
+        if tree1 is emptyset:
+            return tree2
+        else:
+            return add(tree2, tree1._value)
+    else:
+        left2, right2 = splittree(tree2, tree1._value)
+        return node_balance(uniontree(tree1._left, left2), tree1._value,
+                            uniontree(tree1._right, right2))
+
+def splittree(tree, value):
+    if tree is emptyset:
+        return emptyset, emptyset
+    elif tree._value < value:
+        t1, t2 = splittree(tree._right, value)
+        return node_balance(tree._left, tree._value, t1), t2
+    elif tree._value == value:
+        return tree._left, tree._right
+    else:
+        t1, t2 = splittree(tree._left, value)
+        return t1, node_balance(t2, tree._value, tree._right)
+
+# ____________________________________________________________
+# utilities
+
+def treeiter(tree):
+    if tree is emptyset:
+        return
+    path = []
+    while True:
+        while tree._left is not emptyset:
+            path.append(tree)
+            tree = tree._left
+        yield tree._value
+        tree = tree._right
+        while tree is emptyset:
+            if not path:
+                return
+            tree = path.pop()
+            yield tree._value
+            tree = tree._right
+
+def eqtree(tree1, tree2):
+    if tree1 is tree2:
+        return True
+    if tree1._count != tree2._count:
+        return False
+    assert tree1 is not emptyset and tree2 is not emptyset
+    left2, right2 = splittree(tree2, tree1._value)
+    if left2._count + right2._count == tree2._count:
+        return False    # _value was not in tree2
+    return eqtree(tree1._left, left2) and eqtree(tree1._right, right2)
+
+def contains(tree, value):
+    while tree is not emptyset:
+        if value < tree._value:
+            tree = tree._left
+        elif value == tree._value:
+            return True
+        else:
+            tree = tree._right
+    return False
+
+
+_no = object()
+def checktree(tree, bmin=_no, bmax=_no):
+    if tree is not emptyset:
+        if bmin is not _no:
+            assert bmin < tree._value
+        if bmax is not _no:
+            assert tree._value < bmax
+        assert tree._count == tree._left._count + tree._right._count + 1
+        checktree(tree._left, bmin, tree._value)
+        checktree(tree._right, tree._value, bmax)

Added: pypy/dist/pypy/tool/algo/test/test_fset.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/tool/algo/test/test_fset.py	Fri Feb 17 00:20:02 2006
@@ -0,0 +1,76 @@
+from pypy.tool.algo.fset import FSet, checktree, emptyset
+import random
+
+
+def test_empty():
+    assert FSet() is FSet([]) is emptyset
+    assert len(emptyset) == 0
+    assert list(emptyset) == []
+    checktree(emptyset)
+
+def test_iter():
+    s = FSet(range(42))
+    assert len(s) == 42
+    assert list(s) == range(42)
+    checktree(s)
+
+def test_new():
+    s = FSet(range(6, 42) + range(13))
+    assert len(s) == 42
+    assert list(s) == range(42)
+    assert FSet(s) is s
+    checktree(s)
+
+def test_union():
+    s1 = FSet([1, 10, 100, 1000])
+    assert list(s1.union([])) == [1, 10, 100, 1000]
+    assert list(s1.union([100])) == [1, 10, 100, 1000]
+    assert list(s1.union([3, 4, 5])) == [1, 3, 4, 5, 10, 100, 1000]
+    assert list(s1.union([1000, 1200, 1400])) == [1, 10, 100, 1000, 1200, 1400]
+    assert list(s1.union(s1)) == [1, 10, 100, 1000]
+
+def test_or():
+    s1 = FSet([0, 3, 6])
+    s2 = FSet([1, 3])
+    assert list(s1 | s2) == [0, 1, 3, 6]
+
+def test_eq():
+    assert FSet([0, 3]) == FSet([0, 3])
+    assert FSet([]) == emptyset
+    assert FSet(range(42)) == FSet(range(42))
+    assert FSet([]) != FSet([5])
+    assert FSet(range(42)) != FSet(range(43))
+
+def test_hash():
+    assert hash(emptyset) != hash(FSet([1])) != hash(FSet([1, 2]))
+    assert hash(FSet([1, 2])) == hash(FSet([1]) | FSet([2]))
+
+def test_len():
+    assert len(FSet([1, 2]) | FSet([2, 3])) == 3
+
+def test_reasonable_speed(N=1000):
+    d = emptyset
+    for i in range(N):
+        d |= FSet([i])
+    checktree(d)
+    assert list(d) == range(N)
+    d = emptyset
+    for i in range(N-1, -1, -1):
+        d |= FSet([i])
+    checktree(d)
+    assert list(d) == range(N)
+    d = emptyset
+    lst = range(N)
+    random.shuffle(lst)
+    for i in lst:
+        d |= FSet([i])
+    checktree(d)
+    assert list(d) == range(N)
+
+def test_contains():
+    assert 5 not in emptyset
+    lst = range(0, 20, 2)
+    random.shuffle(lst)
+    d = FSet(lst)
+    for x in range(20):
+        assert (x in d) == (x in lst)



More information about the Pypy-commit mailing list