[pypy-svn] r71472 - in pypy/trunk/pypy: rpython/lltypesystem rpython/memory/gc rpython/memory/gc/test translator/c/test

arigo at codespeak.net arigo at codespeak.net
Wed Feb 24 21:17:10 CET 2010


Author: arigo
Date: Wed Feb 24 21:17:05 2010
New Revision: 71472

Modified:
   pypy/trunk/pypy/rpython/lltypesystem/llarena.py
   pypy/trunk/pypy/rpython/lltypesystem/llmemory.py
   pypy/trunk/pypy/rpython/memory/gc/generation.py
   pypy/trunk/pypy/rpython/memory/gc/hybrid.py
   pypy/trunk/pypy/rpython/memory/gc/semispace.py
   pypy/trunk/pypy/rpython/memory/gc/test/test_direct.py
   pypy/trunk/pypy/translator/c/test/test_newgc.py
Log:
Merge the gc-better-hash branch.

It improves the identityhash of young objects -- they are now not just
the address, because the address of young objects is always within the
small nursery; they are the address plus a number that changes at every
nursery collection.

Also, it contains a bit of work to let the identityhash logic be tested
by test_direct.py, at least on SemiSpace and Generation GC -- there is
still an issue with testing it on the Hybrid GC.  (Of course, it's still
fully tested by test_newgc.py.)



Modified: pypy/trunk/pypy/rpython/lltypesystem/llarena.py
==============================================================================
--- pypy/trunk/pypy/rpython/lltypesystem/llarena.py	(original)
+++ pypy/trunk/pypy/rpython/lltypesystem/llarena.py	Wed Feb 24 21:17:05 2010
@@ -227,7 +227,7 @@
         return self.arena._getid() + self.offset
 
 
-def _getfakearenaaddress(addr):
+def getfakearenaaddress(addr):
     """Logic to handle test_replace_object_with_stub()."""
     if isinstance(addr, fakearenaaddress):
         return addr
@@ -306,7 +306,7 @@
       * 1: clear, optimized for a very large area of memory
       * 2: clear, optimized for a small or medium area of memory
     """
-    arena_addr = _getfakearenaaddress(arena_addr)
+    arena_addr = getfakearenaaddress(arena_addr)
     arena_addr.arena.reset(zero, arena_addr.offset, size)
 
 def arena_reserve(addr, size, check_alignment=True):
@@ -315,7 +315,7 @@
     overlap.  The size must be symbolic; in non-translated version
     this is used to know what type of lltype object to allocate."""
     from pypy.rpython.memory.lltypelayout import memory_alignment
-    addr = _getfakearenaaddress(addr)
+    addr = getfakearenaaddress(addr)
     if check_alignment and (addr.offset & (memory_alignment-1)) != 0:
         raise ArenaError("object at offset %d would not be correctly aligned"
                          % (addr.offset,))
@@ -324,7 +324,7 @@
 def arena_shrink_obj(addr, newsize):
     """ Mark object as shorter than it was
     """
-    addr = _getfakearenaaddress(addr)
+    addr = getfakearenaaddress(addr)
     addr.arena.shrink_obj(addr.offset, newsize)
 
 def round_up_for_allocation(size, minsize=0):
@@ -505,3 +505,11 @@
 register_external(arena_new_view, [llmemory.Address], llmemory.Address,
                   'll_arena.arena_new_view', llimpl=llimpl_arena_new_view,
                   llfakeimpl=arena_new_view, sandboxsafe=True)
+
+def llimpl_getfakearenaaddress(addr):
+    return addr
+register_external(getfakearenaaddress, [llmemory.Address], llmemory.Address,
+                  'll_arena.getfakearenaaddress',
+                  llimpl=llimpl_getfakearenaaddress,
+                  llfakeimpl=getfakearenaaddress,
+                  sandboxsafe=True)

Modified: pypy/trunk/pypy/rpython/lltypesystem/llmemory.py
==============================================================================
--- pypy/trunk/pypy/rpython/lltypesystem/llmemory.py	(original)
+++ pypy/trunk/pypy/rpython/lltypesystem/llmemory.py	Wed Feb 24 21:17:05 2010
@@ -490,7 +490,7 @@
         if self.ptr is not None and self.ptr._was_freed():
             # hack to support llarena.test_replace_object_with_stub()
             from pypy.rpython.lltypesystem import llarena
-            return llarena._getfakearenaaddress(self)
+            return llarena.getfakearenaaddress(self)
         else:
             return self
 

Modified: pypy/trunk/pypy/rpython/memory/gc/generation.py
==============================================================================
--- pypy/trunk/pypy/rpython/memory/gc/generation.py	(original)
+++ pypy/trunk/pypy/rpython/memory/gc/generation.py	Wed Feb 24 21:17:05 2010
@@ -1,13 +1,14 @@
 import sys
 from pypy.rpython.memory.gc.semispace import SemiSpaceGC
 from pypy.rpython.memory.gc.semispace import GCFLAG_EXTERNAL, GCFLAG_FORWARDED
-from pypy.rpython.memory.gc.semispace import GCFLAG_HASHTAKEN
+from pypy.rpython.memory.gc.semispace import GC_HASH_TAKEN_ADDR
 from pypy.rpython.lltypesystem.llmemory import NULL, raw_malloc_usage
 from pypy.rpython.lltypesystem import lltype, llmemory, llarena
 from pypy.rpython.memory.support import DEFAULT_CHUNK_SIZE
 from pypy.rlib.objectmodel import free_non_gc_object
 from pypy.rlib.debug import ll_assert
 from pypy.rlib.debug import debug_print, debug_start, debug_stop
+from pypy.rlib.rarithmetic import intmask
 from pypy.rpython.lltypesystem.lloperation import llop
 
 # The following flag is never set on young objects, i.e. the ones living
@@ -43,6 +44,8 @@
                           'min_nursery_size': 48*1024,
                           'auto_nursery_size': True}
 
+    nursery_hash_base = -1
+
     def __init__(self, config, chunk_size=DEFAULT_CHUNK_SIZE,
                  nursery_size=128,
                  min_nursery_size=128,
@@ -234,7 +237,7 @@
     GCFLAGS_FOR_NEW_YOUNG_OBJECTS = 0   # NO_YOUNG_PTRS never set on young objs
     GCFLAGS_FOR_NEW_EXTERNAL_OBJECTS = (GCFLAG_EXTERNAL | GCFLAG_FORWARDED |
                                         GCFLAG_NO_YOUNG_PTRS |
-                                        GCFLAG_HASHTAKEN)
+                                        GC_HASH_TAKEN_ADDR)
 
     # ____________________________________________________________
     # Support code for full collections
@@ -355,8 +358,21 @@
             self.nursery_top = self.nursery + self.nursery_size
             self.free = self.nursery_top
         self.nursery_free = self.nursery
+        # at this point we know that the nursery is empty
+        self.change_nursery_hash_base()
         return self.nursery_free
 
+    def change_nursery_hash_base(self):
+        # The following should be enough to ensure that young objects
+        # tend to always get a different hash.  It also makes sure that
+        # nursery_hash_base is not a multiple of 4, to avoid collisions
+        # with the hash of non-young objects.
+        hash_base = self.nursery_hash_base
+        hash_base += self.nursery_size - 1
+        if (hash_base & 3) == 0:
+            hash_base -= 1
+        self.nursery_hash_base = intmask(hash_base)
+
     # NB. we can use self.copy() to move objects out of the nursery,
     # but only if the object was really in the nursery.
 
@@ -534,6 +550,9 @@
     def _id_grow_older(self, obj, id, ignored):
         self.objects_with_id.setitem(obj, id)
 
+    def _compute_current_nursery_hash(self, obj):
+        return intmask(llmemory.cast_adr_to_int(obj) + self.nursery_hash_base)
+
     def heap_stats_walk_roots(self):
         self.last_generation_root_objects.foreach(
             self._track_heap_ext, None)

Modified: pypy/trunk/pypy/rpython/memory/gc/hybrid.py
==============================================================================
--- pypy/trunk/pypy/rpython/memory/gc/hybrid.py	(original)
+++ pypy/trunk/pypy/rpython/memory/gc/hybrid.py	Wed Feb 24 21:17:05 2010
@@ -2,9 +2,11 @@
 from pypy.rpython.memory.gc.semispace import SemiSpaceGC
 from pypy.rpython.memory.gc.generation import GenerationGC
 from pypy.rpython.memory.gc.semispace import GCFLAG_EXTERNAL, GCFLAG_FORWARDED
-from pypy.rpython.memory.gc.semispace import GCFLAG_HASHTAKEN, GCFLAG_HASHFIELD
+from pypy.rpython.memory.gc.semispace import GCFLAG_HASHMASK
 from pypy.rpython.memory.gc.generation import GCFLAG_NO_YOUNG_PTRS
 from pypy.rpython.memory.gc.generation import GCFLAG_NO_HEAP_PTRS
+from pypy.rpython.memory.gc.semispace import GC_HASH_TAKEN_ADDR
+from pypy.rpython.memory.gc.semispace import GC_HASH_HASFIELD
 from pypy.rpython.lltypesystem import lltype, llmemory, llarena
 from pypy.rpython.lltypesystem.llmemory import raw_malloc_usage
 from pypy.rpython.lltypesystem.lloperation import llop
@@ -359,7 +361,7 @@
         # it's not an issue.
         totalsize = self.size_gc_header() + objsize
         tid = self.header(obj).tid
-        if tid & (GCFLAG_HASHTAKEN|GCFLAG_HASHFIELD):
+        if tid & GCFLAG_HASHMASK:
             totalsize_incl_hash = totalsize + llmemory.sizeof(lltype.Signed)
         else:
             totalsize_incl_hash = totalsize
@@ -370,14 +372,10 @@
         self._nonmoving_copy_size += raw_malloc_usage(totalsize)
 
         llmemory.raw_memcopy(obj - self.size_gc_header(), newaddr, totalsize)
-        # check if we need to write a hash value at the end of the new obj
-        if tid & (GCFLAG_HASHTAKEN|GCFLAG_HASHFIELD):
-            if tid & GCFLAG_HASHFIELD:
-                hash = (obj + objsize).signed[0]
-            else:
-                hash = llmemory.cast_adr_to_int(obj)
-                tid |= GCFLAG_HASHFIELD
+        if tid & GCFLAG_HASHMASK:
+            hash = self._get_object_hash(obj, objsize, tid)
             (newaddr + totalsize).signed[0] = hash
+            tid |= GC_HASH_HASFIELD
         #
         # GCFLAG_UNVISITED is not set
         # GCFLAG_NO_HEAP_PTRS is not set either, conservatively.  It may be
@@ -544,8 +542,8 @@
         tid = self.header(obj).tid
         ll_assert(bool(tid & GCFLAG_EXTERNAL),
                   "gen2: missing GCFLAG_EXTERNAL")
-        ll_assert(bool(tid & GCFLAG_HASHTAKEN),
-                  "gen2: missing GCFLAG_HASHTAKEN")
+        ll_assert(bool(tid & GC_HASH_TAKEN_ADDR),
+                  "gen2: missing GC_HASH_TAKEN_ADDR")
         ll_assert(bool(tid & GCFLAG_UNVISITED),
                   "gen2: missing GCFLAG_UNVISITED")
         ll_assert((tid & GCFLAG_AGE_MASK) < GCFLAG_AGE_MAX,
@@ -554,8 +552,8 @@
         tid = self.header(obj).tid
         ll_assert(bool(tid & GCFLAG_EXTERNAL),
                   "gen3: missing GCFLAG_EXTERNAL")
-        ll_assert(bool(tid & GCFLAG_HASHTAKEN),
-                  "gen3: missing GCFLAG_HASHTAKEN")
+        ll_assert(bool(tid & GC_HASH_TAKEN_ADDR),
+                  "gen3: missing GC_HASH_TAKEN_ADDR")
         ll_assert(not (tid & GCFLAG_UNVISITED),
                   "gen3: unexpected GCFLAG_UNVISITED")
         ll_assert((tid & GCFLAG_AGE_MASK) == GCFLAG_AGE_MAX,

Modified: pypy/trunk/pypy/rpython/memory/gc/semispace.py
==============================================================================
--- pypy/trunk/pypy/rpython/memory/gc/semispace.py	(original)
+++ pypy/trunk/pypy/rpython/memory/gc/semispace.py	Wed Feb 24 21:17:05 2010
@@ -21,11 +21,22 @@
 # either immortal objects or (for HybridGC) externally raw_malloc'ed
 GCFLAG_EXTERNAL = first_gcflag << 1
 GCFLAG_FINALIZATION_ORDERING = first_gcflag << 2
-GCFLAG_HASHTAKEN = first_gcflag << 3      # someone already asked for the hash
-GCFLAG_HASHFIELD = first_gcflag << 4      # we have an extra hash field
+
+_GCFLAG_HASH_BASE = first_gcflag << 3
+GCFLAG_HASHMASK = _GCFLAG_HASH_BASE * 0x3   # also consumes 'first_gcflag << 4'
+# the two bits in GCFLAG_HASHMASK can have one of the following values:
+#   - nobody ever asked for the hash of the object
+GC_HASH_NOTTAKEN   = _GCFLAG_HASH_BASE * 0x0
+#   - someone asked, and we gave the address of the object
+GC_HASH_TAKEN_ADDR = _GCFLAG_HASH_BASE * 0x1
+#   - someone asked, and we gave the address plus 'nursery_hash_base'
+GC_HASH_TAKEN_NURS = _GCFLAG_HASH_BASE * 0x2
+#   - we have our own extra field to store the hash
+GC_HASH_HASFIELD   = _GCFLAG_HASH_BASE * 0x3
 
 memoryError = MemoryError()
 
+
 class SemiSpaceGC(MovingGCBase):
     _alloc_flavor_ = "raw"
     inline_simple_malloc = True
@@ -35,9 +46,9 @@
 
     HDR = lltype.Struct('header', ('tid', lltype.Signed))   # XXX or rffi.INT?
     typeid_is_in_field = 'tid'
-    withhash_flag_is_in_field = 'tid', GCFLAG_HASHFIELD
-    # ^^^ all prebuilt objects have GCFLAG_HASHTAKEN, but only some have
-    #     GCFLAG_HASHFIELD (and then they are one word longer).
+    withhash_flag_is_in_field = 'tid', _GCFLAG_HASH_BASE * 0x2
+    # ^^^ prebuilt objects either have GC_HASH_TAKEN_ADDR or they
+    #     have GC_HASH_HASFIELD (and then they are one word longer).
     FORWARDSTUB = lltype.GcStruct('forwarding_stub',
                                   ('forw', llmemory.Address))
     FORWARDSTUBPTR = lltype.Ptr(FORWARDSTUB)
@@ -316,7 +327,7 @@
     def get_size_incl_hash(self, obj):
         size = self.get_size(obj)
         hdr = self.header(obj)
-        if hdr.tid & GCFLAG_HASHFIELD:
+        if (hdr.tid & GCFLAG_HASHMASK) == GC_HASH_HASFIELD:
             size += llmemory.sizeof(lltype.Signed)
         return size
 
@@ -351,22 +362,31 @@
             self.set_forwarding_address(obj, newobj, objsize)
             return newobj
 
+    def _get_object_hash(self, obj, objsize, tid):
+        # Returns the hash of the object, which must not be GC_HASH_NOTTAKEN.
+        gc_hash = tid & GCFLAG_HASHMASK
+        if gc_hash == GC_HASH_HASFIELD:
+            obj = llarena.getfakearenaaddress(obj)
+            return (obj + objsize).signed[0]
+        elif gc_hash == GC_HASH_TAKEN_ADDR:
+            return llmemory.cast_adr_to_int(obj)
+        elif gc_hash == GC_HASH_TAKEN_NURS:
+            return self._compute_current_nursery_hash(obj)
+        else:
+            assert 0, "gc_hash == GC_HASH_NOTTAKEN"
+
     def _make_a_copy_with_tid(self, obj, objsize, tid):
         totalsize = self.size_gc_header() + objsize
         newaddr = self.free
         llarena.arena_reserve(newaddr, totalsize)
         raw_memcopy(obj - self.size_gc_header(), newaddr, totalsize)
-        #
-        # check if we need to write a hash value at the end of the new obj
-        if tid & (GCFLAG_HASHTAKEN|GCFLAG_HASHFIELD):
-            if tid & GCFLAG_HASHFIELD:
-                hash = (obj + objsize).signed[0]
-            else:
-                hash = llmemory.cast_adr_to_int(obj)
-                tid |= GCFLAG_HASHFIELD
+        if tid & GCFLAG_HASHMASK:
+            hash = self._get_object_hash(obj, objsize, tid)
+            llarena.arena_reserve(newaddr + totalsize,
+                                  llmemory.sizeof(lltype.Signed))
             (newaddr + totalsize).signed[0] = hash
+            tid |= GC_HASH_HASFIELD
             totalsize += llmemory.sizeof(lltype.Signed)
-        #
         self.free += totalsize
         newhdr = llmemory.cast_adr_to_ptr(newaddr, lltype.Ptr(self.HDR))
         newhdr.tid = tid
@@ -446,7 +466,7 @@
 
     def init_gc_object_immortal(self, addr, typeid16, flags=0):
         hdr = llmemory.cast_adr_to_ptr(addr, lltype.Ptr(self.HDR))
-        flags |= GCFLAG_EXTERNAL | GCFLAG_FORWARDED | GCFLAG_HASHTAKEN
+        flags |= GCFLAG_EXTERNAL | GCFLAG_FORWARDED | GC_HASH_TAKEN_ADDR
         hdr.tid = self.combine(typeid16, flags)
         # immortal objects always have GCFLAG_FORWARDED set;
         # see get_forwarding_address().
@@ -611,36 +631,46 @@
 
     STATISTICS_NUMBERS = 0
 
+    def is_in_nursery(self, addr):
+        # overridden in generation.py.
+        return False
+
+    def _compute_current_nursery_hash(self, obj):
+        # overridden in generation.py.
+        raise AssertionError("should not be called")
+
     def identityhash(self, gcobj):
-        # The following code should run at most twice.
+        # The following loop should run at most twice.
         while 1:
             obj = llmemory.cast_ptr_to_adr(gcobj)
             hdr = self.header(obj)
-            #
-            if hdr.tid & GCFLAG_HASHFIELD:  # the hash is in a field at the end
-                obj += self.get_size(obj)
-                return obj.signed[0]
-            #
-            if not (hdr.tid & GCFLAG_HASHTAKEN):
-                # It's the first time we ask for a hash, and it's not an
-                # external object.  Shrink the top of space by the extra
-                # hash word that will be needed after a collect.
-                shrunk_top = self.top_of_space - llmemory.sizeof(lltype.Signed)
-                if shrunk_top < self.free:
-                    # Cannot shrink!  Do a collection, asking for at least
-                    # one word of free space, and try again.  May raise
-                    # MemoryError.  Obscure: not called directly, but
-                    # across an llop, to make sure that there is the
-                    # correct push_roots/pop_roots around the call...
-                    llop.gc_obtain_free_space(llmemory.Address,
-                                              llmemory.sizeof(lltype.Signed))
-                    continue
-                # Now we can have side-effects: set GCFLAG_HASHTAKEN
-                # and lower the top of space.
+            if hdr.tid & GCFLAG_HASHMASK:
+                break
+            # It's the first time we ask for a hash, and it's not an
+            # external object.  Shrink the top of space by the extra
+            # hash word that will be needed after a collect.
+            shrunk_top = self.top_of_space - llmemory.sizeof(lltype.Signed)
+            if shrunk_top < self.free:
+                # Cannot shrink!  Do a collection, asking for at least
+                # one word of free space, and try again.  May raise
+                # MemoryError.  Obscure: not called directly, but
+                # across an llop, to make sure that there is the
+                # correct push_roots/pop_roots around the call...
+                llop.gc_obtain_free_space(llmemory.Address,
+                                          llmemory.sizeof(lltype.Signed))
+                continue
+            else:
+                # Now we can have side-effects: lower the top of space
+                # and set one of the GC_HASH_TAKEN_xxx flags.
                 self.top_of_space = shrunk_top
-                hdr.tid |= GCFLAG_HASHTAKEN
-            #
-            return llmemory.cast_adr_to_int(obj)  # direct case
+                if self.is_in_nursery(obj):
+                    hdr.tid |= GC_HASH_TAKEN_NURS
+                else:
+                    hdr.tid |= GC_HASH_TAKEN_ADDR
+                break
+        # Now we can return the result
+        objsize = self.get_size(obj)
+        return self._get_object_hash(obj, objsize, hdr.tid)
 
     def track_heap_parent(self, obj, parent):
         addr = obj.address[0]

Modified: pypy/trunk/pypy/rpython/memory/gc/test/test_direct.py
==============================================================================
--- pypy/trunk/pypy/rpython/memory/gc/test/test_direct.py	(original)
+++ pypy/trunk/pypy/rpython/memory/gc/test/test_direct.py	Wed Feb 24 21:17:05 2010
@@ -264,6 +264,51 @@
         self.gc.collect()
         verify()
 
+    def test_identityhash(self):
+        # a "does not crash" kind of test
+        p_const = lltype.malloc(S, immortal=True)
+        self.consider_constant(p_const)
+        # (1) p is in the nursery
+        self.gc.collect()
+        p = self.malloc(S)
+        hash = self.gc.identityhash(p)
+        print hash
+        assert isinstance(hash, (int, long))
+        assert hash == self.gc.identityhash(p)
+        self.stackroots.append(p)
+        for i in range(6):
+            self.gc.collect()
+            assert hash == self.gc.identityhash(self.stackroots[-1])
+        self.stackroots.pop()
+        # (2) p is an older object
+        p = self.malloc(S)
+        self.stackroots.append(p)
+        self.gc.collect()
+        hash = self.gc.identityhash(self.stackroots[-1])
+        print hash
+        assert isinstance(hash, (int, long))
+        for i in range(6):
+            self.gc.collect()
+            assert hash == self.gc.identityhash(self.stackroots[-1])
+        self.stackroots.pop()
+        # (3) p is a gen3 object (for hybrid)
+        p = self.malloc(S)
+        self.stackroots.append(p)
+        for i in range(6):
+            self.gc.collect()
+        hash = self.gc.identityhash(self.stackroots[-1])
+        print hash
+        assert isinstance(hash, (int, long))
+        for i in range(2):
+            self.gc.collect()
+            assert hash == self.gc.identityhash(self.stackroots[-1])
+        self.stackroots.pop()
+        # (4) p is a prebuilt object
+        hash = self.gc.identityhash(p_const)
+        print hash
+        assert isinstance(hash, (int, long))
+        assert hash == self.gc.identityhash(p_const)
+
 
 class TestSemiSpaceGC(DirectGCTest):
     from pypy.rpython.memory.gc.semispace import SemiSpaceGC as GCClass
@@ -376,6 +421,9 @@
         assert calls == [('semispace_collect', True)]
         calls = []
 
+    def test_identityhash(self):
+        py.test.skip("does not support raw_mallocs(sizeof(S)+sizeof(hash))")
+
 
 class TestMarkCompactGC(DirectGCTest):
     from pypy.rpython.memory.gc.markcompact import MarkCompactGC as GCClass

Modified: pypy/trunk/pypy/translator/c/test/test_newgc.py
==============================================================================
--- pypy/trunk/pypy/translator/c/test/test_newgc.py	(original)
+++ pypy/trunk/pypy/translator/c/test/test_newgc.py	Wed Feb 24 21:17:05 2010
@@ -1019,6 +1019,30 @@
         res = self.run('string_builder_over_allocation')
         assert res[1000] == 'y'
 
+    def define_nursery_hash_base(cls):
+        from pypy.rlib.objectmodel import compute_identity_hash
+        class A:
+            pass
+        def fn():
+            objects = []
+            hashes = []
+            for i in range(200):
+                rgc.collect(0)     # nursery-only collection, if possible
+                obj = A()
+                objects.append(obj)
+                hashes.append(compute_identity_hash(obj))
+            unique = {}
+            for i in range(len(objects)):
+                assert compute_identity_hash(objects[i]) == hashes[i]
+                unique[hashes[i]] = None
+            return len(unique)
+        return fn
+
+    def test_nursery_hash_base(self):
+        res = self.run('nursery_hash_base')
+        assert res >= 195
+
+
 class TestGenerationalGC(TestSemiSpaceGC):
     gcpolicy = "generation"
     should_be_moving = True



More information about the Pypy-commit mailing list