[pypy-commit] pypy concurrent-marksweep: Start implementing the new section.

Fri Jan 6 18:59:47 CET 2012

Author: Armin Rigo <arigo at tunes.org>
Branch: concurrent-marksweep
Changeset: r51074:719f30faa809
Date: 2012-01-02 22:42 +0100
http://bitbucket.org/pypy/pypy/changeset/719f30faa809/

Log:	Start implementing the new section.

diff --git a/pypy/rpython/memory/gc/concurrentgen.py b/pypy/rpython/memory/gc/concurrentgen.py
--- a/pypy/rpython/memory/gc/concurrentgen.py
+++ b/pypy/rpython/memory/gc/concurrentgen.py
@@ -39,7 +39,7 @@
 # let us know if the 'tid' is valid or is just a word-aligned address):
 MARK_BYTE_1       = 0x6D    # 'm', 109
 MARK_BYTE_2       = 0x4B    # 'K', 75
-MARK_BYTE_3       = 0x23    # '#', 35
+MARK_BYTE_3       = 0x25    # '%', 37
 MARK_BYTE_STATIC  = 0x53    # 'S', 83
 # Next lower byte: a combination of flags.
 FL_WITHHASH       = 0x0100
@@ -75,24 +75,17 @@
         # The default size of the nursery: use 6 MB by default.
         # Environment variable: PYPY_GC_NURSERY
         "nursery_size": 6*1024*1024,
-
-        # Trigger another major collection when 'N+(F-1)*P' bytes survived
-        # minor collections, where N = nursery_size, P = bytes surviving
-        # the previous major collection, and F is the fill_factor here.
-        # Environment variable: PYPY_GC_MAJOR_COLLECT
-        "fill_factor": 1.75,
         }
 
 
     def __init__(self, config,
                  read_from_env=False,
                  nursery_size=32*WORD,
-                 fill_factor=2.0,
+                 fill_factor=2.0,   # xxx kill
                  **kwds):
         GCBase.__init__(self, config, **kwds)
         self.read_from_env = read_from_env
         self.nursery_size = nursery_size
-        self.fill_factor = fill_factor
         #
         self.main_thread_ident = ll_thread.get_ident() # non-transl. debug only
         #
@@ -110,12 +103,12 @@
         # that was not scanned yet.
         self._init_writebarrier_logic()
         #
-        def trigger_collection_now():
+        def _nursery_full(additional_size):
             # a hack to reduce the code size in _account_for_nursery():
-            # avoids both 'self' and the default argument value to be passed
-            self.trigger_next_collection()
-        trigger_collection_now._dont_inline_ = True
-        self.trigger_collection_now = trigger_collection_now
+            # avoids the 'self' argument.
+            self.nursery_full(additional_size)
+        _nursery_full._dont_inline_ = True
+        self._nursery_full = _nursery_full
 
     def _initialize(self):
         # Initialize the GC.  In normal translated program, this function
@@ -163,25 +156,25 @@
         #
         self.collector.setup()
         #
-        self.set_nursery_size(self.nursery_size)
+        self.set_minimal_nursery_size(self.nursery_size)
         if self.read_from_env:
             #
             newsize = env.read_from_env('PYPY_GC_NURSERY')
             if newsize > 0:
-                self.set_nursery_size(newsize)
-            #
-            fillfact = env.read_float_from_env('PYPY_GC_MAJOR_COLLECT')
-            if fillfact > 1.0:
-                self.fill_factor = fillfact
+                self.set_minimal_nursery_size(newsize)
         #
-        debug_print("nursery size:", self.nursery_size)
-        debug_print("fill factor: ", self.fill_factor)
+        debug_print("minimal nursery size:", self.minimal_nursery_size)
         debug_stop("gc-startup")
 
-    def set_nursery_size(self, newsize):
-        self.nursery_size = newsize
-        self.nursery_size_still_available = newsize
-        self.size_still_available_before_major = newsize
+    def set_minimal_nursery_size(self, newsize):
+        # See concurrentgen.txt.  At the start of the process, 'newsize' is
+        # a quarter of the total memory size.
+        newsize = min(newsize, (sys.maxint - 65535) // 4)
+        self.minimal_nursery_size = r_uint(newsize)
+        self.total_memory_size = r_uint(4 * newsize)  # total size
+        self.nursery_size = r_uint(newsize)      # size of the '->new...' box
+        self.old_objects_size = r_uint(0)        # size of the 'old objs' box
+        self.nursery_size_still_available = intmask(self.nursery_size)
 
     def _teardown(self):
         "Stop the collector thread after tests have run."
@@ -208,7 +201,9 @@
         hdr.tid = self.combine(typeid, MARK_BYTE_STATIC, 0)
 
     def malloc_fixedsize_clear(self, typeid, size,
-                               needs_finalizer=False, contains_weakptr=False):
+                               needs_finalizer=False,
+                               finalizer_is_light=False,
+                               contains_weakptr=False):
         #
         # Case of finalizers (test constant-folded)
         if needs_finalizer:
@@ -267,7 +262,7 @@
     def _account_for_nursery(self, additional_size):
         self.nursery_size_still_available -= additional_size
         if self.nursery_size_still_available < 0:
-            self.trigger_collection_now()
+            self._nursery_full(additional_size)
     _account_for_nursery._always_inline_ = True
 
     # ----------
@@ -379,15 +374,63 @@
         self.get_mark(obj)
         self.extra_objects_to_mark.append(obj)
 
+    # ----------
 
-    def wait_for_the_end_of_collection(self):
+    def nursery_full(self, additional_size):
+        # See concurrentgen.txt.
+        #
+        assert self.nursery_size_still_available < 0
+        #
+        # Handle big allocations specially
+        if additional_size > intmask(self.total_memory_size >> 4):
+            xxxxxxxxxxxx
+            self.handle_big_allocation(additional_size)
+            return
+        #
+        if self.collector.running <= 0:
+            #
+            # The previous collection finished.  If necessary, synchronize
+            # the main thread with it.
+            self.sync_end_of_collection()
+            #
+            # Expand the nursery if we can, up to 25% of total_memory_size.
+            # In some cases, the limiting factor is that the nursery size
+            # plus the old objects size must not be larger than
+            # total_memory_size.
+            expand_to = self.total_memory_size >> 2
+            expand_to = min(expand_to, self.total_memory_size -
+                                       self.old_objects_size)
+            self.nursery_size_still_available += intmask(expand_to -
+                                                         self.nursery_size)
+            self.nursery_size = expand_to
+            #
+            # If 'nursery_size_still_available' has been increased to a
+            # nonnegative number, then we are done: we can just continue
+            # filling the nursery.
+            if self.nursery_size_still_available >= 0:
+                return
+            #
+            # Else, we trigger the next minor collection now.
+            self._start_minor_collection()
+            #
+            # Now there is no new object left.  Reset the nursery size
+            # to be 3/4*total_memory_size - old_objects_size, and no
+            # more than 25% of total_memory_size.
+            newsize = (self.total_memory_size >> 2) * 3 - self.old_objects_size
+            newsize = min(newsize, self.total_memory_size >> 2)
+            self.nursery_size = newsize
+            self.nursery_size_still_available = newsize
+            return
+
+        xxx
+
+
+    def sync_end_of_collection(self):
         """In the mutator thread: wait for the minor collection currently
-        running (if any) to finish."""
+        running (if any) to finish, and synchronize the two threads."""
         if self.collector.running != 0:
             debug_start("gc-stop")
             self._stop_collection()
-            debug_print("size_still_available_before_major =",
-                        self.size_still_available_before_major)
             debug_stop("gc-stop")
             #
             # We must *not* run execute_finalizers_ll() here, because it
@@ -397,6 +440,7 @@
             ll_assert(self.collector.running == 0,
                       "collector thread not paused?")
 
+
     def _stop_collection(self):
         self.acquire(self.finished_lock)
         self.collector.running = 0
@@ -435,6 +479,7 @@
 
 
     def collect(self, gen=4):
+        return
         """
         gen=0: Trigger a minor collection if none is running.  Never blocks,
         except if it happens to start a major collection.
diff --git a/pypy/rpython/memory/gc/concurrentgen.txt b/pypy/rpython/memory/gc/concurrentgen.txt
--- a/pypy/rpython/memory/gc/concurrentgen.txt
+++ b/pypy/rpython/memory/gc/concurrentgen.txt
@@ -5,173 +5,6 @@
 Goal: reduce the total real time by moving a part of the GC to its own
 thread that can run in parallel with the main execution thread.
 
-On current modern hardware with at least two cores, the two cores can
-read the same area of memory concurrently.  If one of the cores writes
-to this area, then I believe that the core doing the writing works at
-full speed, whereas the core doing the reading suffers from waiting for
-the data to move to it; but it's still ok because the data usually moves
-in a cache-to-cache bus, not via the main memory.  Also, if an area of
-memory is written to by one core, and then read and written to by the
-other core only, then performance is fine.  The bad case is the one in
-which both cores continously read and write the same area of memory.
-
-So, assuming that the main thread reads and writes to random objects all
-the time, it means that the GC thread should *only read* from the
-objects.  Conversely, the data structures built by the GC thread should
-only be *read* from the main thread.  In particular: when the GC thread
-does marking, it should use off-objects bits; and sweeping should be
-done by adding free objects to lists that are not chained lists.  In
-this way the GC thread never writes to the object's memory.  Similarly,
-for the same reason, the GC thread should not reset areas of memory to
-zero in the background.
-
-This goal is not reached so far: both threads read and write the object
-mark byte; there are no off-objects bits.
-
-
-************************************************************
-  Minor collection cycles of the "concurrentgen" collector
-************************************************************
-
-Objects mark byte:
-
-    cym: young objs (and all flagged objs)
-    cam: aging objs
-    com: old objs
-    'S': static prebuilt objs with no heap pointer
-
-cym = current_young_marker
-cam = current_aging_marker
-com = current_old_marker
-
-The write barrier activates when writing into an object whose
-mark byte is different from 'cym'.
-
-
-------------------------------------------------------------
-
-Step 1.  Only the mutator runs.
-
-   old obj    flagged obj     old obj
-                    |
-                    |
-                    v
-                young obj...
-
-Write barrier: change "old obj" to "flagged obj"
-    (if mark != cym:
-         mark = cym (used to be com or 'S')
-         record the object in the "flagged" list)
-    - note that we consider that flagged old objs are again young objects
-
-------------------------------------------------------------
-
-Step 2.  Preparation for running the collector.  (Still single-threaded.)
-
-   - young objs -> aging objs
-         (exchange the values of 'cam' and 'cym'.
-          there was no 'cam' object, so now there is no 'cym' object)
-
-   - collect roots; add roots and flagged objs to the "gray objs" list
-
-   - unflag objs (i.e. empty the "flagged" list)
-
-------------------------------------------------------------
-
-Step 3.  Parallel execution of the collector, mark phase
-
-   old obj    old obj     old obj
-
-         aging obj   aging obj
-
-   new young obj...
-
-
-Collector thread:
-
-    for each gray obj:
-        skip obj if not an aging obj    (i.e. if mark != cam: continue)
-        for each obj found by tracing:
-            add to gray objs      (if not an aging obj, will be skipped later)
-        gray obj -> black obj     (i.e. mark = com)
-
-Write barrier:
-
-   - perform as a "deletion barrier", detecting changes done to aging objs
-        (i.e. if mark == cam,
-                  mark = com
-                  trace and add to gray objs)
-   - also flag old-or-aging objs that point to new young objs
-        (if mark != cym:
-             mark = cym (used to be com or 'S')
-             record the object in the "flagged" list)
-
-Threading issues:
-
-   - it's possible that both threads will trace the same object, if we're
-     unlucky, but it does not have buggy effects
-   - the "mark = com" in the collector thread can conflict with the
-     "mark = cym" in the mutator write barrier, but again, it should not
-     have buggy effects beyond occasionally triggering the write barrier
-     twice on the same object, adding it twice in "flagged" (and never more)
-   - it is essential to have "mark = com" _after_ tracing in the collector
-     thread; otherwise, the write barrier in the mutator thread would be
-     ignored in case it occurs between the two, and then the tracing done
-     by the collector thread doesn't see the original values any more.
-   - the detection of "we are done" in the collector thread needs to
-     account for the write barrier currently tracing and adding more
-     objects to "gray objs".
-
-------------------------------------------------------------
-
-Step 4.  Parallel execution of the collector, sweep phase
-
-    for obj in previous nursery:
-        if obj is "black":     (i.e. if mark != cam)
-            make the obj old   (         nothing to do here, mark already ok)
-        else:
-            return the object to the available list
-    after this there are no more aging objects
-
-Write barrier:
-
-   - flag old objs that point to new young objs
-        (should not see any 'cam' object any more here)
-
-
-
-************************************************************
-  MAJOR collection cycles of the "concurrentgen" collector
-************************************************************
-
-Works mostly like a minor collection cycle.  The only difference
-is in step 2, which is replaced with:
-
-
-Step 2+.  Preparation for running a major collection.  (Still single-threaded.)
-
-   - force a minor collection's marking step to occur sequentially
-     (steps 2 and 3), to get rid of 'cym' objects.  Objects are left
-     either 'cam' (non-marked) or 'com' (marked).
-
-   - empty the "flagged" list
-
-   - collect roots; add roots to the "gray objs" list
-
-   - com <-> cam
-         (exchange the values of 'com' and 'cam'.
-          there are no 'cym' object right now.
-          the newly 'com' objects are the ones marked unreachable above.)
-
-
-Major collections only worry about old objects.  To avoid serializing
-the complete major collection, we serialize the minor collection's
-marking step that occurs first; the goal is to be sure that all objects
-are in the 'com' state.  We can minimize the non-parallelized delay
-introduced by this step by doing the major collection just after the
-previous minor collection finished, when the quantity of new young
-objects should still be small.
-
 
 
 ************************************************************
@@ -181,7 +14,13 @@
 The objects are never physically moving with this GC; in the pictures
 below, they "move" only in the sense that their age changes.
 
-Allocate new objects until 25% of the total RAM is reached:
+Objects have 4 possible ages: "new" when they are newly allocated;
+"aging" when they are in the process of being marked by the GC thread;
+"old" when they have survived a minor collection; and "static" for the
+static prebuilt GC objects, at least until they grow a pointer to a
+dynamic GC object.
+
+We allocate new objects until 25% of the total RAM is reached:
 
             25%       25%                50%
         +-----------+-----------+-----------------------+
@@ -298,3 +137,176 @@
 Additionally we fix an absolute minimum (at least 6 MB), to avoid doing
 a large number of tiny minor collections, ending up spending all of our
 time in Step 2 scanning the stack of the process.
+
+
+
+************************************************************
+  Notes about running two threads
+************************************************************
+
+On current modern hardware with at least two cores, the two cores can
+read the same area of memory concurrently.  If one of the cores writes
+to this area, then I believe that the core doing the writing works at
+full speed, whereas the core doing the reading suffers from waiting for
+the data to move to it; but it's still ok because the data usually moves
+in a cache-to-cache bus, not via the main memory.  Also, if an area of
+memory is written to by one core, and then read and written to by the
+other core only, then performance is fine.  The bad case is the one in
+which both cores continuously read and write the same area of memory.
+
+So, assuming that the main thread reads and writes to random objects all
+the time, it means that the GC thread should *only read* from the
+objects.  Conversely, the data structures built by the GC thread should
+only be *read* from the main thread.  In particular: when the GC thread
+does marking, it should use off-objects bits; and sweeping should be
+done by adding free objects to lists that are not chained lists.  In
+this way the GC thread never writes to the object's memory.  Similarly,
+for the same reason, the GC thread should not reset areas of memory to
+zero in the background.
+
+This goal is not reached so far: both threads read and write the object
+mark byte; there are no off-objects bits.
+
+
+************************************************************
+  Minor collection cycles of the "concurrentgen" collector
+************************************************************
+
+Objects mark byte:
+
+    cym: young objs (and all flagged objs)
+    cam: aging objs
+    com: old objs
+    'S': static prebuilt objs with no heap pointer
+
+cym = current_young_marker
+cam = current_aging_marker
+com = current_old_marker
+
+The write barrier activates when writing into an object whose
+mark byte is different from 'cym'.
+
+
+------------------------------------------------------------
+
+Step 1.  Only the mutator runs.
+
+   old obj    flagged obj     old obj
+                    |
+                    |
+                    v
+                young obj...
+
+Write barrier: change "old obj" to "flagged obj"
+    (if mark != cym:
+         mark = cym (used to be com or 'S')
+         record the object in the "flagged" list)
+    - note that we consider that flagged old objs are again young objects
+
+------------------------------------------------------------
+
+Step 2.  Preparation for running the collector.  (Still single-threaded.)
+
+   - young objs -> aging objs
+         (exchange the values of 'cam' and 'cym'.
+          there was no 'cam' object, so now there is no 'cym' object)
+
+   - collect roots; add roots and flagged objs to the "gray objs" list
+
+   - unflag objs (i.e. empty the "flagged" list)
+
+------------------------------------------------------------
+
+Step 3.  Parallel execution of the collector, mark phase
+
+   old obj    old obj     old obj
+
+         aging obj   aging obj
+
+   new young obj...
+
+
+Collector thread:
+
+    for each gray obj:
+        skip obj if not an aging obj    (i.e. if mark != cam: continue)
+        for each obj found by tracing:
+            add to gray objs      (if not an aging obj, will be skipped later)
+        gray obj -> black obj     (i.e. mark = com)
+
+Write barrier:
+
+   - act as a "deletion barrier", detecting changes made to aging objs
+        (i.e. if mark == cam,
+                  mark = com
+                  trace and add to gray objs)
+   - also flag old-or-aging objs that point to new young objs
+        (if mark != cym:
+             mark = cym (used to be com or 'S')
+             record the object in the "flagged" list)
+
+Threading issues:
+
+   - it's possible that both threads will trace the same object, if we're
+     unlucky, but it does not have buggy effects
+   - the "mark = com" in the collector thread can conflict with the
+     "mark = cym" in the mutator write barrier, but again, it should not
+     have buggy effects beyond occasionally triggering the write barrier
+     twice on the same object, adding it twice in "flagged" (and never more)
+   - it is essential to have "mark = com" _after_ tracing in the collector
+     thread; otherwise, the write barrier in the mutator thread would be
+     ignored in case it occurs between the two, and then the tracing done
+     by the collector thread doesn't see the original values any more.
+   - the detection of "we are done" in the collector thread needs to
+     account for the write barrier currently tracing and adding more
+     objects to "gray objs".
+
+------------------------------------------------------------
+
+Step 4.  Parallel execution of the collector, sweep phase
+
+    for obj in previous nursery:
+        if obj is "black":     (i.e. if mark != cam)
+            make the obj old   (         nothing to do here, mark already ok)
+        else:
+            return the object to the available list
+    after this there are no more aging objects
+
+Write barrier:
+
+   - flag old objs that point to new young objs
+        (should not see any 'cam' object any more here)
+
+
+
+************************************************************
+  MAJOR collection cycles of the "concurrentgen" collector
+************************************************************
+
+Works mostly like a minor collection cycle.  The only difference
+is in step 2, which is replaced with:
+
+
+Step 2+.  Preparation for running a major collection.  (Still single-threaded.)
+
+   - force a minor collection's marking step to occur sequentially
+     (steps 2 and 3), to get rid of 'cym' objects.  Objects are left
+     either 'cam' (non-marked) or 'com' (marked).
+
+   - empty the "flagged" list
+
+   - collect roots; add roots to the "gray objs" list
+
+   - com <-> cam
+         (exchange the values of 'com' and 'cam'.
+          there are no 'cym' objects right now.
+          the newly 'com' objects are the ones marked unreachable above.)
+
+
+Major collections only worry about old objects.  To avoid serializing
+the complete major collection, we serialize the minor collection's
+marking step that occurs first; the goal is to be sure that all objects
+are in the 'com' state.  We can minimize the non-parallelized delay
+introduced by this step by doing the major collection just after the
+previous minor collection finished, when the quantity of new young
+objects should still be small.