[pypy-commit] pypy concurrent-marksweep: Start implementing the new section.
arigo
noreply at buildbot.pypy.org
Fri Jan 6 18:59:47 CET 2012
Author: Armin Rigo <arigo at tunes.org>
Branch: concurrent-marksweep
Changeset: r51074:719f30faa809
Date: 2012-01-02 22:42 +0100
http://bitbucket.org/pypy/pypy/changeset/719f30faa809/
Log: Start implementing the new section.
diff --git a/pypy/rpython/memory/gc/concurrentgen.py b/pypy/rpython/memory/gc/concurrentgen.py
--- a/pypy/rpython/memory/gc/concurrentgen.py
+++ b/pypy/rpython/memory/gc/concurrentgen.py
@@ -39,7 +39,7 @@
# let us know if the 'tid' is valid or is just a word-aligned address):
MARK_BYTE_1 = 0x6D # 'm', 109
MARK_BYTE_2 = 0x4B # 'K', 75
-MARK_BYTE_3 = 0x23 # '#', 35
+MARK_BYTE_3 = 0x25 # '%', 37
MARK_BYTE_STATIC = 0x53 # 'S', 83
# Next lower byte: a combination of flags.
FL_WITHHASH = 0x0100
@@ -75,24 +75,17 @@
# The default size of the nursery: use 6 MB by default.
# Environment variable: PYPY_GC_NURSERY
"nursery_size": 6*1024*1024,
-
- # Trigger another major collection when 'N+(F-1)*P' bytes survived
- # minor collections, where N = nursery_size, P = bytes surviving
- # the previous major collection, and F is the fill_factor here.
- # Environment variable: PYPY_GC_MAJOR_COLLECT
- "fill_factor": 1.75,
}
def __init__(self, config,
read_from_env=False,
nursery_size=32*WORD,
- fill_factor=2.0,
+ fill_factor=2.0, # xxx kill
**kwds):
GCBase.__init__(self, config, **kwds)
self.read_from_env = read_from_env
self.nursery_size = nursery_size
- self.fill_factor = fill_factor
#
self.main_thread_ident = ll_thread.get_ident() # non-transl. debug only
#
@@ -110,12 +103,12 @@
# that was not scanned yet.
self._init_writebarrier_logic()
#
- def trigger_collection_now():
+ def _nursery_full(additional_size):
# a hack to reduce the code size in _account_for_nursery():
- # avoids both 'self' and the default argument value to be passed
- self.trigger_next_collection()
- trigger_collection_now._dont_inline_ = True
- self.trigger_collection_now = trigger_collection_now
+ # avoids the 'self' argument.
+ self.nursery_full(additional_size)
+ _nursery_full._dont_inline_ = True
+ self._nursery_full = _nursery_full
def _initialize(self):
# Initialize the GC. In normal translated program, this function
@@ -163,25 +156,25 @@
#
self.collector.setup()
#
- self.set_nursery_size(self.nursery_size)
+ self.set_minimal_nursery_size(self.nursery_size)
if self.read_from_env:
#
newsize = env.read_from_env('PYPY_GC_NURSERY')
if newsize > 0:
- self.set_nursery_size(newsize)
- #
- fillfact = env.read_float_from_env('PYPY_GC_MAJOR_COLLECT')
- if fillfact > 1.0:
- self.fill_factor = fillfact
+ self.set_minimal_nursery_size(newsize)
#
- debug_print("nursery size:", self.nursery_size)
- debug_print("fill factor: ", self.fill_factor)
+ debug_print("minimal nursery size:", self.minimal_nursery_size)
debug_stop("gc-startup")
- def set_nursery_size(self, newsize):
- self.nursery_size = newsize
- self.nursery_size_still_available = newsize
- self.size_still_available_before_major = newsize
+ def set_minimal_nursery_size(self, newsize):
+ # See concurrentgen.txt. At the start of the process, 'newsize' is
+ # a quarter of the total memory size.
+ newsize = min(newsize, (sys.maxint - 65535) // 4)
+ self.minimal_nursery_size = r_uint(newsize)
+ self.total_memory_size = r_uint(4 * newsize) # total size
+ self.nursery_size = r_uint(newsize) # size of the '->new...' box
+ self.old_objects_size = r_uint(0) # size of the 'old objs' box
+ self.nursery_size_still_available = intmask(self.nursery_size)
def _teardown(self):
"Stop the collector thread after tests have run."
@@ -208,7 +201,9 @@
hdr.tid = self.combine(typeid, MARK_BYTE_STATIC, 0)
def malloc_fixedsize_clear(self, typeid, size,
- needs_finalizer=False, contains_weakptr=False):
+ needs_finalizer=False,
+ finalizer_is_light=False,
+ contains_weakptr=False):
#
# Case of finalizers (test constant-folded)
if needs_finalizer:
@@ -267,7 +262,7 @@
def _account_for_nursery(self, additional_size):
self.nursery_size_still_available -= additional_size
if self.nursery_size_still_available < 0:
- self.trigger_collection_now()
+ self._nursery_full(additional_size)
_account_for_nursery._always_inline_ = True
# ----------
@@ -379,15 +374,63 @@
self.get_mark(obj)
self.extra_objects_to_mark.append(obj)
+ # ----------
- def wait_for_the_end_of_collection(self):
+ def nursery_full(self, additional_size):
+ # See concurrentgen.txt.
+ #
+ assert self.nursery_size_still_available < 0
+ #
+ # Handle big allocations specially
+ if additional_size > intmask(self.total_memory_size >> 4):
+ xxxxxxxxxxxx
+ self.handle_big_allocation(additional_size)
+ return
+ #
+ if self.collector.running <= 0:
+ #
+ # The previous collection finished. If necessary, synchronize
+ # the main thread with it.
+ self.sync_end_of_collection()
+ #
+ # Expand the nursery if we can, up to 25% of total_memory_size.
+ # In some cases, the limiting factor is that the nursery size
+ # plus the old objects size must not be larger than
+ # total_memory_size.
+ expand_to = self.total_memory_size >> 2
+ expand_to = min(expand_to, self.total_memory_size -
+ self.old_objects_size)
+ self.nursery_size_still_available += intmask(expand_to -
+ self.nursery_size)
+ self.nursery_size = expand_to
+ #
+ # If 'nursery_size_still_available' has been increased to a
+ # nonnegative number, then we are done: we can just continue
+ # filling the nursery.
+ if self.nursery_size_still_available >= 0:
+ return
+ #
+ # Else, we trigger the next minor collection now.
+ self._start_minor_collection()
+ #
+ # Now there is no new object left. Reset the nursery size
+ # to be 3/4*total_memory_size - old_objects_size, and no
+ # more than 25% of total_memory_size.
+ newsize = (self.total_memory_size >> 2) * 3 - self.old_objects_size
+ newsize = min(newsize, self.total_memory_size >> 2)
+ self.nursery_size = newsize
+ self.nursery_size_still_available = newsize
+ return
+
+ xxx
+
+
+ def sync_end_of_collection(self):
"""In the mutator thread: wait for the minor collection currently
- running (if any) to finish."""
+ running (if any) to finish, and synchronize the two threads."""
if self.collector.running != 0:
debug_start("gc-stop")
self._stop_collection()
- debug_print("size_still_available_before_major =",
- self.size_still_available_before_major)
debug_stop("gc-stop")
#
# We must *not* run execute_finalizers_ll() here, because it
@@ -397,6 +440,7 @@
ll_assert(self.collector.running == 0,
"collector thread not paused?")
+
def _stop_collection(self):
self.acquire(self.finished_lock)
self.collector.running = 0
@@ -435,6 +479,7 @@
def collect(self, gen=4):
+ return
"""
gen=0: Trigger a minor collection if none is running. Never blocks,
except if it happens to start a major collection.
diff --git a/pypy/rpython/memory/gc/concurrentgen.txt b/pypy/rpython/memory/gc/concurrentgen.txt
--- a/pypy/rpython/memory/gc/concurrentgen.txt
+++ b/pypy/rpython/memory/gc/concurrentgen.txt
@@ -5,173 +5,6 @@
Goal: reduce the total real time by moving a part of the GC to its own
thread that can run in parallel with the main execution thread.
-On current modern hardware with at least two cores, the two cores can
-read the same area of memory concurrently. If one of the cores writes
-to this area, then I believe that the core doing the writing works at
-full speed, whereas the core doing the reading suffers from waiting for
-the data to move to it; but it's still ok because the data usually moves
-in a cache-to-cache bus, not via the main memory. Also, if an area of
-memory is written to by one core, and then read and written to by the
-other core only, then performance is fine. The bad case is the one in
-which both cores continously read and write the same area of memory.
-
-So, assuming that the main thread reads and writes to random objects all
-the time, it means that the GC thread should *only read* from the
-objects. Conversely, the data structures built by the GC thread should
-only be *read* from the main thread. In particular: when the GC thread
-does marking, it should use off-objects bits; and sweeping should be
-done by adding free objects to lists that are not chained lists. In
-this way the GC thread never writes to the object's memory. Similarly,
-for the same reason, the GC thread should not reset areas of memory to
-zero in the background.
-
-This goal is not reached so far: both threads read and write the object
-mark byte; there are no off-objects bits.
-
-
-************************************************************
- Minor collection cycles of the "concurrentgen" collector
-************************************************************
-
-Objects mark byte:
-
- cym: young objs (and all flagged objs)
- cam: aging objs
- com: old objs
- 'S': static prebuilt objs with no heap pointer
-
-cym = current_young_marker
-cam = current_aging_marker
-com = current_old_marker
-
-The write barrier activates when writing into an object whose
-mark byte is different from 'cym'.
-
-
-------------------------------------------------------------
-
-Step 1. Only the mutator runs.
-
- old obj flagged obj old obj
- |
- |
- v
- young obj...
-
-Write barrier: change "old obj" to "flagged obj"
- (if mark != cym:
- mark = cym (used to be com or 'S')
- record the object in the "flagged" list)
- - note that we consider that flagged old objs are again young objects
-
-------------------------------------------------------------
-
-Step 2. Preparation for running the collector. (Still single-threaded.)
-
- - young objs -> aging objs
- (exchange the values of 'cam' and 'cym'.
- there was no 'cam' object, so now there is no 'cym' object)
-
- - collect roots; add roots and flagged objs to the "gray objs" list
-
- - unflag objs (i.e. empty the "flagged" list)
-
-------------------------------------------------------------
-
-Step 3. Parallel execution of the collector, mark phase
-
- old obj old obj old obj
-
- aging obj aging obj
-
- new young obj...
-
-
-Collector thread:
-
- for each gray obj:
- skip obj if not an aging obj (i.e. if mark != cam: continue)
- for each obj found by tracing:
- add to gray objs (if not an aging obj, will be skipped later)
- gray obj -> black obj (i.e. mark = com)
-
-Write barrier:
-
- - perform as a "deletion barrier", detecting changes done to aging objs
- (i.e. if mark == cam,
- mark = com
- trace and add to gray objs)
- - also flag old-or-aging objs that point to new young objs
- (if mark != cym:
- mark = cym (used to be com or 'S')
- record the object in the "flagged" list)
-
-Threading issues:
-
- - it's possible that both threads will trace the same object, if we're
- unlucky, but it does not have buggy effects
- - the "mark = com" in the collector thread can conflict with the
- "mark = cym" in the mutator write barrier, but again, it should not
- have buggy effects beyond occasionally triggering the write barrier
- twice on the same object, adding it twice in "flagged" (and never more)
- - it is essential to have "mark = com" _after_ tracing in the collector
- thread; otherwise, the write barrier in the mutator thread would be
- ignored in case it occurs between the two, and then the tracing done
- by the collector thread doesn't see the original values any more.
- - the detection of "we are done" in the collector thread needs to
- account for the write barrier currently tracing and adding more
- objects to "gray objs".
-
-------------------------------------------------------------
-
-Step 4. Parallel execution of the collector, sweep phase
-
- for obj in previous nursery:
- if obj is "black": (i.e. if mark != cam)
- make the obj old ( nothing to do here, mark already ok)
- else:
- return the object to the available list
- after this there are no more aging objects
-
-Write barrier:
-
- - flag old objs that point to new young objs
- (should not see any 'cam' object any more here)
-
-
-
-************************************************************
- MAJOR collection cycles of the "concurrentgen" collector
-************************************************************
-
-Works mostly like a minor collection cycle. The only difference
-is in step 2, which is replaced with:
-
-
-Step 2+. Preparation for running a major collection. (Still single-threaded.)
-
- - force a minor collection's marking step to occur sequentially
- (steps 2 and 3), to get rid of 'cym' objects. Objects are left
- either 'cam' (non-marked) or 'com' (marked).
-
- - empty the "flagged" list
-
- - collect roots; add roots to the "gray objs" list
-
- - com <-> cam
- (exchange the values of 'com' and 'cam'.
- there are no 'cym' object right now.
- the newly 'com' objects are the ones marked unreachable above.)
-
-
-Major collections only worry about old objects. To avoid serializing
-the complete major collection, we serialize the minor collection's
-marking step that occurs first; the goal is to be sure that all objects
-are in the 'com' state. We can minimize the non-parallelized delay
-introduced by this step by doing the major collection just after the
-previous minor collection finished, when the quantity of new young
-objects should still be small.
-
************************************************************
@@ -181,7 +14,13 @@
The objects are never physically moving with this GC; in the pictures
below, they "move" only in the sense that their age changes.
-Allocate new objects until 25% of the total RAM is reached:
+Objects have 4 possible ages: "new" when they are newly allocated;
+"aging" when they are in the process of being marked by the GC thread;
+"old" when they have survived a minor collection; and "static" for the
+static prebuilt GC objects, at least until they grow a pointer to a
+dynamic GC object.
+
+We allocate new objects until 25% of the total RAM is reached:
25% 25% 50%
+-----------+-----------+-----------------------+
@@ -298,3 +137,176 @@
Additionally we fix an absolute minimum (at least 6 MB), to avoid doing
a large number of tiny minor collections, ending up spending all of our
time in Step 2 scanning the stack of the process.
+
+
+
+************************************************************
+ Notes about running two threads
+************************************************************
+
+On current modern hardware with at least two cores, the two cores can
+read the same area of memory concurrently. If one of the cores writes
+to this area, then I believe that the core doing the writing works at
+full speed, whereas the core doing the reading suffers from waiting for
+the data to move to it; but it's still ok because the data usually moves
+in a cache-to-cache bus, not via the main memory. Also, if an area of
+memory is written to by one core, and then read and written to by the
+other core only, then performance is fine. The bad case is the one in
+which both cores continuously read and write the same area of memory.
+
+So, assuming that the main thread reads and writes to random objects all
+the time, it means that the GC thread should *only read* from the
+objects. Conversely, the data structures built by the GC thread should
+only be *read* from the main thread. In particular: when the GC thread
+does marking, it should use off-objects bits; and sweeping should be
+done by adding free objects to lists that are not chained lists. In
+this way the GC thread never writes to the object's memory. Similarly,
+for the same reason, the GC thread should not reset areas of memory to
+zero in the background.
+
+This goal is not reached so far: both threads read and write the object
+mark byte; there are no off-objects bits.
+
+
+************************************************************
+ Minor collection cycles of the "concurrentgen" collector
+************************************************************
+
+Objects mark byte:
+
+ cym: young objs (and all flagged objs)
+ cam: aging objs
+ com: old objs
+ 'S': static prebuilt objs with no heap pointer
+
+cym = current_young_marker
+cam = current_aging_marker
+com = current_old_marker
+
+The write barrier activates when writing into an object whose
+mark byte is different from 'cym'.
+
+
+------------------------------------------------------------
+
+Step 1. Only the mutator runs.
+
+ old obj flagged obj old obj
+ |
+ |
+ v
+ young obj...
+
+Write barrier: change "old obj" to "flagged obj"
+ (if mark != cym:
+ mark = cym (used to be com or 'S')
+ record the object in the "flagged" list)
+ - note that we consider that flagged old objs are again young objects
+
+------------------------------------------------------------
+
+Step 2. Preparation for running the collector. (Still single-threaded.)
+
+ - young objs -> aging objs
+ (exchange the values of 'cam' and 'cym'.
+ there was no 'cam' object, so now there is no 'cym' object)
+
+ - collect roots; add roots and flagged objs to the "gray objs" list
+
+ - unflag objs (i.e. empty the "flagged" list)
+
+------------------------------------------------------------
+
+Step 3. Parallel execution of the collector, mark phase
+
+ old obj old obj old obj
+
+ aging obj aging obj
+
+ new young obj...
+
+
+Collector thread:
+
+ for each gray obj:
+ skip obj if not an aging obj (i.e. if mark != cam: continue)
+ for each obj found by tracing:
+ add to gray objs (if not an aging obj, will be skipped later)
+ gray obj -> black obj (i.e. mark = com)
+
+Write barrier:
+
+ - perform as a "deletion barrier", detecting changes done to aging objs
+ (i.e. if mark == cam,
+ mark = com
+ trace and add to gray objs)
+ - also flag old-or-aging objs that point to new young objs
+ (if mark != cym:
+ mark = cym (used to be com or 'S')
+ record the object in the "flagged" list)
+
+Threading issues:
+
+ - it's possible that both threads will trace the same object, if we're
+ unlucky, but it does not have buggy effects
+ - the "mark = com" in the collector thread can conflict with the
+ "mark = cym" in the mutator write barrier, but again, it should not
+ have buggy effects beyond occasionally triggering the write barrier
+ twice on the same object, adding it twice in "flagged" (and never more)
+ - it is essential to have "mark = com" _after_ tracing in the collector
+ thread; otherwise, the write barrier in the mutator thread would be
+ ignored in case it occurs between the two, and then the tracing done
+ by the collector thread doesn't see the original values any more.
+ - the detection of "we are done" in the collector thread needs to
+ account for the write barrier currently tracing and adding more
+ objects to "gray objs".
+
+------------------------------------------------------------
+
+Step 4. Parallel execution of the collector, sweep phase
+
+ for obj in previous nursery:
+ if obj is "black": (i.e. if mark != cam)
+ make the obj old ( nothing to do here, mark already ok)
+ else:
+ return the object to the available list
+ after this there are no more aging objects
+
+Write barrier:
+
+ - flag old objs that point to new young objs
+ (should not see any 'cam' object any more here)
+
+
+
+************************************************************
+ MAJOR collection cycles of the "concurrentgen" collector
+************************************************************
+
+Works mostly like a minor collection cycle. The only difference
+is in step 2, which is replaced with:
+
+
+Step 2+. Preparation for running a major collection. (Still single-threaded.)
+
+ - force a minor collection's marking step to occur sequentially
+ (steps 2 and 3), to get rid of 'cym' objects. Objects are left
+ either 'cam' (non-marked) or 'com' (marked).
+
+ - empty the "flagged" list
+
+ - collect roots; add roots to the "gray objs" list
+
+ - com <-> cam
+ (exchange the values of 'com' and 'cam'.
+ there is no 'cym' object right now.
+ the newly 'com' objects are the ones marked unreachable above.)
+
+
+Major collections only worry about old objects. To avoid serializing
+the complete major collection, we serialize the minor collection's
+marking step that occurs first; the goal is to be sure that all objects
+are in the 'com' state. We can minimize the non-parallelized delay
+introduced by this step by doing the major collection just after the
+previous minor collection finished, when the quantity of new young
+objects should still be small.
More information about the pypy-commit
mailing list