[pypy-commit] stmgc gc-small-uniform: hg merge default

arigo noreply at buildbot.pypy.org
Sun Apr 13 18:25:13 CEST 2014


Author: Armin Rigo <arigo at tunes.org>
Branch: gc-small-uniform
Changeset: r1153:f4c49a88012e
Date: 2014-04-12 12:40 +0200
http://bitbucket.org/pypy/stmgc/changeset/f4c49a88012e/

Log:	hg merge default

diff --git a/c7/demo/Makefile b/c7/demo/Makefile
--- a/c7/demo/Makefile
+++ b/c7/demo/Makefile
@@ -17,7 +17,7 @@
 H_FILES = ../stmgc.h ../stm/*.h
 C_FILES = ../stmgc.c ../stm/*.c
 
-COMMON = -I.. -pthread -lrt -g -Wall -Werror
+COMMON = -I.. -pthread -lrt -g -Wall -Werror -DSTM_LARGEMALLOC_TEST
 
 
 # note that 'build' is partially optimized but still contains all asserts
diff --git a/c7/demo/demo_largemalloc.c b/c7/demo/demo_largemalloc.c
new file mode 100644
--- /dev/null
+++ b/c7/demo/demo_largemalloc.c
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "stmgc.h"
+#include "../stm/largemalloc.h"
+
+static inline double get_stm_time(void)
+{
+    struct timespec tp;
+    clock_gettime(CLOCK_MONOTONIC, &tp);
+    return tp.tv_sec + tp.tv_nsec * 0.000000001;
+}
+
+ssize_t stmcb_size_rounded_up(struct object_s *ob)
+{
+    abort();
+}
+
+void stmcb_trace(struct object_s *obj, void visit(object_t **))
+{
+    abort();
+}
+
+/************************************************************/
+
+#define ARENA_SIZE  (1024*1024*1024)
+
+static char *arena_data;
+extern bool (*_stm_largemalloc_keep)(char *data);   /* a hook for tests */
+void _stm_mutex_pages_lock(void);
+
+
+static bool keep_me(char *data) {
+    static bool last_answer = false;
+    last_answer = !last_answer;
+    return last_answer;
+}
+
+void timing(int scale)
+{
+    long limit = 1L << scale;
+    _stm_largemalloc_init_arena(arena_data, ARENA_SIZE);
+    double start = get_stm_time();
+
+    long i;
+    for (i = 0; i < limit; i++) {
+        _stm_large_malloc(16 + 8 * (i % 4));  /* may return NULL */
+    }
+    _stm_largemalloc_keep = keep_me;
+    _stm_largemalloc_sweep();
+    for (i = 0; i < limit; i++) {
+        _stm_large_malloc(16 + 8 * (i % 4));  /* may return NULL */
+    }
+
+    double stop = get_stm_time();
+    printf("scale %2d: %.9f\n", scale, stop - start);
+}
+
+
+
+int main(void)
+{
+    int i;
+    arena_data = malloc(ARENA_SIZE);
+    assert(arena_data != NULL);
+    _stm_mutex_pages_lock();
+    for (i = 0; i < 25; i++)
+        timing(i);
+    return 0;
+}
diff --git a/c7/gdb/gdb_stm.py b/c7/gdb/gdb_stm.py
new file mode 100644
--- /dev/null
+++ b/c7/gdb/gdb_stm.py
@@ -0,0 +1,49 @@
+""" Adds two built-in functions: $rfs(p=0) and $rgs(p=0).
+
+Each returns the number or address 'p', offset by the value of
+the %fs or %gs register in the current thread.
+
+Usage: you can for example add this line in your ~/.gdbinit:
+
+    python execfile('/path/to/gdb_stm.py')
+"""
+import gdb
+
+def gdb_function(func):
+    class Func(gdb.Function):
+        __doc__ = func.__doc__
+        invoke = staticmethod(func)
+    Func(func.__name__)
+
+# -------------------------------------------------------
+
+SEG_FS = 0x1003
+SEG_GS = 0x1004
+
+def get_segment_register(which):
+    v = gdb.parse_and_eval('(long*)malloc(8)')
+    L = gdb.lookup_type('long')
+    gdb.parse_and_eval('arch_prctl(%d, %d)' % (which, int(v.cast(L))))
+    result = int(v.dereference())
+    gdb.parse_and_eval('free(%d)' % (int(v.cast(L)),))
+    return result
+
+def rfsrgs(name, which):
+    seg = get_segment_register(which)
+    if name is None:
+        return seg
+    tp = name.type
+    if tp.code == gdb.TYPE_CODE_INT:
+        return name + seg
+    assert tp.code == gdb.TYPE_CODE_PTR
+    L = gdb.lookup_type('long')
+    return (name.cast(L) + seg).cast(tp)
+
+@gdb_function
+def rfs(name=None):
+    return rfsrgs(name, SEG_FS)
+
+@gdb_function
+def rgs(name=None):
+    return rfsrgs(name, SEG_GS)
+
diff --git a/c7/stm/atomic.h b/c7/stm/atomic.h
--- a/c7/stm/atomic.h
+++ b/c7/stm/atomic.h
@@ -36,4 +36,12 @@
 #endif
 
 
+#define spinlock_acquire(lock)                                          \
+    do { if (LIKELY(__sync_lock_test_and_set(&(lock), 1) == 0)) break;  \
+         spin_loop(); } while (1)
+#define spinlock_release(lock)                                          \
+    do { assert((lock) == 1);                                           \
+         __sync_lock_release(&(lock)); } while (0)
+
+
 #endif  /* _STM_ATOMIC_H */
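
For context, the spinlock_acquire()/spinlock_release() macros added above are thin wrappers around the GCC/clang __sync builtins. A minimal standalone sketch of the same pattern, with made-up names (demo_lock, demo_acquire, demo_release) and a plain busy-wait instead of stmgc's spin_loop():

    #include <assert.h>

    static char demo_lock = 0;              /* 0 = free, 1 = held */

    static void demo_acquire(void)
    {
        /* __sync_lock_test_and_set() returns the previous value:
           0 means we just took the lock, 1 means someone else holds it */
        while (__sync_lock_test_and_set(&demo_lock, 1) != 0)
            ;                               /* busy-wait until it is free */
    }

    static void demo_release(void)
    {
        assert(demo_lock == 1);
        __sync_lock_release(&demo_lock);    /* stores 0, release semantics */
    }
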
diff --git a/c7/stm/core.c b/c7/stm/core.c
--- a/c7/stm/core.c
+++ b/c7/stm/core.c
@@ -8,6 +8,23 @@
     memset(write_locks, 0, sizeof(write_locks));
 }
 
+#ifdef NDEBUG
+#define EVENTUALLY(condition)    /* nothing */
+#else
+#define EVENTUALLY(condition)                                   \
+    {                                                           \
+        if (!(condition)) {                                     \
+            int _i;                                             \
+            for (_i = 1; _i <= NB_SEGMENTS; _i++)               \
+                spinlock_acquire(lock_pages_privatizing[_i]);   \
+            if (!(condition))                                   \
+                stm_fatalerror("fails: " #condition);           \
+            for (_i = 1; _i <= NB_SEGMENTS; _i++)               \
+                spinlock_release(lock_pages_privatizing[_i]);   \
+        }                                                       \
+    }
+#endif
+
 static void check_flag_write_barrier(object_t *obj)
 {
     /* check that all copies of the object, apart from mine, have the
@@ -21,12 +38,7 @@
         if (i == STM_SEGMENT->segment_num)
             continue;
         o1 = (struct object_s *)REAL_ADDRESS(get_segment_base(i), obj);
-        if (!(o1->stm_flags & GCFLAG_WRITE_BARRIER)) {
-            mutex_pages_lock();  /* try again... */
-            if (!(o1->stm_flags & GCFLAG_WRITE_BARRIER))
-                stm_fatalerror("missing GCFLAG_WRITE_BARRIER");
-            mutex_pages_unlock();
-        }
+        EVENTUALLY(o1->stm_flags & GCFLAG_WRITE_BARRIER);
     }
 #endif
 }
@@ -271,7 +283,6 @@
        with synchronize_object_now() but I don't completely see how to
        improve...
     */
-    assert(_has_mutex_pages());
     assert(!_is_young(obj));
 
     uintptr_t start = (uintptr_t)obj;
@@ -326,10 +337,7 @@
     /* Copy around the version of 'obj' that lives in our own segment.
        It is first copied into the shared pages, and then into other
        segments' own private pages.
-
-       This must be called with the mutex_pages_lock!
     */
-    assert(_has_mutex_pages());
     assert(!_is_young(obj));
     assert(obj->stm_flags & GCFLAG_WRITE_BARRIER);
 
@@ -406,7 +414,7 @@
                     memcpy(dst, src, copy_size);
             }
             else {
-                assert(memcmp(dst, src, copy_size) == 0);  /* same page */
+                EVENTUALLY(memcmp(dst, src, copy_size) == 0);  /* same page */
             }
 
             for (i = 1; i <= NB_SEGMENTS; i++) {
@@ -425,7 +433,7 @@
                         memcpy(dst, src, copy_size);
                 }
                 else {
-                    assert(memcmp(dst, src, copy_size) == 0);  /* same page */
+                    EVENTUALLY(!memcmp(dst, src, copy_size));  /* same page */
                 }
             }
 
@@ -518,12 +526,10 @@
         major_collection_now_at_safe_point();
 
     /* synchronize overflow objects living in privatized pages */
-    mutex_pages_lock();
     push_overflow_objects_from_privatized_pages();
 
     /* synchronize modified old objects to other threads */
     push_modified_to_other_segments();
-    mutex_pages_unlock();
 
     /* update 'overflow_number' if needed */
     if (STM_PSEGMENT->overflow_number_has_been_used) {
diff --git a/c7/stm/core.h b/c7/stm/core.h
--- a/c7/stm/core.h
+++ b/c7/stm/core.h
@@ -35,8 +35,6 @@
 #define WRITELOCK_START       ((END_NURSERY_PAGE * 4096UL) >> 4)
 #define WRITELOCK_END         READMARKER_END
 
-#define SHADOW_STACK_SIZE     1000
-
 enum /* stm_flags */ {
     /* This flag is set on non-nursery objects.  It forces stm_write()
        to call _stm_write_slowpath().
diff --git a/c7/stm/forksupport.c b/c7/stm/forksupport.c
--- a/c7/stm/forksupport.c
+++ b/c7/stm/forksupport.c
@@ -70,7 +70,6 @@
 
     s_mutex_lock();
     synchronize_all_threads(STOP_OTHERS_UNTIL_MUTEX_UNLOCK);
-    mutex_pages_lock();
 
     /* Make a new mmap at some other address, but of the same size as
        the standard mmap at stm_object_pages
@@ -166,7 +165,6 @@
     fork_big_copy = NULL;
     bool was_in_transaction = fork_was_in_transaction;
 
-    mutex_pages_unlock();
     s_mutex_unlock();
 
     if (!was_in_transaction) {
@@ -203,7 +201,6 @@
 
     /* this new process contains no other thread, so we can
        just release these locks early */
-    mutex_pages_unlock();
     s_mutex_unlock();
 
     /* Move the copy of the mmap over the old one, overwriting it
diff --git a/c7/stm/gcpage.c b/c7/stm/gcpage.c
--- a/c7/stm/gcpage.c
+++ b/c7/stm/gcpage.c
@@ -32,18 +32,23 @@
     pages_initialize_shared((pages_addr - stm_object_pages) / 4096UL, num);
 }
 
+
+static int lock_growth_large = 0;
+
 static char *allocate_outside_nursery_large(uint64_t size)
 {
-    /* thread-safe: use the lock of pages.c to prevent any remapping
-       from occurring under our feet */
-    mutex_pages_lock();
-    increment_total_allocated(size + LARGE_MALLOC_OVERHEAD);
-
     /* Allocate the object with largemalloc.c from the lower addresses. */
     char *addr = _stm_large_malloc(size);
     if (addr == NULL)
         stm_fatalerror("not enough memory!");
 
+    if (LIKELY(addr + size <= uninitialized_page_start)) {
+        return addr;
+    }
+
+    /* uncommon case: need to initialize some more pages */
+    spinlock_acquire(lock_growth_large);
+
     if (addr + size > uninitialized_page_start) {
         uintptr_t npages;
         npages = (addr + size - uninitialized_page_start) / 4096UL;
@@ -53,11 +58,10 @@
             stm_fatalerror("out of memory!");   /* XXX */
         }
         setup_N_pages(uninitialized_page_start, npages);
+        __sync_synchronize();
         uninitialized_page_start += npages * 4096UL;
     }
-
-    mutex_pages_unlock();
-
+    spinlock_release(lock_growth_large);
     return addr;
 }
 
@@ -213,7 +217,6 @@
        total_allocated by 4096. */
 
     long i;
-    mutex_pages_lock();
 
     for (i = 1; i <= NB_SEGMENTS; i++) {
         /* The 'modified_old_objects' list gives the list of objects
@@ -263,7 +266,6 @@
     for (i = 1; i <= NB_SEGMENTS; i++) {
         major_restore_private_bits_for_modified_objects(i);
     }
-    mutex_pages_unlock();
 }
 
 
@@ -422,9 +424,7 @@
 
 static void sweep_large_objects(void)
 {
-    mutex_pages_lock();
     _stm_largemalloc_sweep();
-    mutex_pages_unlock();
 }
 
 static void clean_write_locks(void)
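
The rewritten allocate_outside_nursery_large() above uses a double-checked spinlock: the common case returns without taking any lock, and only the rare "need more initialized pages" path serializes, re-checks the boundary, and publishes the new boundary after a full barrier. A standalone sketch of that shape, with invented names (grow_lock, init_pages, boundary) rather than stmgc's:

    #include <stdint.h>

    static char grow_lock = 0;
    static char *boundary;                  /* first uninitialized byte */

    static void init_pages(char *start, uintptr_t npages) { /* ... */ }

    static char *grow_if_needed(char *addr, uintptr_t size)
    {
        if (addr + size <= boundary)
            return addr;                    /* fast path: no lock at all */

        while (__sync_lock_test_and_set(&grow_lock, 1)) ;   /* spin */
        if (addr + size > boundary) {       /* re-check under the lock */
            uintptr_t npages = (addr + size - boundary + 4095) / 4096;
            init_pages(boundary, npages);
            __sync_synchronize();           /* pages ready before publish */
            boundary += npages * 4096;
        }
        __sync_lock_release(&grow_lock);
        return addr;
    }
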
diff --git a/c7/stm/largemalloc.c b/c7/stm/largemalloc.c
--- a/c7/stm/largemalloc.c
+++ b/c7/stm/largemalloc.c
@@ -20,19 +20,25 @@
 #define LAST_BIN_INDEX(sz) ((sz) >= (3 << 18))
 
 typedef struct dlist_s {
-    struct dlist_s *next;   /* a doubly-linked list */
+    struct dlist_s *next;   /* a circular doubly-linked list */
     struct dlist_s *prev;
 } dlist_t;
 
+typedef struct ulist_s {
+    struct ulist_s *up;     /* a non-circular doubly-linked list */
+    struct ulist_s *down;
+} ulist_t;
+
 typedef struct malloc_chunk {
     size_t prev_size;     /* - if the previous chunk is free: size of its data
                              - otherwise, if this chunk is free: 1
                              - otherwise, 0. */
-    size_t size;          /* size of the data in this chunk,
-                             plus optionally the FLAG_SORTED */
+    size_t size;          /* size of the data in this chunk */
 
-    dlist_t d;            /* if free: a doubly-linked list */
+    dlist_t d;            /* if free: a doubly-linked list 'largebins' */
                           /* if not free: the user data starts here */
+    ulist_t u;            /* if free, if unsorted: up==UU_UNSORTED
+                             if free, if sorted: a doubly-linked list */
 
     /* The chunk has a total size of 'size'.  It is immediately followed
        in memory by another chunk.  This list ends with the last "chunk"
@@ -41,29 +47,22 @@
        one are considered "not free". */
 } mchunk_t;
 
-#define FLAG_SORTED          1
+#define UU_UNSORTED          ((ulist_t *) 1)
 #define THIS_CHUNK_FREE      1
 #define BOTH_CHUNKS_USED     0
 #define CHUNK_HEADER_SIZE    offsetof(struct malloc_chunk, d)
 #define END_MARKER           0xDEADBEEF
+#define MIN_ALLOC_SIZE       (sizeof(struct malloc_chunk) - CHUNK_HEADER_SIZE)
 
 #define chunk_at_offset(p, ofs)  ((mchunk_t *)(((char *)(p)) + (ofs)))
 #define data2chunk(p)            chunk_at_offset(p, -CHUNK_HEADER_SIZE)
+#define updown2chunk(p)          chunk_at_offset(p,                     \
+                                     -(CHUNK_HEADER_SIZE + sizeof(dlist_t)))
 
-static mchunk_t *next_chunk_s(mchunk_t *p)
+static mchunk_t *next_chunk(mchunk_t *p)
 {
-    assert(p->size & FLAG_SORTED);
-    return chunk_at_offset(p, CHUNK_HEADER_SIZE + p->size - FLAG_SORTED);
-}
-static mchunk_t *next_chunk_u(mchunk_t *p)
-{
-    assert(!(p->size & FLAG_SORTED));
     return chunk_at_offset(p, CHUNK_HEADER_SIZE + p->size);
 }
-static mchunk_t *next_chunk_a(mchunk_t *p)
-{
-    return chunk_at_offset(p, CHUNK_HEADER_SIZE + (p->size & ~FLAG_SORTED));
-}
 
 
 /* The free chunks are stored in "bins".  Each bin is a doubly-linked
@@ -76,36 +75,73 @@
    neighbors to ensure this.
 
    In each bin's doubly-linked list, chunks are sorted by their size in
-   decreasing order (if you start from 'd.next').  At the end of this
-   list are some unsorted chunks.  All unsorted chunks are after all
-   sorted chunks.  The flag 'FLAG_SORTED' distinguishes them.
+   decreasing order (if you follow 'largebins[n].next',
+   'largebins[n].next->next', etc.).  At the end of this list are some
+   unsorted chunks.  All unsorted chunks are after all sorted chunks.
+   Unsorted chunks are distinguished by having 'u.up == UU_UNSORTED'.
 
    Note that if the user always calls large_malloc() with a large
    enough argument, then the few bins corresponding to smaller values
    will never be sorted at all.  They are still populated with the
    fragments of space between bigger allocations.
+
+   Following the 'd' linked list, we get only one chunk of every size.
+   The additional chunks of a given size are linked "vertically" in
+   the secondary 'u' doubly-linked list.
+
+
+                            +-----+
+                            | 296 |
+                            +-----+
+                              ^ |
+                              | v
+                            +-----+     +-----+
+                            | 296 |     | 288 |
+                            +-----+     +-----+
+                              ^ |         ^ |     UU_UNSORTED
+                              | v         | v          |
+   largebins    +-----+     +-----+     +-----+     +-----+     largebins
+   [4].next <-> | 304 | <-> | 296 | <-> | 288 | <-> | 296 | <-> [4].prev
+                +-----+     +-----+     +-----+     +-----+
+
 */
 
-static dlist_t largebins[N_BINS];
-static mchunk_t *first_chunk, *last_chunk;
+
+static struct {
+    int lock;
+    mchunk_t *first_chunk, *last_chunk;
+    dlist_t largebins[N_BINS];
+} lm __attribute__((aligned(64)));
+
+
+static void lm_lock(void)
+{
+    spinlock_acquire(lm.lock);
+}
+
+static void lm_unlock(void)
+{
+    spinlock_release(lm.lock);
+}
 
 
 static void insert_unsorted(mchunk_t *new)
 {
     size_t index = LAST_BIN_INDEX(new->size) ? N_BINS - 1
                                              : largebin_index(new->size);
-    new->d.next = &largebins[index];
-    new->d.prev = largebins[index].prev;
+    new->d.next = &lm.largebins[index];
+    new->d.prev = lm.largebins[index].prev;
     new->d.prev->next = &new->d;
-    largebins[index].prev = &new->d;
-    assert(!(new->size & FLAG_SORTED));
+    new->u.up = UU_UNSORTED;
+    new->u.down = NULL;
+    lm.largebins[index].prev = &new->d;
 }
 
 static int compare_chunks(const void *vchunk1, const void *vchunk2)
 {
     /* sort by size */
-    const mchunk_t *chunk1 = (const mchunk_t *)vchunk1;
-    const mchunk_t *chunk2 = (const mchunk_t *)vchunk2;
+    mchunk_t *chunk1 = *(mchunk_t *const *)vchunk1;
+    mchunk_t *chunk2 = *(mchunk_t *const *)vchunk2;
     if (chunk1->size < chunk2->size)
         return -1;
     if (chunk1->size == chunk2->size)
@@ -114,13 +150,15 @@
         return +1;
 }
 
+#define MAX_STACK_COUNT  64
+
 static void really_sort_bin(size_t index)
 {
-    dlist_t *unsorted = largebins[index].prev;
-    dlist_t *end = &largebins[index];
+    dlist_t *unsorted = lm.largebins[index].prev;
+    dlist_t *end = &lm.largebins[index];
     dlist_t *scan = unsorted->prev;
     size_t count = 1;
-    while (scan != end && !(data2chunk(scan)->size & FLAG_SORTED)) {
+    while (scan != end && data2chunk(scan)->u.up == UU_UNSORTED) {
         scan = scan->prev;
         ++count;
     }
@@ -128,12 +166,20 @@
     scan->next = end;
 
     mchunk_t *chunk1;
-    mchunk_t *chunks[count];    /* dynamically-sized */
+    mchunk_t *chunk_array[MAX_STACK_COUNT];
+    mchunk_t **chunks = chunk_array;
+
     if (count == 1) {
         chunk1 = data2chunk(unsorted);   /* common case */
         count = 0;
     }
     else {
+        if (count > MAX_STACK_COUNT) {
+            chunks = malloc(count * sizeof(mchunk_t *));
+            if (chunks == NULL) {
+                stm_fatalerror("out of memory");   // XXX
+            }
+        }
         size_t i;
         for (i = 0; i < count; i++) {
             chunks[i] = data2chunk(unsorted);
@@ -144,55 +190,111 @@
 
         chunk1 = chunks[--count];
     }
-    chunk1->size |= FLAG_SORTED;
     size_t search_size = chunk1->size;
-    dlist_t *head = largebins[index].next;
+    dlist_t *head = lm.largebins[index].next;
 
     while (1) {
-        if (head == end || search_size >= data2chunk(head)->size) {
+        if (head == end || data2chunk(head)->size < search_size) {
             /* insert 'chunk1' here, before the current head */
             head->prev->next = &chunk1->d;
             chunk1->d.prev = head->prev;
             head->prev = &chunk1->d;
             chunk1->d.next = head;
-            if (count == 0)
-                break;    /* all done */
-            chunk1 = chunks[--count];
-            chunk1->size |= FLAG_SORTED;
-            search_size = chunk1->size;
+            chunk1->u.up = NULL;
+            chunk1->u.down = NULL;
+            head = &chunk1->d;
+        }
+        else if (data2chunk(head)->size == search_size) {
+            /* insert 'chunk1' vertically in the 'u' list */
+            ulist_t *uhead = &data2chunk(head)->u;
+            chunk1->u.up = uhead->up;
+            chunk1->u.down = uhead;
+            if (uhead->up != NULL)
+                uhead->up->down = &chunk1->u;
+            uhead->up = &chunk1->u;
+#ifndef NDEBUG
+            chunk1->d.next = (dlist_t *)0x42;   /* not used */
+            chunk1->d.prev = (dlist_t *)0x42;
+#endif
         }
         else {
             head = head->next;
+            continue;
         }
+        if (count == 0)
+            break;    /* all done */
+        chunk1 = chunks[--count];
+        search_size = chunk1->size;
     }
+
+    if (chunks != chunk_array)
+        free(chunks);
 }
 
 static void sort_bin(size_t index)
 {
-    dlist_t *last = largebins[index].prev;
-    if (last != &largebins[index] && !(data2chunk(last)->size & FLAG_SORTED))
+    dlist_t *last = lm.largebins[index].prev;
+    if (last != &lm.largebins[index] && data2chunk(last)->u.up == UU_UNSORTED)
         really_sort_bin(index);
 }
 
+static void unlink_chunk(mchunk_t *mscan)
+{
+    if (mscan->u.down != NULL) {
+        /* unlink mscan from the vertical list 'u' */
+        ulist_t *up   = mscan->u.up;
+        ulist_t *down = mscan->u.down;
+        down->up = up;
+        if (up != NULL) up->down = down;
+    }
+    else {
+        dlist_t *prev = mscan->d.prev;
+        dlist_t *next = mscan->d.next;
+        if (mscan->u.up == NULL || mscan->u.up == UU_UNSORTED) {
+            /* unlink mscan from the doubly-linked list 'd' */
+            next->prev = prev;
+            prev->next = next;
+        }
+        else {
+            /* relink in the 'd' list the item above me */
+            mchunk_t *above = updown2chunk(mscan->u.up);
+            next->prev = &above->d;
+            prev->next = &above->d;
+            above->d.next = next;
+            above->d.prev = prev;
+            above->u.down = NULL;
+        }
+    }
+}
+
 char *_stm_large_malloc(size_t request_size)
 {
     /* 'request_size' should already be a multiple of the word size here */
     assert((request_size & (sizeof(char *)-1)) == 0);
 
+    /* it can be very small, but we need to ensure a minimal size
+       (currently 32 bytes) */
+    if (request_size < MIN_ALLOC_SIZE)
+        request_size = MIN_ALLOC_SIZE;
+
+    lm_lock();
+
     size_t index = largebin_index(request_size);
     sort_bin(index);
 
     /* scan through the chunks of current bin in reverse order
        to find the smallest that fits. */
-    dlist_t *scan = largebins[index].prev;
-    dlist_t *end = &largebins[index];
+    dlist_t *scan = lm.largebins[index].prev;
+    dlist_t *end = &lm.largebins[index];
     mchunk_t *mscan;
     while (scan != end) {
         mscan = data2chunk(scan);
         assert(mscan->prev_size == THIS_CHUNK_FREE);
-        assert(next_chunk_s(mscan)->prev_size == mscan->size - FLAG_SORTED);
+        assert(next_chunk(mscan)->prev_size == mscan->size);
+        assert(IMPLY(mscan->d.prev != end,
+                     data2chunk(mscan->d.prev)->size > mscan->size));
 
-        if (mscan->size > request_size)
+        if (mscan->size >= request_size)
             goto found;
         scan = mscan->d.prev;
     }
@@ -201,31 +303,40 @@
        smallest item of the first non-empty bin, as it will be large
        enough. */
     while (++index < N_BINS) {
-        if (largebins[index].prev != &largebins[index]) {
+        if (lm.largebins[index].prev != &lm.largebins[index]) {
             /* non-empty bin. */
             sort_bin(index);
-            scan = largebins[index].prev;
-            end = &largebins[index];
+            scan = lm.largebins[index].prev;
             mscan = data2chunk(scan);
             goto found;
         }
     }
 
     /* not enough memory. */
+    lm_unlock();
     return NULL;
 
  found:
-    assert(mscan->size & FLAG_SORTED);
-    assert(mscan->size > request_size);
+    assert(mscan->size >= request_size);
+    assert(mscan->u.up != UU_UNSORTED);
 
-    /* unlink mscan from the doubly-linked list */
-    mscan->d.next->prev = mscan->d.prev;
-    mscan->d.prev->next = mscan->d.next;
+    if (mscan->u.up != NULL) {
+        /* fast path: grab the item that is just above, to avoid needing
+           to rearrange the 'd' list */
+        mchunk_t *above = updown2chunk(mscan->u.up);
+        ulist_t *two_above = above->u.up;
+        mscan->u.up = two_above;
+        if (two_above != NULL) two_above->down = &mscan->u;
+        mscan = above;
+    }
+    else {
+        unlink_chunk(mscan);
+    }
 
-    size_t remaining_size_plus_1 = mscan->size - request_size;
-    if (remaining_size_plus_1 <= sizeof(struct malloc_chunk)) {
-        next_chunk_s(mscan)->prev_size = BOTH_CHUNKS_USED;
-        request_size = mscan->size & ~FLAG_SORTED;
+    size_t remaining_size = mscan->size - request_size;
+    if (remaining_size < sizeof(struct malloc_chunk)) {
+        next_chunk(mscan)->prev_size = BOTH_CHUNKS_USED;
+        request_size = mscan->size;
     }
     else {
         /* only part of the chunk is being used; reduce the size
@@ -234,27 +345,35 @@
         mchunk_t *new = chunk_at_offset(mscan, CHUNK_HEADER_SIZE +
                                                request_size);
         new->prev_size = THIS_CHUNK_FREE;
-        size_t remaining_size = remaining_size_plus_1 - 1 - CHUNK_HEADER_SIZE;
-        new->size = remaining_size;
-        next_chunk_u(new)->prev_size = remaining_size;
+        size_t remaining_data_size = remaining_size - CHUNK_HEADER_SIZE;
+        new->size = remaining_data_size;
+        next_chunk(new)->prev_size = remaining_data_size;
         insert_unsorted(new);
     }
     mscan->size = request_size;
     mscan->prev_size = BOTH_CHUNKS_USED;
+    increment_total_allocated(request_size + LARGE_MALLOC_OVERHEAD);
+
+    lm_unlock();
 
     return (char *)&mscan->d;
 }
 
-void _stm_large_free(char *data)
+static void _large_free(mchunk_t *chunk)
 {
-    mchunk_t *chunk = data2chunk(data);
     assert((chunk->size & (sizeof(char *) - 1)) == 0);
     assert(chunk->prev_size != THIS_CHUNK_FREE);
 
+    /* 'size' is at least MIN_ALLOC_SIZE */
+    increment_total_allocated(-(chunk->size + LARGE_MALLOC_OVERHEAD));
+
 #ifndef NDEBUG
-    assert(chunk->size >= sizeof(dlist_t));
-    assert(chunk->size <= (((char *)last_chunk) - (char *)data));
-    memset(data, 0xDE, chunk->size);
+    {
+        char *data = (char *)&chunk->d;
+        assert(chunk->size >= sizeof(dlist_t));
+        assert(chunk->size <= (((char *)lm.last_chunk) - data));
+        memset(data, 0xDE, chunk->size);
+    }
 #endif
 
     /* try to merge with the following chunk in memory */
@@ -262,17 +381,15 @@
     mchunk_t *mscan = chunk_at_offset(chunk, msize);
 
     if (mscan->prev_size == BOTH_CHUNKS_USED) {
-        assert((mscan->size & ((sizeof(char *) - 1) & ~FLAG_SORTED)) == 0);
+        assert((mscan->size & (sizeof(char *) - 1)) == 0);
         mscan->prev_size = chunk->size;
     }
     else {
-        mscan->size &= ~FLAG_SORTED;
         size_t fsize = mscan->size;
         mchunk_t *fscan = chunk_at_offset(mscan, fsize + CHUNK_HEADER_SIZE);
 
         /* unlink the following chunk */
-        mscan->d.next->prev = mscan->d.prev;
-        mscan->d.prev->next = mscan->d.next;
+        unlink_chunk(mscan);
 #ifndef NDEBUG
         mscan->prev_size = (size_t)-258;  /* 0xfffffffffffffefe */
         mscan->size = (size_t)-515;       /* 0xfffffffffffffdfd */
@@ -296,15 +413,14 @@
         msize = chunk->prev_size + CHUNK_HEADER_SIZE;
         mscan = chunk_at_offset(chunk, -msize);
         assert(mscan->prev_size == THIS_CHUNK_FREE);
-        assert((mscan->size & ~FLAG_SORTED) == chunk->prev_size);
+        assert(mscan->size == chunk->prev_size);
 
         /* unlink the previous chunk */
-        mscan->d.next->prev = mscan->d.prev;
-        mscan->d.prev->next = mscan->d.next;
+        unlink_chunk(mscan);
 
         /* merge the two chunks */
         mscan->size = msize + chunk->size;
-        next_chunk_u(mscan)->prev_size = mscan->size;
+        next_chunk(mscan)->prev_size = mscan->size;
 
         assert(chunk->prev_size = (size_t)-1);
         assert(chunk->size = (size_t)-1);
@@ -314,18 +430,28 @@
     insert_unsorted(chunk);
 }
 
+void _stm_large_free(char *data)
+{
+    lm_lock();
+    _large_free(data2chunk(data));
+    lm_unlock();
+}
+
 
 void _stm_large_dump(void)
 {
-    char *data = ((char *)first_chunk) + 16;
+    lm_lock();
+    char *data = ((char *)lm.first_chunk) + 16;
     size_t prev_size_if_free = 0;
+    fprintf(stderr, "\n");
     while (1) {
-        fprintf(stderr, "[ %p: %zu\n", data - 16, *(size_t*)(data - 16));
+        assert((((uintptr_t)data) & 7) == 0);   /* alignment */
+        fprintf(stderr, "[ %p: %zu", data - 16, *(size_t*)(data - 16));
         if (prev_size_if_free == 0) {
             assert(*(size_t*)(data - 16) == THIS_CHUNK_FREE ||
                    *(size_t*)(data - 16) == BOTH_CHUNKS_USED);
             if (*(size_t*)(data - 16) == THIS_CHUNK_FREE)
-                prev_size_if_free = (*(size_t*)(data - 8)) & ~FLAG_SORTED;
+                prev_size_if_free = (*(size_t*)(data - 8));
         }
         else {
             assert(*(size_t*)(data - 16) == prev_size_if_free);
@@ -333,30 +459,33 @@
         }
         if (*(size_t*)(data - 8) == END_MARKER)
             break;
-        fprintf(stderr, "  %p: %zu ]", data - 8, *(size_t*)(data - 8));
         if (prev_size_if_free) {
-            fprintf(stderr, " (free %p / %p)\n",
-                    *(void **)data, *(void **)(data + 8));
+            fprintf(stderr, "        \t(up %p / down %p)",
+                    *(void **)(data + 16), *(void **)(data + 24));
+        }
+        fprintf(stderr, "\n  %p: %zu ]", data - 8, *(size_t*)(data - 8));
+        if (prev_size_if_free) {
+            fprintf(stderr, "\t(prev %p <-> next %p)\n",
+                    *(void **)(data + 8), *(void **)data);
         }
         else {
             fprintf(stderr, "\n");
         }
-        if (!prev_size_if_free)
-            assert(!((*(size_t*)(data - 8)) & FLAG_SORTED));
         assert(*(ssize_t*)(data - 8) >= 16);
-        data += (*(size_t*)(data - 8)) & ~FLAG_SORTED;
+        data += *(size_t*)(data - 8);
         data += 16;
     }
-    fprintf(stderr, "  %p: end. ]\n\n", data - 8);
-    assert(data - 16 == (char *)last_chunk);
+    fprintf(stderr, "\n  %p: end. ]\n\n", data - 8);
+    assert(data - 16 == (char *)lm.last_chunk);
+    lm_unlock();
 }
 
 char *_stm_largemalloc_data_start(void)
 {
-    return (char *)first_chunk;
+    return (char *)lm.first_chunk;
 }
 
-#ifdef STM_TESTS
+#ifdef STM_LARGEMALLOC_TEST
 bool (*_stm_largemalloc_keep)(char *data);   /* a hook for tests */
 #endif
 
@@ -364,87 +493,95 @@
 {
     int i;
     for (i = 0; i < N_BINS; i++) {
-        largebins[i].prev = &largebins[i];
-        largebins[i].next = &largebins[i];
+        lm.largebins[i].prev = &lm.largebins[i];
+        lm.largebins[i].next = &lm.largebins[i];
     }
 
     assert(data_size >= 2 * sizeof(struct malloc_chunk));
     assert((data_size & 31) == 0);
-    first_chunk = (mchunk_t *)data_start;
-    first_chunk->prev_size = THIS_CHUNK_FREE;
-    first_chunk->size = data_size - 2 * CHUNK_HEADER_SIZE;
-    last_chunk = chunk_at_offset(first_chunk, data_size - CHUNK_HEADER_SIZE);
-    last_chunk->prev_size = first_chunk->size;
-    last_chunk->size = END_MARKER;
-    assert(last_chunk == next_chunk_u(first_chunk));
+    lm.first_chunk = (mchunk_t *)data_start;
+    lm.first_chunk->prev_size = THIS_CHUNK_FREE;
+    lm.first_chunk->size = data_size - 2 * CHUNK_HEADER_SIZE;
+    lm.last_chunk = chunk_at_offset(lm.first_chunk,
+                                    data_size - CHUNK_HEADER_SIZE);
+    lm.last_chunk->prev_size = lm.first_chunk->size;
+    lm.last_chunk->size = END_MARKER;
+    assert(lm.last_chunk == next_chunk(lm.first_chunk));
+    lm.lock = 0;
 
-    insert_unsorted(first_chunk);
+    insert_unsorted(lm.first_chunk);
 
-#ifdef STM_TESTS
+#ifdef STM_LARGEMALLOC_TEST
     _stm_largemalloc_keep = NULL;
 #endif
 }
 
 int _stm_largemalloc_resize_arena(size_t new_size)
 {
+    int result = 0;
+    lm_lock();
+
     if (new_size < 2 * sizeof(struct malloc_chunk))
-        return 0;
+        goto fail;
     OPT_ASSERT((new_size & 31) == 0);
 
     new_size -= CHUNK_HEADER_SIZE;
-    mchunk_t *new_last_chunk = chunk_at_offset(first_chunk, new_size);
-    mchunk_t *old_last_chunk = last_chunk;
-    size_t old_size = ((char *)old_last_chunk) - (char *)first_chunk;
+    mchunk_t *new_last_chunk = chunk_at_offset(lm.first_chunk, new_size);
+    mchunk_t *old_last_chunk = lm.last_chunk;
+    size_t old_size = ((char *)old_last_chunk) - (char *)lm.first_chunk;
 
     if (new_size < old_size) {
         /* check if there is enough free space at the end to allow
            such a reduction */
-        size_t lsize = last_chunk->prev_size;
+        size_t lsize = lm.last_chunk->prev_size;
         assert(lsize != THIS_CHUNK_FREE);
         if (lsize == BOTH_CHUNKS_USED)
-            return 0;
+            goto fail;
         lsize += CHUNK_HEADER_SIZE;
-        mchunk_t *prev_chunk = chunk_at_offset(last_chunk, -lsize);
+        mchunk_t *prev_chunk = chunk_at_offset(lm.last_chunk, -lsize);
         if (((char *)new_last_chunk) < ((char *)prev_chunk) +
                                        sizeof(struct malloc_chunk))
-            return 0;
+            goto fail;
 
         /* unlink the prev_chunk from the doubly-linked list */
-        prev_chunk->d.next->prev = prev_chunk->d.prev;
-        prev_chunk->d.prev->next = prev_chunk->d.next;
+        unlink_chunk(prev_chunk);
 
         /* reduce the prev_chunk */
-        assert((prev_chunk->size & ~FLAG_SORTED) == last_chunk->prev_size);
+        assert(prev_chunk->size == lm.last_chunk->prev_size);
         prev_chunk->size = ((char*)new_last_chunk) - (char *)prev_chunk
                            - CHUNK_HEADER_SIZE;
 
         /* make a fresh-new last chunk */
         new_last_chunk->prev_size = prev_chunk->size;
         new_last_chunk->size = END_MARKER;
-        last_chunk = new_last_chunk;
-        assert(last_chunk == next_chunk_u(prev_chunk));
+        lm.last_chunk = new_last_chunk;
+        assert(lm.last_chunk == next_chunk(prev_chunk));
 
         insert_unsorted(prev_chunk);
     }
     else if (new_size > old_size) {
         /* make the new last chunk first, with only the extra size */
-        mchunk_t *old_last_chunk = last_chunk;
+        mchunk_t *old_last_chunk = lm.last_chunk;
         old_last_chunk->size = (new_size - old_size) - CHUNK_HEADER_SIZE;
         new_last_chunk->prev_size = BOTH_CHUNKS_USED;
         new_last_chunk->size = END_MARKER;
-        last_chunk = new_last_chunk;
-        assert(last_chunk == next_chunk_u(old_last_chunk));
+        lm.last_chunk = new_last_chunk;
+        assert(lm.last_chunk == next_chunk(old_last_chunk));
 
         /* then free the last_chunk (turn it from "used" to "free) */
-        _stm_large_free((char *)&old_last_chunk->d);
+        _large_free(old_last_chunk);
     }
-    return 1;
+
+    result = 1;
+ fail:
+    lm_unlock();
+    return result;
 }
 
 
 static inline bool _largemalloc_sweep_keep(mchunk_t *chunk)
 {
-#ifdef STM_TESTS
+#ifdef STM_LARGEMALLOC_TEST
     if (_stm_largemalloc_keep != NULL)
         return _stm_largemalloc_keep((char *)&chunk->d);
 #endif
@@ -453,31 +590,32 @@
 
 void _stm_largemalloc_sweep(void)
 {
-    /* This may be slightly optimized by inlining _stm_large_free() and
+    lm_lock();
+
+    /* This may be slightly optimized by inlining _large_free() and
        making cases, e.g. we might know already if the previous block
        was free or not.  It's probably not really worth it. */
-    mchunk_t *mnext, *chunk = first_chunk;
+    mchunk_t *mnext, *chunk = lm.first_chunk;
 
     if (chunk->prev_size == THIS_CHUNK_FREE)
-        chunk = next_chunk_a(chunk);   /* go to the first non-free chunk */
+        chunk = next_chunk(chunk);   /* go to the first non-free chunk */
 
-    while (chunk != last_chunk) {
-
+    while (chunk != lm.last_chunk) {
         /* here, the chunk we're pointing to is not free */
         assert(chunk->prev_size != THIS_CHUNK_FREE);
 
         /* first figure out the next non-free chunk */
-        mnext = next_chunk_u(chunk);
+        mnext = next_chunk(chunk);
         if (mnext->prev_size == THIS_CHUNK_FREE)
-            mnext = next_chunk_a(mnext);
+            mnext = next_chunk(mnext);
 
         /* use the callback to know if 'chunk' contains an object that
            survives or dies */
         if (!_largemalloc_sweep_keep(chunk)) {
-            size_t size = chunk->size;
-            increment_total_allocated(-(size + LARGE_MALLOC_OVERHEAD));
-            _stm_large_free((char *)&chunk->d);     /* dies */
+            _large_free(chunk);     /* dies */
         }
         chunk = mnext;
     }
+
+    lm_unlock();
 }
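
The reorganized free lists above keep only one chunk of each size on the horizontal 'd' list and stack further chunks of the same size on the vertical 'u' list, as the comment and diagram earlier in largemalloc.c describe. A toy model of just that stacking step, with simplified node fields that do not match stmgc's real chunk layout:

    #include <stddef.h>

    typedef struct node_s {
        size_t size;
        struct node_s *next, *prev;   /* horizontal list: one node per size */
        struct node_s *up, *down;     /* vertical stack: same-size duplicates */
    } node_t;

    /* 'same' is already on the horizontal list and has the same size as
       'n': stack 'n' just above it instead of lengthening the horizontal
       list, so a by-size scan only ever visits one node per size. */
    static void stack_duplicate(node_t *same, node_t *n)
    {
        n->down = same;
        n->up = same->up;
        if (same->up != NULL)
            same->up->down = n;
        same->up = n;
        n->next = n->prev = NULL;     /* 'n' is reachable only vertically */
    }
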
diff --git a/c7/stm/misc.c b/c7/stm/misc.c
--- a/c7/stm/misc.c
+++ b/c7/stm/misc.c
@@ -75,19 +75,6 @@
 
 uint64_t _stm_total_allocated(void)
 {
-    mutex_pages_lock();
-    uint64_t result = increment_total_allocated(0);
-    mutex_pages_unlock();
-    return result;
-}
-
-void _stm_mutex_pages_lock(void)
-{
-    mutex_pages_lock();
-}
-
-void _stm_mutex_pages_unlock(void)
-{
-    mutex_pages_unlock();
+    return increment_total_allocated(0);
 }
 #endif
diff --git a/c7/stm/nursery.c b/c7/stm/nursery.c
--- a/c7/stm/nursery.c
+++ b/c7/stm/nursery.c
@@ -193,9 +193,7 @@
                content); or add the object to 'large_overflow_objects'.
             */
             if (STM_PSEGMENT->minor_collect_will_commit_now) {
-                mutex_pages_lock();
                 synchronize_object_now(obj);
-                mutex_pages_unlock();
             }
             else
                 LIST_APPEND(STM_PSEGMENT->large_overflow_objects, obj);
@@ -231,23 +229,13 @@
 
     /* free any object left from 'young_outside_nursery' */
     if (!tree_is_cleared(pseg->young_outside_nursery)) {
-        bool locked = false;
         wlog_t *item;
+
         TREE_LOOP_FORWARD(*pseg->young_outside_nursery, item) {
             assert(!_is_in_nursery((object_t *)item->addr));
-            if (!locked) {
-                mutex_pages_lock();
-                locked = true;
-            }
-            char *realobj = REAL_ADDRESS(pseg->pub.segment_base, item->addr);
-            ssize_t size = stmcb_size_rounded_up((struct object_s *)realobj);
-            increment_total_allocated(-(size + LARGE_MALLOC_OVERHEAD));
             _stm_large_free(stm_object_pages + item->addr);
         } TREE_LOOP_END;
 
-        if (locked)
-            mutex_pages_unlock();
-
         tree_clear(pseg->young_outside_nursery);
     }
 
diff --git a/c7/stm/pages.c b/c7/stm/pages.c
--- a/c7/stm/pages.c
+++ b/c7/stm/pages.c
@@ -5,16 +5,12 @@
 
 /************************************************************/
 
-static union {
-    struct {
-        uint8_t mutex_pages;
-        volatile bool major_collection_requested;
-        uint64_t total_allocated;  /* keep track of how much memory we're
-                                      using, ignoring nurseries */
-        uint64_t total_allocated_bound;
-    };
-    char reserved[64];
-} pages_ctl __attribute__((aligned(64)));
+struct {
+    volatile bool major_collection_requested;
+    uint64_t total_allocated;  /* keep track of how much memory we're
+                                  using, ignoring nurseries */
+    uint64_t total_allocated_bound;
+} pages_ctl;
 
 
 static void setup_pages(void)
@@ -28,37 +24,15 @@
     memset(pages_privatized, 0, sizeof(pages_privatized));
 }
 
-static void mutex_pages_lock(void)
-{
-    if (__sync_lock_test_and_set(&pages_ctl.mutex_pages, 1) == 0)
-        return;
-
-    int previous = change_timing_state(STM_TIME_SPIN_LOOP);
-    while (__sync_lock_test_and_set(&pages_ctl.mutex_pages, 1) != 0) {
-        spin_loop();
-    }
-    change_timing_state(previous);
-}
-
-static void mutex_pages_unlock(void)
-{
-    __sync_lock_release(&pages_ctl.mutex_pages);
-}
-
-static bool _has_mutex_pages(void)
-{
-    return pages_ctl.mutex_pages != 0;
-}
-
 static uint64_t increment_total_allocated(ssize_t add_or_remove)
 {
-    assert(_has_mutex_pages());
-    pages_ctl.total_allocated += add_or_remove;
+    uint64_t ta = __sync_add_and_fetch(&pages_ctl.total_allocated,
+                                       add_or_remove);
 
-    if (pages_ctl.total_allocated >= pages_ctl.total_allocated_bound)
+    if (ta >= pages_ctl.total_allocated_bound)
         pages_ctl.major_collection_requested = true;
 
-    return pages_ctl.total_allocated;
+    return ta;
 }
 
 static bool is_major_collection_requested(void)
@@ -95,6 +69,17 @@
              (void *)((addr - stm_object_pages) % (4096UL * NB_PAGES)),
              (long)pgoff / NB_PAGES,
              (void *)((pgoff % NB_PAGES) * 4096UL)));
+    assert(size % 4096 == 0);
+    assert(size <= TOTAL_MEMORY);
+    assert(((uintptr_t)addr) % 4096 == 0);
+    assert(addr >= stm_object_pages);
+    assert(addr <= stm_object_pages + TOTAL_MEMORY - size);
+    assert(pgoff >= 0);
+    assert(pgoff <= (TOTAL_MEMORY - size) / 4096UL);
+
+    /* assert remappings follow the rule that page N in one segment
+       can only be remapped to page N in another segment */
+    assert(((addr - stm_object_pages) / 4096UL - pgoff) % NB_PAGES == 0);
 
     int res = remap_file_pages(addr, size, 0, pgoff, 0);
     if (UNLIKELY(res < 0))
@@ -106,10 +91,12 @@
     /* call remap_file_pages() to make all pages in the range(pagenum,
        pagenum+count) refer to the same physical range of pages from
        segment 0. */
-    uintptr_t i;
-    assert(_has_mutex_pages());
+    dprintf(("pages_initialize_shared: 0x%ld - 0x%ld\n", pagenum,
+             pagenum + count));
+    assert(pagenum < NB_PAGES);
     if (count == 0)
         return;
+    uintptr_t i;
     for (i = 1; i <= NB_SEGMENTS; i++) {
         char *segment_base = get_segment_base(i);
         d_remap_file_pages(segment_base + pagenum * 4096UL,
@@ -119,14 +106,20 @@
 
 static void page_privatize(uintptr_t pagenum)
 {
-    if (is_private_page(STM_SEGMENT->segment_num, pagenum)) {
-        /* the page is already privatized */
+    /* check this thread's 'pages_privatized' bit */
+    uint64_t bitmask = 1UL << (STM_SEGMENT->segment_num - 1);
+    struct page_shared_s *ps = &pages_privatized[pagenum - PAGE_FLAG_START];
+    if (ps->by_segment & bitmask) {
+        /* the page is already privatized; nothing to do */
         return;
     }
 
-    /* lock, to prevent concurrent threads from looking up this thread's
-       'pages_privatized' bits in parallel */
-    mutex_pages_lock();
+#ifndef NDEBUG
+    spinlock_acquire(lock_pages_privatizing[STM_SEGMENT->segment_num]);
+#endif
+
+    /* add this thread's 'pages_privatized' bit */
+    __sync_fetch_and_add(&ps->by_segment, bitmask);
 
     /* "unmaps" the page to make the address space location correspond
        again to its underlying file offset (XXX later we should again
@@ -140,11 +133,9 @@
     /* copy the content from the shared (segment 0) source */
     pagecopy(new_page, stm_object_pages + pagenum * 4096UL);
 
-    /* add this thread's 'pages_privatized' bit */
-    uint64_t bitmask = 1UL << (STM_SEGMENT->segment_num - 1);
-    pages_privatized[pagenum - PAGE_FLAG_START].by_segment |= bitmask;
-
-    mutex_pages_unlock();
+#ifndef NDEBUG
+    spinlock_release(lock_pages_privatizing[STM_SEGMENT->segment_num]);
+#endif
 }
 
 static void _page_do_reshare(long segnum, uintptr_t pagenum)
diff --git a/c7/stm/pages.h b/c7/stm/pages.h
--- a/c7/stm/pages.h
+++ b/c7/stm/pages.h
@@ -34,6 +34,20 @@
 };
 
 static struct page_shared_s pages_privatized[PAGE_FLAG_END - PAGE_FLAG_START];
+/* Rules for concurrent access to this array, possibly with is_private_page():
+
+   - we clear bits only during major collection, when all threads are
+     synchronized anyway
+
+   - we set only the bit corresponding to our segment number, using
+     an atomic addition; and we do it _before_ we actually make the
+     page private.
+
+   - concurrently, other threads checking the bits might (rarely)
+     get the answer 'true' to is_private_page() even though it is not
+     actually private yet.  This inconsistency is in the direction
+     that we want for synchronize_object_now().
+*/
 
 static void pages_initialize_shared(uintptr_t pagenum, uintptr_t count);
 static void page_privatize(uintptr_t pagenum);
@@ -41,10 +55,6 @@
 static void _page_do_reshare(long segnum, uintptr_t pagenum);
 static void pages_setup_readmarkers_for_nursery(void);
 
-/* Note: don't ever do "mutex_pages_lock(); mutex_lock()" in that order */
-static void mutex_pages_lock(void);
-static void mutex_pages_unlock(void);
-static bool _has_mutex_pages(void) __attribute__((unused));
 static uint64_t increment_total_allocated(ssize_t add_or_remove);
 static bool is_major_collection_requested(void);
 static void force_major_collection_request(void);
@@ -62,3 +72,7 @@
     if (pages_privatized[pagenum - PAGE_FLAG_START].by_segment != 0)
         page_reshare(pagenum);
 }
+
+#ifndef NDEBUG
+static char lock_pages_privatizing[NB_SEGMENTS + 1] = { 0 };
+#endif
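
The access rules documented above amount to a one-sided publication protocol: each segment atomically sets only its own bit, before the page actually becomes private, so readers may see the bit slightly early, which is the harmless direction. A minimal sketch of that protocol with hypothetical names (page_bits, publish_private, is_private), not stmgc's actual data layout:

    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t page_bits;          /* one bit per segment, 1 = private */

    /* writer: segment 'seg' (1-based) announces the privatization first.
       The bit is known not to be set yet, so an atomic or (or add, as in
       page_privatize()) publishes it without disturbing other bits. */
    static void publish_private(int seg)
    {
        __sync_fetch_and_or(&page_bits, 1UL << (seg - 1));
        /* ...then actually remap and copy the page... */
    }

    /* reader: may answer 'true' a moment before the page is really
       private -- acceptable for synchronize_object_now()'s purposes */
    static bool is_private(int seg)
    {
        return (page_bits & (1UL << (seg - 1))) != 0;
    }
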
diff --git a/c7/stm/setup.c b/c7/stm/setup.c
--- a/c7/stm/setup.c
+++ b/c7/stm/setup.c
@@ -9,7 +9,7 @@
                         PROT_READ | PROT_WRITE,
                         MAP_PAGES_FLAGS, -1, 0);
     if (result == MAP_FAILED)
-        stm_fatalerror("%s failed: %m\n", reason);
+        stm_fatalerror("%s failed: %m", reason);
 
     return result;
 }
@@ -132,17 +132,37 @@
     teardown_pages();
 }
 
+static void _shadowstack_trap_page(char *start, int prot)
+{
+    size_t bsize = STM_SHADOW_STACK_DEPTH * sizeof(struct stm_shadowentry_s);
+    char *end = start + bsize + 4095;
+    end -= (((uintptr_t)end) & 4095);
+    mprotect(end, 4096, prot);
+}
+
 static void _init_shadow_stack(stm_thread_local_t *tl)
 {
-    struct stm_shadowentry_s *s = (struct stm_shadowentry_s *)
-        malloc(SHADOW_STACK_SIZE * sizeof(struct stm_shadowentry_s));
-    assert(s);
+    size_t bsize = STM_SHADOW_STACK_DEPTH * sizeof(struct stm_shadowentry_s);
+    char *start = malloc(bsize + 8192);  /* for the trap page, plus rounding */
+    if (!start)
+        stm_fatalerror("can't allocate shadow stack");
+
+    /* set up a trap page: if the shadowstack overflows, it will
+       crash in a clean segfault */
+    _shadowstack_trap_page(start, PROT_NONE);
+
+    struct stm_shadowentry_s *s = (struct stm_shadowentry_s *)start;
     tl->shadowstack = s;
     tl->shadowstack_base = s;
 }
 
 static void _done_shadow_stack(stm_thread_local_t *tl)
 {
+    assert(tl->shadowstack >= tl->shadowstack_base);
+
+    char *start = (char *)tl->shadowstack_base;
+    _shadowstack_trap_page(start, PROT_READ | PROT_WRITE);
+
     free(tl->shadowstack_base);
     tl->shadowstack = NULL;
     tl->shadowstack_base = NULL;
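
The new shadow-stack setup above reserves a PROT_NONE trap page just past the useful area so that an overflow faults immediately instead of silently corrupting memory, and _done_shadow_stack() re-protects it before free(). A small standalone sketch of the same guard-page trick, assuming 4096-byte pages and using made-up names:

    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/mman.h>

    /* Allocate 'useful' bytes plus slack, and turn the first whole page
       after the useful area into an inaccessible trap page.  Before
       free()ing the block, the page must be mprotect()ed back to
       PROT_READ|PROT_WRITE, as _done_shadow_stack() does. */
    static char *alloc_with_guard(size_t useful)
    {
        char *start = malloc(useful + 2 * 4096);    /* room for alignment */
        if (start == NULL)
            return NULL;
        char *guard = start + useful + 4095;
        guard -= ((uintptr_t)guard) & 4095;         /* round up to a page */
        mprotect(guard, 4096, PROT_NONE);           /* overflow => SIGSEGV */
        return start;
    }
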
diff --git a/c7/stm/timing.c b/c7/stm/timing.c
--- a/c7/stm/timing.c
+++ b/c7/stm/timing.c
@@ -55,7 +55,6 @@
     "minor gc",
     "major gc",
     "sync pause",
-    "spin loop",
 };
 
 void stm_flush_timing(stm_thread_local_t *tl, int verbose)
diff --git a/c7/stm/weakref.c b/c7/stm/weakref.c
--- a/c7/stm/weakref.c
+++ b/c7/stm/weakref.c
@@ -34,7 +34,7 @@
     stm_char *point_to_loc = (stm_char*)WEAKREF_PTR(weakref, size);
 
     long i;
-    for (i = 1; i <= NB_SEGMENTS; i++) {
+    for (i = 0; i <= NB_SEGMENTS; i++) {
         char *base = get_segment_base(i);
         object_t ** ref_loc = (object_t **)REAL_ADDRESS(base, point_to_loc);
         *ref_loc = value;
@@ -57,11 +57,14 @@
                a young outside nursery object. */
             assert(_is_in_nursery(item));
             object_t *TLPREFIX *pforwarded_array = (object_t *TLPREFIX *)item;
+            ssize_t size = 16;
 
-            /* the following checks are done like in nursery.c: */
-            if (!(item->stm_flags & GCFLAG_HAS_SHADOW)
-                || (pforwarded_array[0] != GCWORD_MOVED)) {
-                /* weakref dies */
+            /* check if the weakref object was moved out of the nursery */
+            if (pforwarded_array[0] != GCWORD_MOVED) {
+                /* no: weakref dies */
+#ifndef NDEBUG
+                *WEAKREF_PTR(item, size) = (object_t *)-99;
+#endif
                 continue;
             }
 
@@ -69,15 +72,13 @@
 
             assert(!_is_young(item));
 
-            ssize_t size = 16;
             object_t *pointing_to = *WEAKREF_PTR(item, size);
             assert(pointing_to != NULL);
 
             if (_is_in_nursery(pointing_to)) {
                 object_t *TLPREFIX *pforwarded_array = (object_t *TLPREFIX *)pointing_to;
-                /* the following checks are done like in nursery.c: */
-                if (!(pointing_to->stm_flags & GCFLAG_HAS_SHADOW)
-                    || (pforwarded_array[0] != GCWORD_MOVED)) {
+                /* check if the target was moved out of the nursery */
+                if (pforwarded_array[0] != GCWORD_MOVED) {
                     /* pointing_to dies */
                     _set_weakref_in_all_segments(item, NULL);
                     continue;   /* no need to remember in old_weakrefs */
@@ -96,7 +97,9 @@
                     _set_weakref_in_all_segments(item, NULL);
                     continue;   /* no need to remember in old_weakrefs */
                 }
-                /* pointing_to was already old */
+                /* pointing_to is either a surviving young object outside
+                   the nursery, or it was already old; in both cases keeping
+                   the currently stored pointer is what we need */
             }
             LIST_APPEND(STM_PSEGMENT->old_weakrefs, item);
         }));
@@ -128,7 +131,7 @@
             stm_char *wr = (stm_char *)WEAKREF_PTR(weakref, size);
             char *real_wr = REAL_ADDRESS(pseg->pub.segment_base, wr);
             object_t *pointing_to = *(object_t **)real_wr;
-            assert(pointing_to != NULL);
+            assert((uintptr_t)pointing_to >= NURSERY_END);
             if (!mark_visited_test(pointing_to)) {
                 //assert(flag_page_private[(uintptr_t)weakref / 4096UL] != PRIVATE_PAGE);
                 _set_weakref_in_all_segments(weakref, NULL);
diff --git a/c7/stmgc.h b/c7/stmgc.h
--- a/c7/stmgc.h
+++ b/c7/stmgc.h
@@ -70,7 +70,6 @@
     STM_TIME_MINOR_GC,
     STM_TIME_MAJOR_GC,
     STM_TIME_SYNC_PAUSE,
-    STM_TIME_SPIN_LOOP,
     _STM_TIME_N
 };
 
@@ -136,8 +135,6 @@
 object_t *_stm_enum_modified_old_objects(long index);
 object_t *_stm_enum_objects_pointing_to_nursery(long index);
 uint64_t _stm_total_allocated(void);
-void _stm_mutex_pages_lock(void);
-void _stm_mutex_pages_unlock(void);
 char *stm_object_pages;
 #endif
 
@@ -262,6 +259,10 @@
 void stm_setup(void);
 void stm_teardown(void);
 
+/* The size of each shadow stack, in number of entries.
+   Must be big enough to accommodate all STM_PUSH_ROOTs! */
+#define STM_SHADOW_STACK_DEPTH   163840
+
 /* Push and pop roots from/to the shadow stack. Only allowed inside
    transaction. */
 #define STM_PUSH_ROOT(tl, p)   ((tl).shadowstack++->ss = (object_t *)(p))
diff --git a/c7/test/support.py b/c7/test/support.py
--- a/c7/test/support.py
+++ b/c7/test/support.py
@@ -96,8 +96,6 @@
 
 void stm_collect(long level);
 uint64_t _stm_total_allocated(void);
-void _stm_mutex_pages_lock(void);
-void _stm_mutex_pages_unlock(void);
 
 long stm_identityhash(object_t *obj);
 long stm_id(object_t *obj);
@@ -279,6 +277,7 @@
 
 ''', sources=source_files,
      define_macros=[('STM_TESTS', '1'),
+                    ('STM_LARGEMALLOC_TEST', '1'),
                     ('STM_NO_COND_WAIT', '1'),
                     ('STM_DEBUGPRINT', '1'),
                     ('GC_N_SMALL_REQUESTS', str(GC_N_SMALL_REQUESTS)), #check
diff --git a/c7/test/test_largemalloc.py b/c7/test/test_largemalloc.py
--- a/c7/test/test_largemalloc.py
+++ b/c7/test/test_largemalloc.py
@@ -14,10 +14,12 @@
 
         lib.memset(self.rawmem, 0xcd, self.size)
         lib._stm_largemalloc_init_arena(self.rawmem, self.size)
-        lib._stm_mutex_pages_lock()   # for this file
 
     def test_simple(self):
+        #
+        lib._stm_large_dump()
         d1 = lib._stm_large_malloc(7000)
+        lib._stm_large_dump()
         d2 = lib._stm_large_malloc(8000)
         print d1
         print d2
@@ -70,7 +72,7 @@
         lib._stm_large_dump()
 
     def test_resize_arena_reduce_2(self):
-        lib._stm_large_malloc(self.size // 2 - 64)
+        lib._stm_large_malloc(self.size // 2 - 80)
         r = lib._stm_largemalloc_resize_arena(self.size // 2)
         assert r == 1
         lib._stm_large_dump()
@@ -120,7 +122,7 @@
                 p.append((d, sz, content1, content2))
         lib._stm_large_dump()
 
-    def test_random_largemalloc_sweep(self):
+    def test_random_largemalloc_sweep(self, constrained_size_range=False):
         @ffi.callback("bool(char *)")
         def keep(data):
             try:
@@ -138,7 +140,11 @@
 
         r = random.Random(1000)
         for j in range(500):
-            sizes = [random.choice(range(104, 500, 8)) for i in range(20)]
+            if constrained_size_range:
+                max = 120
+            else:
+                max = 500
+            sizes = [random.choice(range(104, max, 8)) for i in range(20)]
             all = [lib._stm_large_malloc(size) for size in sizes]
             print all
 
@@ -170,3 +176,6 @@
                     assert all[i][50] == chr(65 + i)
                 else:
                     assert all_orig[i][50] == '\xDE'
+
+    def test_random_largemalloc_sweep_constrained_size_range(self):
+        self.test_random_largemalloc_sweep(constrained_size_range=True)
diff --git a/c7/test/test_weakref.py b/c7/test/test_weakref.py
--- a/c7/test/test_weakref.py
+++ b/c7/test/test_weakref.py
@@ -360,3 +360,40 @@
         self.switch(1)
         make_wr()
         stm_major_collect()
+
+
+class TestManyThreads(BaseTest):
+    NB_THREADS = NB_SEGMENTS
+
+    def test_weakref_bug3(self):
+        # make an object
+        self.start_transaction()
+        lp0 = stm_allocate(16)
+        self.push_root(lp0)
+        self.commit_transaction()
+        lp0 = self.pop_root()
+        self.push_root(lp0)
+        #
+        # privatize the page in all segments
+        for i in range(NB_SEGMENTS-1, -1, -1):
+            self.switch(i)
+            self.start_transaction()
+            stm_set_char(lp0, 'A')
+            self.commit_transaction()
+        #
+        self.start_transaction()
+        lp2 = stm_allocate(16)
+        self.push_root(lp2)
+        lp1 = stm_allocate_weakref(lp2)
+        self.push_root(lp1)
+        self.commit_transaction()
+        lp1 = self.pop_root()
+        lp2 = self.pop_root()
+        self.push_root(lp2)
+        self.push_root(lp1)
+        # the commit copies the weakref to all segments, but misses
+        # segment #0
+        #
+        self.start_transaction()
+        stm_major_collect()    # reshare all, keeping only segment #0
+        assert stm_get_weakref(lp1) == lp2

