[pypy-commit] stmgc c8-private-pages: start with largemalloc support (WIP)

Raemi noreply at buildbot.pypy.org
Wed Jan 14 14:08:42 CET 2015


Author: Remi Meier <remi.meier at inf.ethz.ch>
Branch: c8-private-pages
Changeset: r1525:349d0e3910ea
Date: 2015-01-14 13:41 +0100
http://bitbucket.org/pypy/stmgc/changeset/349d0e3910ea/

Log:	start with largemalloc support (WIP)

diff --git a/c8/stm/core.c b/c8/stm/core.c
--- a/c8/stm/core.c
+++ b/c8/stm/core.c
@@ -144,11 +144,18 @@
             most_recent_rev = log_entry->rev_num;
         }
     }
-    OPT_ASSERT(copy_from_segnum != -1 && copy_from_segnum != my_segnum);
+    OPT_ASSERT(copy_from_segnum != my_segnum);
 
-    /* make our page private */
+    /* make our page write-ready */
     page_mark_accessible(my_segnum, pagenum);
-    assert(get_page_status_in(my_segnum, pagenum) == PAGE_ACCESSIBLE);
+
+    if (copy_from_segnum == -1) {
+        /* this page is so far only accessible in the sharing segment (it
+           is a fresh allocation).  We can thus simply mark it accessible
+           here and do not need to copy any contents. */
+        release_all_privatization_locks();
+        return;
+    }
 
     /* before copying anything, acquire modification locks from our and
        the other segment */
diff --git a/c8/stm/gcpage.c b/c8/stm/gcpage.c
--- a/c8/stm/gcpage.c
+++ b/c8/stm/gcpage.c
@@ -2,27 +2,47 @@
 # error "must be compiled via stmgc.c"
 #endif
 
+static struct list_s *testing_prebuilt_objs = NULL;
+static struct tree_s *tree_prebuilt_objs = NULL;     /* XXX refactor */
+
 
 static void setup_gcpage(void)
 {
+    char *base = stm_object_pages + END_NURSERY_PAGE * 4096UL;
+    uintptr_t length = (NB_PAGES - END_NURSERY_PAGE) * 4096UL;
+    _stm_largemalloc_init_arena(base, length);
+
     uninitialized_page_start = stm_object_pages + END_NURSERY_PAGE * 4096UL;
     uninitialized_page_stop  = uninitialized_page_start + NB_SHARED_PAGES * 4096UL;
 }
 
 static void teardown_gcpage(void)
 {
+    LIST_FREE(testing_prebuilt_objs);
+    if (tree_prebuilt_objs != NULL) {
+        tree_free(tree_prebuilt_objs);
+        tree_prebuilt_objs = NULL;
+    }
 }
 
+
+
 static void setup_N_pages(char *pages_addr, long num)
 {
-    /* initialize to |N|P|N|N| */
+    /* make the pages accessible in the sharing segment only (they are
+       already PROT_READ/WRITE, see setup.c, but not yet marked as
+       accessible in the page status). */
+
+    /* acquiring the locks may not be necessary because the affected
+       pages don't need privatization protection (but there is an
+       assert right now that enforces holding them XXXXXX) */
     acquire_all_privatization_locks();
 
     uintptr_t p = (pages_addr - stm_object_pages) / 4096UL;
     dprintf(("setup_N_pages(%p, %lu): pagenum %lu\n", pages_addr, num, p));
     while (num-->0) {
         /* XXX: page_range_mark_accessible() */
-        page_mark_accessible(STM_SEGMENT->segment_num, p + num);
+        page_mark_accessible(0, p + num);
     }
 
     release_all_privatization_locks();
@@ -33,14 +53,23 @@
 
 static stm_char *allocate_outside_nursery_large(uint64_t size)
 {
-    /* XXX: real allocation */
+    /* Allocate the object with largemalloc.c from the lower addresses. */
+    char *addr = _stm_large_malloc(size);
+    if (addr == NULL)
+        stm_fatalerror("not enough memory!");
+
+    if (LIKELY(addr + size <= uninitialized_page_start))
+        return (stm_char*)(addr - stm_object_pages);
+
+
+    /* uncommon case: need to initialize some more pages */
     spinlock_acquire(lock_growth_large);
-    char *addr = uninitialized_page_start;
 
     char *start = uninitialized_page_start;
-    if (addr + size > start) {  /* XXX: always for now */
+    if (addr + size > start) {
         uintptr_t npages;
-        npages = (addr + size - start) / 4096UL + 1;
+        npages = (addr + size - start) / 4096UL;
+        npages += GCPAGE_NUM_PAGES;
         if (uninitialized_page_stop - start < npages * 4096UL) {
             stm_fatalerror("out of memory!");   /* XXX */
         }
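
The new fast path above returns a segment-relative offset rather than a raw
pointer: _stm_large_malloc() hands back a real "char *" inside the sharing
segment, and allocate_outside_nursery_large() converts it by subtracting
stm_object_pages.  A minimal sketch of that conversion, assuming that
stm_object_pages is the base address of segment 0 (which is what makes
"addr - stm_object_pages" a segment-relative offset); the helper names
below are made up for illustration:

    /* illustrative only -- not part of the patch */
    static stm_char *sketch_alloc_large(uint64_t size)
    {
        char *addr = _stm_large_malloc(size);     /* real pointer in seg 0 */
        if (addr == NULL)
            stm_fatalerror("not enough memory!");
        /* an stm_char* is an offset valid in every segment; subtracting
           the base of segment 0 turns the real pointer into that offset */
        return (stm_char *)(addr - stm_object_pages);
    }

    static char *sketch_offset_in_seg0(stm_char *p)
    {
        /* going back from the offset to a real pointer in segment 0 */
        return stm_object_pages + (uintptr_t)p;
    }
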
diff --git a/c8/stm/largemalloc.c b/c8/stm/largemalloc.c
new file mode 100644
--- /dev/null
+++ b/c8/stm/largemalloc.c
@@ -0,0 +1,623 @@
+#ifndef _STM_CORE_H_
+# error "must be compiled via stmgc.c"
+#endif
+
+/* This contains a lot of inspiration from malloc() in the GNU C Library.
+   More precisely, this is (a subset of) the part that handles large
+   blocks, which in our case means at least 288 bytes.  It is actually
+   a general allocator, although it doesn't contain any of the small-
+   or medium-block support that is also present in the GNU C Library.
+*/
+
+#define largebin_index(sz)                                      \
+    (((sz) < (48 <<  6)) ?      ((sz) >>  6):  /*  0 - 47 */    \
+     ((sz) < (24 <<  9)) ? 42 + ((sz) >>  9):  /* 48 - 65 */    \
+     ((sz) < (12 << 12)) ? 63 + ((sz) >> 12):  /* 66 - 74 */    \
+     ((sz) < (6  << 15)) ? 74 + ((sz) >> 15):  /* 75 - 79 */    \
+     ((sz) < (3  << 18)) ? 80 + ((sz) >> 18):  /* 80 - 82 */    \
+                           83)
+#define N_BINS             84
+#define LAST_BIN_INDEX(sz) ((sz) >= (3 << 18))
+
+typedef struct dlist_s {
+    struct dlist_s *next;   /* a circular doubly-linked list */
+    struct dlist_s *prev;
+} dlist_t;
+
+typedef struct ulist_s {
+    struct ulist_s *up;     /* a non-circular doubly-linked list */
+    struct ulist_s *down;
+} ulist_t;
+
+typedef struct malloc_chunk {
+    size_t prev_size;     /* - if the previous chunk is free: size of its data
+                             - otherwise, if this chunk is free: 1
+                             - otherwise, 0. */
+    size_t size;          /* size of the data in this chunk */
+
+    dlist_t d;            /* if free: a doubly-linked list 'largebins' */
+                          /* if not free: the user data starts here */
+    ulist_t u;            /* if free, if unsorted: up==UU_UNSORTED
+                             if free, if sorted: a doubly-linked list */
+
+    /* The chunk has a total size of 'size'.  It is immediately followed
+       in memory by another chunk.  This list ends with the last "chunk"
+       being actually only two words long, with END_MARKER as 'size'.
+       Both this last chunk and the theoretical chunk before the first
+       one are considered "not free". */
+} mchunk_t;
+
+#define UU_UNSORTED          ((ulist_t *) 1)
+#define THIS_CHUNK_FREE      1
+#define BOTH_CHUNKS_USED     0
+#define CHUNK_HEADER_SIZE    offsetof(struct malloc_chunk, d)
+#define END_MARKER           0xDEADBEEF
+#define MIN_ALLOC_SIZE       (sizeof(struct malloc_chunk) - CHUNK_HEADER_SIZE)
+
+#define chunk_at_offset(p, ofs)  ((mchunk_t *)(((char *)(p)) + (ofs)))
+#define data2chunk(p)            chunk_at_offset(p, -CHUNK_HEADER_SIZE)
+#define updown2chunk(p)          chunk_at_offset(p,                     \
+                                     -(CHUNK_HEADER_SIZE + sizeof(dlist_t)))
+
+static mchunk_t *next_chunk(mchunk_t *p)
+{
+    return chunk_at_offset(p, CHUNK_HEADER_SIZE + p->size);
+}
+
+
+/* The free chunks are stored in "bins".  Each bin is a doubly-linked
+   list of chunks.  There are 84 bins, with largebin_index() giving the
+   correspondence between sizes and bin indices.
+
+   Each free chunk is preceded in memory by a non-free chunk (or no
+   chunk at all).  Each free chunk is followed in memory by a non-free
+   chunk (or no chunk at all).  Chunks are consolidated with their
+   neighbors to ensure this.
+
+   In each bin's doubly-linked list, chunks are sorted by their size in
+   decreasing order (if you follow 'largebins[n].next',
+   'largebins[n].next->next', etc.).  At the end of this list are some
+   unsorted chunks.  All unsorted chunks are after all sorted chunks.
+   Unsorted chunks are distinguished by having 'u.up == UU_UNSORTED'.
+
+   Note that if the user always calls large_malloc() with a large
+   enough argument, then the few bins corresponding to smaller values
+   will never be sorted at all.  They are still populated with the
+   fragments of space between bigger allocations.
+
+   Following the 'd' linked list, we get only one chunk of every size.
+   The additional chunks of a given size are linked "vertically" in
+   the secondary 'u' doubly-linked list.
+
+
+                            +-----+
+                            | 296 |
+                            +-----+
+                              ^ |
+                              | v
+                            +-----+     +-----+
+                            | 296 |     | 288 |
+                            +-----+     +-----+
+                              ^ |         ^ |     UU_UNSORTED
+                              | v         | v          |
+   largebins    +-----+     +-----+     +-----+     +-----+     largebins
+   [4].next <-> | 304 | <-> | 296 | <-> | 288 | <-> | 296 | <-> [4].prev
+                +-----+     +-----+     +-----+     +-----+
+
+*/
+
+
+static struct {
+    int lock;
+    mchunk_t *first_chunk, *last_chunk;
+    dlist_t largebins[N_BINS];
+} lm __attribute__((aligned(64)));
+
+
+static void lm_lock(void)
+{
+    spinlock_acquire(lm.lock);
+}
+
+static void lm_unlock(void)
+{
+    spinlock_release(lm.lock);
+}
+
+
+static void insert_unsorted(mchunk_t *new)
+{
+    size_t index = LAST_BIN_INDEX(new->size) ? N_BINS - 1
+                                             : largebin_index(new->size);
+    new->d.next = &lm.largebins[index];
+    new->d.prev = lm.largebins[index].prev;
+    new->d.prev->next = &new->d;
+    new->u.up = UU_UNSORTED;
+    new->u.down = NULL;
+    lm.largebins[index].prev = &new->d;
+}
+
+static int compare_chunks(const void *vchunk1, const void *vchunk2)
+{
+    /* sort by size */
+    mchunk_t *chunk1 = *(mchunk_t *const *)vchunk1;
+    mchunk_t *chunk2 = *(mchunk_t *const *)vchunk2;
+    if (chunk1->size < chunk2->size)
+        return -1;
+    if (chunk1->size == chunk2->size)
+        return 0;
+    else
+        return +1;
+}
+
+#define MAX_STACK_COUNT  64
+
+static void really_sort_bin(size_t index)
+{
+    dlist_t *unsorted = lm.largebins[index].prev;
+    dlist_t *end = &lm.largebins[index];
+    dlist_t *scan = unsorted->prev;
+    size_t count = 1;
+    while (scan != end && data2chunk(scan)->u.up == UU_UNSORTED) {
+        scan = scan->prev;
+        ++count;
+    }
+    end->prev = scan;
+    scan->next = end;
+
+    mchunk_t *chunk1;
+    mchunk_t *chunk_array[MAX_STACK_COUNT];
+    mchunk_t **chunks = chunk_array;
+
+    if (count == 1) {
+        chunk1 = data2chunk(unsorted);   /* common case */
+        count = 0;
+    }
+    else {
+        if (count > MAX_STACK_COUNT) {
+            chunks = malloc(count * sizeof(mchunk_t *));
+            if (chunks == NULL) {
+                stm_fatalerror("out of memory");   // XXX
+            }
+        }
+        size_t i;
+        for (i = 0; i < count; i++) {
+            chunks[i] = data2chunk(unsorted);
+            unsorted = unsorted->prev;
+        }
+        assert(unsorted == scan);
+        qsort(chunks, count, sizeof(mchunk_t *), compare_chunks);
+
+        chunk1 = chunks[--count];
+    }
+    size_t search_size = chunk1->size;
+    dlist_t *head = lm.largebins[index].next;
+
+    while (1) {
+        if (head == end || data2chunk(head)->size < search_size) {
+            /* insert 'chunk1' here, before the current head */
+            head->prev->next = &chunk1->d;
+            chunk1->d.prev = head->prev;
+            head->prev = &chunk1->d;
+            chunk1->d.next = head;
+            chunk1->u.up = NULL;
+            chunk1->u.down = NULL;
+            head = &chunk1->d;
+        }
+        else if (data2chunk(head)->size == search_size) {
+            /* insert 'chunk1' vertically in the 'u' list */
+            ulist_t *uhead = &data2chunk(head)->u;
+            chunk1->u.up = uhead->up;
+            chunk1->u.down = uhead;
+            if (uhead->up != NULL)
+                uhead->up->down = &chunk1->u;
+            uhead->up = &chunk1->u;
+#ifndef NDEBUG
+            chunk1->d.next = (dlist_t *)0x42;   /* not used */
+            chunk1->d.prev = (dlist_t *)0x42;
+#endif
+        }
+        else {
+            head = head->next;
+            continue;
+        }
+        if (count == 0)
+            break;    /* all done */
+        chunk1 = chunks[--count];
+        search_size = chunk1->size;
+    }
+
+    if (chunks != chunk_array)
+        free(chunks);
+}
+
+static void sort_bin(size_t index)
+{
+    dlist_t *last = lm.largebins[index].prev;
+    if (last != &lm.largebins[index] && data2chunk(last)->u.up == UU_UNSORTED)
+        really_sort_bin(index);
+}
+
+static void unlink_chunk(mchunk_t *mscan)
+{
+    if (mscan->u.down != NULL) {
+        /* unlink mscan from the vertical list 'u' */
+        ulist_t *up   = mscan->u.up;
+        ulist_t *down = mscan->u.down;
+        down->up = up;
+        if (up != NULL) up->down = down;
+    }
+    else {
+        dlist_t *prev = mscan->d.prev;
+        dlist_t *next = mscan->d.next;
+        if (mscan->u.up == NULL || mscan->u.up == UU_UNSORTED) {
+            /* unlink mscan from the doubly-linked list 'd' */
+            next->prev = prev;
+            prev->next = next;
+        }
+        else {
+            /* relink in the 'd' list the item above me */
+            mchunk_t *above = updown2chunk(mscan->u.up);
+            next->prev = &above->d;
+            prev->next = &above->d;
+            above->d.next = next;
+            above->d.prev = prev;
+            above->u.down = NULL;
+        }
+    }
+}
+
+char *_stm_large_malloc(size_t request_size)
+{
+    /* 'request_size' should already be a multiple of the word size here */
+    assert((request_size & (sizeof(char *)-1)) == 0);
+
+    /* it can be very small, but we need to ensure a minimal size
+       (currently 32 bytes) */
+    if (request_size < MIN_ALLOC_SIZE)
+        request_size = MIN_ALLOC_SIZE;
+
+    lm_lock();
+
+    size_t index = largebin_index(request_size);
+    sort_bin(index);
+
+    /* scan through the chunks of the current bin in reverse order
+       to find the smallest one that fits. */
+    dlist_t *scan = lm.largebins[index].prev;
+    dlist_t *end = &lm.largebins[index];
+    mchunk_t *mscan;
+    while (scan != end) {
+        mscan = data2chunk(scan);
+        assert(mscan->prev_size == THIS_CHUNK_FREE);
+        assert(next_chunk(mscan)->prev_size == mscan->size);
+        assert(IMPLY(mscan->d.prev != end,
+                     data2chunk(mscan->d.prev)->size > mscan->size));
+
+        if (mscan->size >= request_size)
+            goto found;
+        scan = mscan->d.prev;
+    }
+
+    /* search now through all higher bins.  We only need to take the
+       smallest item of the first non-empty bin, as it will be large
+       enough. */
+    while (++index < N_BINS) {
+        if (lm.largebins[index].prev != &lm.largebins[index]) {
+            /* non-empty bin. */
+            sort_bin(index);
+            scan = lm.largebins[index].prev;
+            mscan = data2chunk(scan);
+            goto found;
+        }
+    }
+
+    /* not enough memory. */
+    lm_unlock();
+    return NULL;
+
+ found:
+    assert(mscan->size >= request_size);
+    assert(mscan->u.up != UU_UNSORTED);
+
+    if (mscan->u.up != NULL) {
+        /* fast path: grab the item that is just above, to avoid needing
+           to rearrange the 'd' list */
+        mchunk_t *above = updown2chunk(mscan->u.up);
+        ulist_t *two_above = above->u.up;
+        mscan->u.up = two_above;
+        if (two_above != NULL) two_above->down = &mscan->u;
+        mscan = above;
+    }
+    else {
+        unlink_chunk(mscan);
+    }
+
+    size_t remaining_size = mscan->size - request_size;
+    if (remaining_size < sizeof(struct malloc_chunk)) {
+        next_chunk(mscan)->prev_size = BOTH_CHUNKS_USED;
+        request_size = mscan->size;
+    }
+    else {
+        /* only part of the chunk is being used; reduce the size
+           of 'mscan' down to 'request_size', and create a new
+           chunk of the 'remaining_size' afterwards */
+        mchunk_t *new = chunk_at_offset(mscan, CHUNK_HEADER_SIZE +
+                                               request_size);
+        new->prev_size = THIS_CHUNK_FREE;
+        size_t remaining_data_size = remaining_size - CHUNK_HEADER_SIZE;
+        new->size = remaining_data_size;
+        next_chunk(new)->prev_size = remaining_data_size;
+        insert_unsorted(new);
+    }
+    mscan->size = request_size;
+    mscan->prev_size = BOTH_CHUNKS_USED;
+#ifndef NDEBUG
+    memset((char *)&mscan->d, 0xda, request_size);
+#endif
+
+    lm_unlock();
+
+    return (char *)&mscan->d;
+}
+
+static void _large_free(mchunk_t *chunk)
+{
+    assert((chunk->size & (sizeof(char *) - 1)) == 0);
+    assert(chunk->prev_size != THIS_CHUNK_FREE);
+
+    /* 'size' is at least MIN_ALLOC_SIZE */
+
+#ifndef NDEBUG
+    {
+        char *data = (char *)&chunk->d;
+        assert(chunk->size >= sizeof(dlist_t));
+        assert(chunk->size <= (((char *)lm.last_chunk) - data));
+        memset(data, 0xDE, chunk->size);
+    }
+#endif
+
+    /* try to merge with the following chunk in memory */
+    size_t msize = chunk->size + CHUNK_HEADER_SIZE;
+    mchunk_t *mscan = chunk_at_offset(chunk, msize);
+
+    if (mscan->prev_size == BOTH_CHUNKS_USED) {
+        assert((mscan->size & (sizeof(char *) - 1)) == 0);
+        mscan->prev_size = chunk->size;
+    }
+    else {
+        size_t fsize = mscan->size;
+        mchunk_t *fscan = chunk_at_offset(mscan, fsize + CHUNK_HEADER_SIZE);
+
+        /* unlink the following chunk */
+        unlink_chunk(mscan);
+#ifndef NDEBUG
+        mscan->prev_size = (size_t)-258;  /* 0xfffffffffffffefe */
+        mscan->size = (size_t)-515;       /* 0xfffffffffffffdfd */
+#endif
+
+        /* merge the two chunks */
+        assert(fsize == fscan->prev_size);
+        fsize += msize;
+        fscan->prev_size = fsize;
+        chunk->size = fsize;
+    }
+
+    /* try to merge with the previous chunk in memory */
+    if (chunk->prev_size == BOTH_CHUNKS_USED) {
+        chunk->prev_size = THIS_CHUNK_FREE;
+    }
+    else {
+        assert((chunk->prev_size & (sizeof(char *) - 1)) == 0);
+
+        /* get at the previous chunk */
+        msize = chunk->prev_size + CHUNK_HEADER_SIZE;
+        mscan = chunk_at_offset(chunk, -msize);
+        assert(mscan->prev_size == THIS_CHUNK_FREE);
+        assert(mscan->size == chunk->prev_size);
+
+        /* unlink the previous chunk */
+        unlink_chunk(mscan);
+
+        /* merge the two chunks */
+        mscan->size = msize + chunk->size;
+        next_chunk(mscan)->prev_size = mscan->size;
+
+        assert(chunk->prev_size = (size_t)-1);  /* intentional assignment: debug-only poison */
+        assert(chunk->size = (size_t)-1);       /* intentional assignment: debug-only poison */
+        chunk = mscan;
+    }
+
+    insert_unsorted(chunk);
+}
+
+void _stm_large_free(char *data)
+{
+    lm_lock();
+    _large_free(data2chunk(data));
+    lm_unlock();
+}
+
+
+void _stm_large_dump(void)
+{
+    lm_lock();
+    char *data = ((char *)lm.first_chunk) + 16;
+    size_t prev_size_if_free = 0;
+    fprintf(stderr, "\n");
+    while (1) {
+        assert((((uintptr_t)data) & 7) == 0);   /* alignment */
+        fprintf(stderr, "[ %p: %zu", data - 16, *(size_t*)(data - 16));
+        if (prev_size_if_free == 0) {
+            assert(*(size_t*)(data - 16) == THIS_CHUNK_FREE ||
+                   *(size_t*)(data - 16) == BOTH_CHUNKS_USED);
+            if (*(size_t*)(data - 16) == THIS_CHUNK_FREE)
+                prev_size_if_free = (*(size_t*)(data - 8));
+        }
+        else {
+            assert(*(size_t*)(data - 16) == prev_size_if_free);
+            prev_size_if_free = 0;
+        }
+        if (*(size_t*)(data - 8) == END_MARKER)
+            break;
+        if (prev_size_if_free) {
+            fprintf(stderr, "        \t(up %p / down %p)",
+                    *(void **)(data + 16), *(void **)(data + 24));
+        }
+        fprintf(stderr, "\n  %p: %zu ]", data - 8, *(size_t*)(data - 8));
+        if (prev_size_if_free) {
+            fprintf(stderr, "\t(prev %p <-> next %p)\n",
+                    *(void **)(data + 8), *(void **)data);
+        }
+        else {
+            fprintf(stderr, "\n");
+        }
+        assert(*(ssize_t*)(data - 8) >= 16);
+        data += *(size_t*)(data - 8);
+        data += 16;
+    }
+    fprintf(stderr, "\n  %p: end. ]\n\n", data - 8);
+    assert(data - 16 == (char *)lm.last_chunk);
+    lm_unlock();
+}
+
+char *_stm_largemalloc_data_start(void)
+{
+    return (char *)lm.first_chunk;
+}
+
+#ifdef STM_LARGEMALLOC_TEST
+bool (*_stm_largemalloc_keep)(char *data);   /* a hook for tests */
+#endif
+
+void _stm_largemalloc_init_arena(char *data_start, size_t data_size)
+{
+    int i;
+    for (i = 0; i < N_BINS; i++) {
+        lm.largebins[i].prev = &lm.largebins[i];
+        lm.largebins[i].next = &lm.largebins[i];
+    }
+
+    assert(data_size >= 2 * sizeof(struct malloc_chunk));
+    assert((data_size & 31) == 0);
+    lm.first_chunk = (mchunk_t *)data_start;
+    lm.first_chunk->prev_size = THIS_CHUNK_FREE;
+    lm.first_chunk->size = data_size - 2 * CHUNK_HEADER_SIZE;
+    lm.last_chunk = chunk_at_offset(lm.first_chunk,
+                                    data_size - CHUNK_HEADER_SIZE);
+    lm.last_chunk->prev_size = lm.first_chunk->size;
+    lm.last_chunk->size = END_MARKER;
+    assert(lm.last_chunk == next_chunk(lm.first_chunk));
+    lm.lock = 0;
+
+    insert_unsorted(lm.first_chunk);
+
+#ifdef STM_LARGEMALLOC_TEST
+    _stm_largemalloc_keep = NULL;
+#endif
+}
+
+int _stm_largemalloc_resize_arena(size_t new_size)
+{
+    int result = 0;
+    lm_lock();
+
+    if (new_size < 2 * sizeof(struct malloc_chunk))
+        goto fail;
+    OPT_ASSERT((new_size & 31) == 0);
+
+    new_size -= CHUNK_HEADER_SIZE;
+    mchunk_t *new_last_chunk = chunk_at_offset(lm.first_chunk, new_size);
+    mchunk_t *old_last_chunk = lm.last_chunk;
+    size_t old_size = ((char *)old_last_chunk) - (char *)lm.first_chunk;
+
+    if (new_size < old_size) {
+        /* check if there is enough free space at the end to allow
+           such a reduction */
+        size_t lsize = lm.last_chunk->prev_size;
+        assert(lsize != THIS_CHUNK_FREE);
+        if (lsize == BOTH_CHUNKS_USED)
+            goto fail;
+        lsize += CHUNK_HEADER_SIZE;
+        mchunk_t *prev_chunk = chunk_at_offset(lm.last_chunk, -lsize);
+        if (((char *)new_last_chunk) < ((char *)prev_chunk) +
+                                       sizeof(struct malloc_chunk))
+            goto fail;
+
+        /* unlink the prev_chunk from the doubly-linked list */
+        unlink_chunk(prev_chunk);
+
+        /* reduce the prev_chunk */
+        assert(prev_chunk->size == lm.last_chunk->prev_size);
+        prev_chunk->size = ((char*)new_last_chunk) - (char *)prev_chunk
+                           - CHUNK_HEADER_SIZE;
+
+        /* make a fresh-new last chunk */
+        new_last_chunk->prev_size = prev_chunk->size;
+        new_last_chunk->size = END_MARKER;
+        lm.last_chunk = new_last_chunk;
+        assert(lm.last_chunk == next_chunk(prev_chunk));
+
+        insert_unsorted(prev_chunk);
+    }
+    else if (new_size > old_size) {
+        /* make the new last chunk first, with only the extra size */
+        mchunk_t *old_last_chunk = lm.last_chunk;
+        old_last_chunk->size = (new_size - old_size) - CHUNK_HEADER_SIZE;
+        new_last_chunk->prev_size = BOTH_CHUNKS_USED;
+        new_last_chunk->size = END_MARKER;
+        lm.last_chunk = new_last_chunk;
+        assert(lm.last_chunk == next_chunk(old_last_chunk));
+
+        /* then free the last_chunk (turn it from "used" to "free") */
+        _large_free(old_last_chunk);
+    }
+
+    result = 1;
+ fail:
+    lm_unlock();
+    return result;
+}
+
+
+static inline bool _largemalloc_sweep_keep(mchunk_t *chunk)
+{
+#ifdef STM_LARGEMALLOC_TEST
+    if (_stm_largemalloc_keep != NULL)
+        return _stm_largemalloc_keep((char *)&chunk->d);
+#endif
+    return true;
+    //XXX:    return largemalloc_keep_object_at((char *)&chunk->d);
+}
+
+void _stm_largemalloc_sweep(void)
+{
+    lm_lock();
+
+    /* This could be slightly optimized by inlining _large_free() and
+       special-casing, e.g. we might already know whether the previous
+       block was free or not.  It's probably not really worth it. */
+    mchunk_t *mnext, *chunk = lm.first_chunk;
+
+    if (chunk->prev_size == THIS_CHUNK_FREE)
+        chunk = next_chunk(chunk);   /* go to the first non-free chunk */
+
+    while (chunk != lm.last_chunk) {
+        /* here, the chunk we're pointing to is not free */
+        assert(chunk->prev_size != THIS_CHUNK_FREE);
+
+        /* first figure out the next non-free chunk */
+        mnext = next_chunk(chunk);
+        if (mnext->prev_size == THIS_CHUNK_FREE)
+            mnext = next_chunk(mnext);
+
+        /* use the callback to know if 'chunk' contains an object that
+           survives or dies */
+        if (!_largemalloc_sweep_keep(chunk)) {
+            _large_free(chunk);     /* dies */
+        }
+        chunk = mnext;
+    }
+
+    lm_unlock();
+}
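
The bin layout described in the big comment above is easy to sanity-check in
isolation.  The following is a standalone, illustrative program (not part of
the patch): it copies the largebin_index() macro verbatim and recomputes a
few of the values that the ASCII diagram and test_largemalloc.py rely on,
including the 16-byte chunk header that explains the "+7016" and "+616"
offsets seen in test_simple.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    #define largebin_index(sz)                                      \
        (((sz) < (48 <<  6)) ?      ((sz) >>  6):                   \
         ((sz) < (24 <<  9)) ? 42 + ((sz) >>  9):                   \
         ((sz) < (12 << 12)) ? 63 + ((sz) >> 12):                   \
         ((sz) < (6  << 15)) ? 74 + ((sz) >> 15):                   \
         ((sz) < (3  << 18)) ? 80 + ((sz) >> 18):                   \
                               83)

    int main(void)
    {
        /* the sizes from the diagram (288, 296, 304) all map to bin 4 */
        assert(largebin_index(288) == 4);
        assert(largebin_index(296) == 4);
        assert(largebin_index(304) == 4);
        /* 48 << 6 == 3072 is the first size of the second range: bin 48 */
        assert(largebin_index(3072) == 48);
        /* everything >= 3 << 18 ends up in the last bin, 83 */
        assert(largebin_index(1 << 20) == 83);

        /* each block of user data is preceded by prev_size and size,
           i.e. 2 * sizeof(size_t) == 16 bytes on 64-bit; hence two
           consecutive 7000-byte allocations are 7016 bytes apart */
        printf("chunk header overhead: %zu bytes\n", 2 * sizeof(size_t));
        return 0;
    }
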
diff --git a/c8/stm/largemalloc.h b/c8/stm/largemalloc.h
new file mode 100644
--- /dev/null
+++ b/c8/stm/largemalloc.h
@@ -0,0 +1,18 @@
+
+/* all addresses passed to this interface should be "char *" pointers
+   in segment 0. */
+void _stm_largemalloc_init_arena(char *data_start, size_t data_size);
+int _stm_largemalloc_resize_arena(size_t new_size);
+char *_stm_largemalloc_data_start(void);
+
+/* large_malloc() and large_free() are not thread-safe.  They are
+   meant to be called mostly during minor or major collections, which
+   have their own synchronization mechanisms. */
+char *_stm_large_malloc(size_t request_size);
+void _stm_large_free(char *data);
+void _stm_largemalloc_sweep(void);
+
+void _stm_large_dump(void);
+
+
+#define LARGE_MALLOC_OVERHEAD   (2 * sizeof(size_t))   /* estimate */
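
For reference, a hypothetical C-side usage of this interface, mirroring what
test_largemalloc.py does through cffi.  It assumes linking against stmgc.c;
the arena here is just an anonymous mapping instead of the real segment-0
area set up in gcpage.c:

    #include <sys/mman.h>

    static void largemalloc_smoke_test(void)
    {
        size_t arena_size = 1024 * 1024;          /* 1 MB, like the tests */
        char *arena = mmap(NULL, arena_size, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        /* error handling (MAP_FAILED) omitted for brevity */

        _stm_largemalloc_init_arena(arena, arena_size);

        char *d1 = _stm_large_malloc(7000);
        char *d2 = _stm_large_malloc(8000);       /* == d1 + 7016 */

        _stm_large_free(d1);
        _stm_large_free(d2);                      /* adjacent free chunks merge */

        _stm_large_dump();                        /* prints the chunk list */
        munmap(arena, arena_size);
    }
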
diff --git a/c8/stm/setup.c b/c8/stm/setup.c
--- a/c8/stm/setup.c
+++ b/c8/stm/setup.c
@@ -31,6 +31,11 @@
                  (NB_READMARKER_PAGES + NB_NURSERY_PAGES) * 4096,
                  PROT_READ | PROT_WRITE);
     }
+
+    /* make the sharing segment writable for the memory allocator: */
+    mprotect(stm_object_pages + END_NURSERY_PAGE * 4096UL,
+             (NB_PAGES - END_NURSERY_PAGE) * 4096UL,
+             PROT_READ | PROT_WRITE);
 }
 
 
diff --git a/c8/stm/smallmalloc.c b/c8/stm/smallmalloc.c
--- a/c8/stm/smallmalloc.c
+++ b/c8/stm/smallmalloc.c
@@ -67,23 +67,17 @@
         /* if (!_stm_largemalloc_resize_arena(uninitialized_page_stop - base)) */
         /*     goto out_of_memory; */
 
-        /* lock acquiring not necessary because the affected pages don't
-           need privatization protection. (but there is an assert right
-           now to enforce that XXXXXX) */
-        acquire_all_privatization_locks();
+        /* mark the pages accessible in the sharing segment */
+        setup_N_pages(uninitialized_page_stop, GCPAGE_NUM_PAGES);
 
         char *p = uninitialized_page_stop;
         long i;
         for (i = 0; i < GCPAGE_NUM_PAGES; i++) {
-            /* accessible in seg0: */
-            page_mark_accessible(0, (p - stm_object_pages) / 4096UL);
-
             /* add to free_uniform_pages list */
             ((struct small_free_loc_s *)p)->nextpage = free_uniform_pages;
             free_uniform_pages = (struct small_free_loc_s *)p;
             p += 4096;
         }
-        release_all_privatization_locks();
     }
 
     spinlock_release(gmfp_lock);
@@ -128,17 +122,6 @@
                                                    smallpage->nextpage)))
             goto retry;
 
-
-
-        /* lock acquiring not necessary because the affected pages don't
-           need privatization protection. (but there is an assert right
-           now to enforce that XXXXXX) */
-        acquire_all_privatization_locks();
-        /* make page accessible in our segment too: */
-        page_mark_accessible(STM_SEGMENT->segment_num,
-                             ((char*)smallpage - stm_object_pages) / 4096UL);
-        release_all_privatization_locks();
-
         /* Succeeded: we have a page in 'smallpage', which is not
            initialized so far, apart from the 'nextpage' field read
            above.  Initialize it.
@@ -315,6 +298,7 @@
 
 void _stm_smallmalloc_sweep(void)
 {
+    acquire_all_privatization_locks(); /* should be done outside, but tests... */
     long i, szword;
     for (szword = 2; szword < GC_N_SMALL_REQUESTS; szword++) {
         struct small_free_loc_s *page = small_page_lists[szword];
@@ -362,4 +346,5 @@
             sweep_small_page(pageptr, NULL, sz);
         }
     }
+    release_all_privatization_locks();
 }
diff --git a/c8/stmgc.c b/c8/stmgc.c
--- a/c8/stmgc.c
+++ b/c8/stmgc.c
@@ -6,6 +6,7 @@
 #include "stm/core.h"
 #include "stm/pagecopy.h"
 #include "stm/pages.h"
+#include "stm/largemalloc.h"
 #include "stm/gcpage.h"
 #include "stm/sync.h"
 #include "stm/setup.h"
@@ -20,6 +21,7 @@
 #include "stm/pagecopy.c"
 #include "stm/pages.c"
 #include "stm/prebuilt.c"
+#include "stm/largemalloc.c"
 #include "stm/gcpage.c"
 #include "stm/nursery.c"
 #include "stm/sync.c"
diff --git a/c8/stmgc.h b/c8/stmgc.h
--- a/c8/stmgc.h
+++ b/c8/stmgc.h
@@ -79,6 +79,16 @@
 void _stm_test_switch_segment(int segnum);
 void _push_obj_to_other_segments(object_t *obj);
 
+void _stm_largemalloc_init_arena(char *data_start, size_t data_size);
+int _stm_largemalloc_resize_arena(size_t new_size);
+char *_stm_largemalloc_data_start(void);
+char *_stm_large_malloc(size_t request_size);
+void _stm_large_free(char *data);
+void _stm_large_dump(void);
+bool (*_stm_largemalloc_keep)(char *data);
+void _stm_largemalloc_sweep(void);
+
+
 char *stm_object_pages;
 char *stm_file_pages;
 object_t *_stm_allocate_old_small(ssize_t size_rounded_up);
diff --git a/c8/test/support.py b/c8/test/support.py
--- a/c8/test/support.py
+++ b/c8/test/support.py
@@ -81,6 +81,15 @@
 void stm_collect(long level);
 
 void _stm_set_nursery_free_count(uint64_t free_count);
+void _stm_largemalloc_init_arena(char *data_start, size_t data_size);
+int _stm_largemalloc_resize_arena(size_t new_size);
+char *_stm_largemalloc_data_start(void);
+char *_stm_large_malloc(size_t request_size);
+void _stm_large_free(char *data);
+void _stm_large_dump(void);
+bool (*_stm_largemalloc_keep)(char *data);
+void _stm_largemalloc_sweep(void);
+
 
 long stm_identityhash(object_t *obj);
 long stm_id(object_t *obj);
diff --git a/c8/test/test_largemalloc.py b/c8/test/test_largemalloc.py
new file mode 100644
--- /dev/null
+++ b/c8/test/test_largemalloc.py
@@ -0,0 +1,181 @@
+from support import *
+import sys, random
+
+ra = lambda x: x   # backward compat.
+
+class TestLargeMalloc(BaseTest):
+    def setup_method(self, meth):
+        # initialize some big heap in stm_setup()
+        BaseTest.setup_method(self, meth)
+
+        # now re-initialize the heap to 1MB with 0xcd in it
+        self.size = 1024 * 1024     # 1MB
+        self.rawmem = lib._stm_largemalloc_data_start()
+
+        lib.memset(self.rawmem, 0xcd, self.size)
+        lib._stm_largemalloc_init_arena(self.rawmem, self.size)
+
+    def test_simple(self):
+        #
+        lib._stm_large_dump()
+        d1 = lib._stm_large_malloc(7000)
+        lib._stm_large_dump()
+        d2 = lib._stm_large_malloc(8000)
+        print d1
+        print d2
+        assert ra(d2) - ra(d1) == 7016
+        d3 = lib._stm_large_malloc(9000)
+        assert ra(d3) - ra(d2) == 8016
+        #
+        lib._stm_large_free(d1)
+        lib._stm_large_free(d2)
+        #
+        d4 = lib._stm_large_malloc(600)
+        assert d4 == d1
+        d5 = lib._stm_large_malloc(600)
+        assert ra(d5) == ra(d4) + 616
+        #
+        lib._stm_large_free(d5)
+        #
+        d6 = lib._stm_large_malloc(600)
+        assert d6 == d5
+        #
+        lib._stm_large_free(d4)
+        #
+        d7 = lib._stm_large_malloc(608)
+        assert ra(d7) == ra(d6) + 616
+        d8 = lib._stm_large_malloc(600)
+        assert d8 == d4
+        #
+        lib._stm_large_dump()
+
+    def test_overflow_1(self):
+        d = lib._stm_large_malloc(self.size - 32)
+        assert ra(d) == self.rawmem + 16
+        lib._stm_large_dump()
+
+    def test_overflow_2(self):
+        d = lib._stm_large_malloc(self.size - 16)
+        assert d == ffi.NULL
+        lib._stm_large_dump()
+
+    def test_overflow_3(self):
+        d = lib._stm_large_malloc(sys.maxint & ~7)
+        assert d == ffi.NULL
+        lib._stm_large_dump()
+
+    def test_resize_arena_reduce_1(self):
+        r = lib._stm_largemalloc_resize_arena(self.size - 32)
+        assert r == 1
+        d = lib._stm_large_malloc(self.size - 32)
+        assert d == ffi.NULL
+        lib._stm_large_dump()
+
+    def test_resize_arena_reduce_2(self):
+        lib._stm_large_malloc(self.size // 2 - 80)
+        r = lib._stm_largemalloc_resize_arena(self.size // 2)
+        assert r == 1
+        lib._stm_large_dump()
+
+    def test_resize_arena_reduce_3(self):
+        d1 = lib._stm_large_malloc(128)
+        r = lib._stm_largemalloc_resize_arena(self.size // 2)
+        assert r == 1
+        d2 = lib._stm_large_malloc(128)
+        assert ra(d1) == self.rawmem + 16
+        assert ra(d2) == ra(d1) + 128 + 16
+        lib._stm_large_dump()
+
+    def test_resize_arena_cannot_reduce_1(self):
+        lib._stm_large_malloc(self.size // 2)
+        r = lib._stm_largemalloc_resize_arena(self.size // 2)
+        assert r == 0
+        lib._stm_large_dump()
+
+    def test_resize_arena_cannot_reduce_2(self):
+        lib._stm_large_malloc(self.size // 2 - 56)
+        r = lib._stm_largemalloc_resize_arena(self.size // 2)
+        assert r == 0
+        lib._stm_large_dump()
+
+    def test_random(self):
+        r = random.Random(1007)
+        p = []
+        for i in range(100000):
+            if len(p) != 0 and (len(p) > 100 or r.randrange(0, 5) < 2):
+                index = r.randrange(0, len(p))
+                d, length, content1, content2 = p.pop(index)
+                print ' free %5d  (%s)' % (length, d)
+                assert ra(d)[0] == content1
+                assert ra(d)[length - 1] == content2
+                lib._stm_large_free(d)
+            else:
+                sz = r.randrange(8, 160) * 8
+                d = lib._stm_large_malloc(sz)
+                print 'alloc %5d  (%s)' % (sz, d)
+                assert d != ffi.NULL
+                lib.memset(ra(d), 0xdd, sz)
+                content1 = chr(r.randrange(0, 256))
+                content2 = chr(r.randrange(0, 256))
+                ra(d)[0] = content1
+                ra(d)[sz - 1] = content2
+                p.append((d, sz, content1, content2))
+        lib._stm_large_dump()
+
+    def test_random_largemalloc_sweep(self, constrained_size_range=False):
+        @ffi.callback("bool(char *)")
+        def keep(data):
+            try:
+                if data in from_before:
+                    return False
+                index = all.index(data)
+                seen_for.add(index)
+                return index in keep_me
+            except Exception, e:
+                errors.append(e)
+                raise
+        lib._stm_largemalloc_keep = keep
+        errors = []
+        from_before = set()
+
+        r = random.Random(1000)
+        for j in range(500):
+            if constrained_size_range:
+                max = 120
+            else:
+                max = 500
+            sizes = [r.choice(range(104, max, 8)) for i in range(20)]
+            all = [lib._stm_large_malloc(size) for size in sizes]
+            print all
+
+            for i in range(len(all)):
+                all[i][50] = chr(65 + i)
+            all_orig = all[:]
+
+            keep_me = set()
+            for i in range(len(all)):
+                if r.random() < 0.5:
+                    print 'free:', all[i]
+                    lib._stm_large_free(all[i])
+                    all[i] = None
+                elif r.random() < 0.5:
+                    keep_me.add(i)
+
+            seen_for = set()
+            lib._stm_largemalloc_sweep()
+            if errors:
+                raise errors[0]
+            assert seen_for == set([i for i in range(len(all))
+                                      if all[i] is not None])
+            lib._stm_large_dump()
+
+            from_before = [all[i] for i in keep_me]
+
+            for i in range(len(all)):
+                if i in keep_me:
+                    assert all[i][50] == chr(65 + i)
+                else:
+                    assert all_orig[i][50] == '\xDE'
+
+    def test_random_largemalloc_sweep_constrained_size_range(self):
+        self.test_random_largemalloc_sweep(constrained_size_range=True)
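
The sweep test above drives the C-side hook through cffi.  On the C side the
same pattern looks roughly like the sketch below; it is only meaningful when
stmgc.c is compiled with STM_LARGEMALLOC_TEST (which is what defines the
_stm_largemalloc_keep hook), and keep_every_other() is a made-up predicate
used purely for illustration:

    #include <stdbool.h>

    static bool keep_every_other(char *data)
    {
        static int counter = 0;
        /* return true to keep the object at 'data', false to let the
           sweep free its chunk */
        return (counter++ % 2) == 0;
    }

    static void sweep_with_hook(void)
    {
        _stm_largemalloc_keep = keep_every_other;   /* install the test hook */
        _stm_largemalloc_sweep();                   /* frees rejected chunks */
        _stm_largemalloc_keep = NULL;
    }
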

