[pypy-commit] pypy stm: Import from arigo/hack/stm/c.

arigo noreply at buildbot.pypy.org
Tue Sep 27 13:29:49 CEST 2011


Author: Armin Rigo <arigo at tunes.org>
Branch: stm
Changeset: r47621:29beb9a43433
Date: 2011-09-27 13:28 +0200
http://bitbucket.org/pypy/pypy/changeset/29beb9a43433/

Log:	Import from arigo/hack/stm/c.

diff --git a/pypy/translator/stm/__init__.py b/pypy/translator/stm/__init__.py
new file mode 100644
diff --git a/pypy/translator/stm/_rffi_stm.py b/pypy/translator/stm/_rffi_stm.py
new file mode 100644
--- /dev/null
+++ b/pypy/translator/stm/_rffi_stm.py
@@ -0,0 +1,40 @@
+import py
+import os
+from pypy.tool.autopath import pypydir
+from pypy.rpython.lltypesystem import lltype, rffi
+from pypy.translator.tool.cbuild import ExternalCompilationInfo
+
+
+cdir = py.path.local(pypydir) / 'translator' / 'stm'
+
+
+eci = ExternalCompilationInfo(
+    include_dirs = [cdir],
+    includes = ['src_stm/et.h'],
+    post_include_bits = [
+        r'''#define stm_begin_transaction_inline()  ; \
+              jmp_buf _jmpbuf;                   \
+              setjmp(_jmpbuf);                   \
+              stm_begin_transaction(&_jmpbuf);
+        '''],
+    separate_module_sources = ['#include "src_stm/et.c"\n'],
+)
+
+def llexternal(name, args, result, **kwds):
+    return rffi.llexternal(name, args, result, compilation_info=eci,
+                           _nowrapper=True, **kwds)
+
+
+descriptor_init = llexternal('stm_descriptor_init', [], lltype.Void)
+descriptor_done = llexternal('stm_descriptor_done', [], lltype.Void)
+
+begin_transaction = llexternal('stm_begin_transaction_inline', [], lltype.Void)
+commit_transaction = llexternal('stm_commit_transaction', [], lltype.Signed)
+
+read_word = llexternal('stm_read_word', [rffi.VOIDPP], rffi.VOIDP)
+write_word = llexternal('stm_write_word', [rffi.VOIDPP, rffi.VOIDP],
+                        lltype.Void)
+
+CALLBACK = lltype.Ptr(lltype.FuncType([rffi.VOIDP], rffi.VOIDP))
+perform_transaction = llexternal('stm_perform_transaction',
+                                 [CALLBACK, rffi.VOIDP], rffi.VOIDP)
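
The post_include_bits above turn stm_begin_transaction_inline() into a macro rather than a function, so that setjmp() executes in the caller's own stack frame and an aborting transaction can longjmp back to restart the body.  A minimal C sketch of the same pattern written out by hand (illustration only, not part of this commit; example_transaction and its arguments are made-up names):

    #include <setjmp.h>
    #include "src_stm/et.h"

    static void example_transaction(void **slot, void *newval)
    {
        jmp_buf _jmpbuf;                  /* lives in this frame          */
        setjmp(_jmpbuf);                  /* aborts longjmp back to here  */
        stm_begin_transaction(&_jmpbuf);  /* register the buffer          */
        stm_write_word(slot, newval);     /* buffered in the redo log     */
        stm_commit_transaction();         /* replays the log on success   */
    }
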
diff --git a/pypy/translator/stm/src_stm/atomic_ops.h b/pypy/translator/stm/src_stm/atomic_ops.h
new file mode 100644
--- /dev/null
+++ b/pypy/translator/stm/src_stm/atomic_ops.h
@@ -0,0 +1,45 @@
+
+
+/* "compiler fence" for preventing reordering of loads/stores to
+   non-volatiles */
+#define CFENCE          asm volatile ("":::"memory")
+
+
+#ifdef __llvm__
+#  define HAS_SYNC_BOOL_COMPARE_AND_SWAP
+#endif
+
+#ifdef __GNUC__
+#  if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+#    define HAS_SYNC_BOOL_COMPARE_AND_SWAP
+#  endif
+#endif
+
+
+#ifdef HAS_SYNC_BOOL_COMPARE_AND_SWAP
+#  define bool_cas __sync_bool_compare_and_swap
+#else
+/* x86 (32 bits and 64 bits) */
+static inline _Bool
+bool_cas(volatile unsigned long* ptr, unsigned long old, unsigned long _new)
+{
+    unsigned long prev;
+    asm volatile("lock;"
+#if defined(__amd64__)
+                 "cmpxchgq %1, %2;"
+#else
+                 "cmpxchgl %1, %2;"
+#endif
+                 : "=a"(prev)
+                 : "q"(_new), "m"(*ptr), "a"(old)
+                 : "memory");
+    return prev == old;
+}
+/* end */
+#endif
+
+
+static inline void spinloop(void)
+{
+  asm volatile ("pause");
+}
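
As a rough illustration of how the two primitives above are meant to be combined (a sketch, not part of this commit; example_lock and the helper names are invented), a word-sized test-and-set spinlock looks like:

    #include "src_stm/atomic_ops.h"

    static volatile unsigned long example_lock = 0;

    static void example_acquire(void)
    {
        while (!bool_cas(&example_lock, 0, 1))
            spinloop();            /* 'pause' hint while waiting         */
    }

    static void example_release(void)
    {
        CFENCE;                    /* keep earlier stores before unlock  */
        example_lock = 0;
    }
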
diff --git a/pypy/translator/stm/src_stm/et.c b/pypy/translator/stm/src_stm/et.c
new file mode 100644
--- /dev/null
+++ b/pypy/translator/stm/src_stm/et.c
@@ -0,0 +1,745 @@
+/* -*- c-basic-offset: 2 -*- */
+
+/* XXX assumes that time never wraps around (in a 'long'), which may be
+ * correct on 64-bit machines but not on 32-bit machines if the process
+ * runs for long enough.
+ *
+ * XXX measure the overhead of the global_timestamp
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#define USE_PTHREAD_MUTEX    /* optional */
+#ifdef USE_PTHREAD_MUTEX
+# include <pthread.h>
+#endif
+
+#include "src_stm/et.h"
+#include "src_stm/atomic_ops.h"
+
+/************************************************************/
+
+#define IS_LOCKED(num)  ((num) < 0)
+#define IS_LOCKED_OR_NEWER(num, max_age) \
+    (((unsigned long)(num)) > ((unsigned long)(max_age)))
+typedef long owner_version_t;
+
+typedef struct {
+  owner_version_t v;   // the current version number
+} orec_t;
+
+/*** Specify the number of orecs in the global array. */
+#define NUM_STRIPES  1048576
+
+/*** declare the table of orecs */
+static char orecs[NUM_STRIPES * sizeof(orec_t)];
+
+/*** map addresses to orec table entries */
+inline static volatile orec_t* get_orec(void* addr)
+{
+  unsigned long index = (unsigned long)addr;
+  char *p = orecs + (index & ((NUM_STRIPES-1) * sizeof(orec_t)));
+  return (volatile orec_t *)p;
+}
+
+#include "src_stm/lists.c"
+
+/************************************************************/
+
+/* Uncomment the line below to try this extra code.  It doesn't work reliably so far. */
+/*#define COMMIT_OTHER_INEV*/
+
+#define ABORT_REASONS 7
+#define SPINLOOP_REASONS 10
+#define OTHERINEV_REASONS 5
+
+struct tx_descriptor {
+  jmp_buf *setjmp_buf;
+  owner_version_t start_time;
+  owner_version_t end_time;
+  unsigned long last_known_global_timestamp;
+  struct OrecList reads;
+  unsigned num_commits;
+  unsigned num_aborts[ABORT_REASONS];
+  unsigned num_spinloops[SPINLOOP_REASONS];
+#ifdef COMMIT_OTHER_INEV
+  unsigned num_otherinev[OTHERINEV_REASONS];
+#endif
+  unsigned int spinloop_counter;
+  owner_version_t my_lock_word;
+  struct RedoLog redolog;   /* last item, because it's the biggest one */
+};
+
+/* global_timestamp contains in its lowest bit a flag equal to 1
+   if there is an inevitable transaction running */
+static volatile unsigned long global_timestamp = 2;
+static __thread struct tx_descriptor *thread_descriptor;
+#ifdef COMMIT_OTHER_INEV
+static struct tx_descriptor *volatile thread_descriptor_inev;
+static volatile unsigned long d_inev_checking = 0;
+#endif
+
+/************************************************************/
+
+static unsigned long get_global_timestamp(struct tx_descriptor *d)
+{
+  return (d->last_known_global_timestamp = global_timestamp);
+}
+
+static _Bool change_global_timestamp(struct tx_descriptor *d,
+                                     unsigned long old,
+                                     unsigned long new)
+{
+  if (bool_cas(&global_timestamp, old, new))
+    {
+      d->last_known_global_timestamp = new;
+      return 1;
+    }
+  return 0;
+}
+
+static void set_global_timestamp(struct tx_descriptor *d, unsigned long new)
+{
+  global_timestamp = new;
+  d->last_known_global_timestamp = new;
+}
+
+static void tx_abort(int);
+
+static void tx_spinloop(int num)
+{
+  unsigned int c;
+  int i;
+  struct tx_descriptor *d = thread_descriptor;
+  d->num_spinloops[num]++;
+
+  //printf("tx_spinloop(%d)\n", num);
+  
+  c = d->spinloop_counter;
+  d->spinloop_counter = c * 9;
+  i = c & 0xff0000;
+  while (i >= 0) {
+    spinloop();
+    i -= 0x10000;
+  }
+}
+
+static _Bool is_inevitable(struct tx_descriptor *d)
+{
+  return d->setjmp_buf == NULL;
+}
+
+/*** run the redo log to commit a transaction, and release the locks */
+static void tx_redo(struct tx_descriptor *d)
+{
+  owner_version_t newver = d->end_time;
+  wlog_t *item;
+  /* loop in "forward" order: in this order, if there are duplicate orecs
+     then only the last one has p != -1. */
+  REDOLOG_LOOP_FORWARD(d->redolog, item)
+    {
+      *item->addr = item->val;
+      /* but we must only unlock the orec if it's the last time it
+         appears in the redolog list.  If it's not, then p == -1. */
+      if (item->p != -1)
+        {
+          volatile orec_t* o = get_orec(item->addr);
+          CFENCE;
+          o->v = newver;
+        }
+    } REDOLOG_LOOP_END;
+}
+
+/*** on abort, release locks and restore the old version number. */
+static void releaseAndRevertLocks(struct tx_descriptor *d)
+{
+  wlog_t *item;
+  REDOLOG_LOOP_FORWARD(d->redolog, item)
+    {
+      if (item->p != -1)
+        {
+          volatile orec_t* o = get_orec(item->addr);
+          o->v = item->p;
+        }
+    } REDOLOG_LOOP_END;
+}
+
+/*** release locks and restore the old version number, ready to retry later */
+static void releaseLocksForRetry(struct tx_descriptor *d)
+{
+  wlog_t *item;
+  REDOLOG_LOOP_FORWARD(d->redolog, item)
+    {
+      if (item->p != -1)
+        {
+          volatile orec_t* o = get_orec(item->addr);
+          o->v = item->p;
+          item->p = -1;
+        }
+    } REDOLOG_LOOP_END;
+}
+
+/*** lock all locations */
+static void acquireLocks(struct tx_descriptor *d)
+{
+  wlog_t *item;
+  // try to lock every location in the write set
+  REDOLOG_LOOP_BACKWARD(d->redolog, item)
+    {
+      // get orec, read its version#
+      volatile orec_t* o = get_orec(item->addr);
+      owner_version_t ovt;
+
+    retry:
+      ovt = o->v;
+
+      // if orec not locked, lock it
+      //
+      // NB: if ovt > start time, we may introduce inconsistent
+      // reads.  Since most writes are also reads, we'll just abort under this
+      // condition.  This can introduce false conflicts
+      if (!IS_LOCKED_OR_NEWER(ovt, d->start_time)) {
+        if (!bool_cas(&o->v, ovt, d->my_lock_word))
+          goto retry;
+        // save old version to item->p.  Now we hold the lock.
+        // in case of duplicate orecs, only the last one has p != -1.
+        item->p = ovt;
+      }
+      // else if the location is too recent...
+      else if (!IS_LOCKED(ovt))
+        tx_abort(0);
+      // else it is locked: if we don't hold the lock...
+      else if (ovt != d->my_lock_word) {
+        // we can either abort or spinloop.  Because we are at the end of
+        // the transaction we might try to spinloop, even though after the
+        // lock is released the ovt will be very recent, possibly
+        // > d->start_time.  It is necessary to spinloop in case we are
+        // inevitable, so use that as the criterion.  Another solution to avoid
+        // deadlocks would be to sort the order in which we take the locks.
+        if (is_inevitable(d))
+          tx_spinloop(8);
+        else
+          tx_abort(6);
+        goto retry;
+      }
+    } REDOLOG_LOOP_END;
+}
+
+static void common_cleanup(struct tx_descriptor *d)
+{
+  d->reads.size = 0;
+  redolog_clear(&d->redolog);
+}
+
+static void tx_cleanup(struct tx_descriptor *d)
+{
+  // release the locks and restore version numbers
+  releaseAndRevertLocks(d);
+  // reset all lists
+  common_cleanup(d);
+}
+
+static void tx_restart(struct tx_descriptor *d)
+{
+  tx_cleanup(d);
+  tx_spinloop(0);
+  longjmp(*d->setjmp_buf, 1);
+}
+
+/*** increase the abort count and restart the transaction */
+static void tx_abort(int reason)
+{
+  struct tx_descriptor *d = thread_descriptor;
+  assert(!is_inevitable(d));
+  d->num_aborts[reason]++;
+  tx_restart(d);
+}
+
+/**
+ * fast-path validation, assuming that I don't hold locks.
+ */
+static void validate_fast(struct tx_descriptor *d, int lognum)
+{
+  int i;
+  owner_version_t ovt;
+  assert(!is_inevitable(d));
+  for (i=0; i<d->reads.size; i++)
+    {
+    retry:
+      ovt = d->reads.items[i]->v;
+      if (IS_LOCKED_OR_NEWER(ovt, d->start_time))
+        {
+          // If locked, we wait until it becomes unlocked.  The chances are
+          // that it will then have a very recent start_time, likely
+          // > d->start_time, but it might still be better than always aborting
+          if (IS_LOCKED(ovt))
+            {
+              tx_spinloop(lognum);  /* tx_spinloop(1), tx_spinloop(2),
+                                       tx_spinloop(3) */
+              goto retry;
+            }
+          else
+            // abort if the timestamp is newer than my start time.  
+            tx_abort(lognum);  /* tx_abort(1), tx_abort(2), tx_abort(3) */
+        }
+    }
+}
+
+/**
+ * validate the read set by making sure that all orecs that we've read have
+ * timestamps at least as old as our start time, unless we locked those orecs.
+ */
+static void validate(struct tx_descriptor *d)
+{
+  int i;
+  owner_version_t ovt;
+  assert(!is_inevitable(d));
+  for (i=0; i<d->reads.size; i++)
+    {
+      ovt = d->reads.items[i]->v;      // read this orec
+      if (IS_LOCKED_OR_NEWER(ovt, d->start_time))
+        {
+          if (!IS_LOCKED(ovt))
+            // if unlocked and newer than start time, abort
+            tx_abort(4);
+          else
+            {
+              // if locked and not by me, abort
+              if (ovt != d->my_lock_word)
+                tx_abort(5);
+            }
+        }
+    }
+}
+
+#ifdef USE_PTHREAD_MUTEX
+/* mutex: only to avoid busy-looping too much in tx_spinloop() below */
+static pthread_mutex_t mutex_inevitable = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+#ifdef COMMIT_OTHER_INEV
+unsigned long can_commit_with_other_inevitable(struct tx_descriptor *d,
+                                               unsigned long expected)
+{
+  int i;
+  owner_version_t ovt;
+  unsigned long result = 0;
+  struct tx_descriptor *d_inev;
+
+  // 'd_inev_checking' is 1 or 2 when an inevitable transaction is running
+  // and didn't start committing yet; otherwise it is 0.  It is normally 1
+  // except in this function.
+  if (!bool_cas(&d_inev_checking, 1, 2))
+    {
+      d->num_otherinev[4]++;
+      return 0;
+    }
+
+  // optimization only: did the inevitable thread 'd_inev' read any data
+  // that we are about to commit?  If we are sure that the answer is
+  // negative, then commit anyway, because it cannot make the inevitable
+  // thread fail.  We can safely check an approximation of this, because
+  // we hold a lock on all orecs that we would like to write.  So if all
+  // orecs read by d_inev are not locked now, then no conflict.  This
+  // function is allowed to "fail" and give up rather than spinloop
+  // waiting for a condition to be true, which is potentially dangerous
+  // here, because we acquired all the locks.
+
+  // Note that if the inevitable thread itself adds in parallel an extra
+  // orec to d_inev->reads, *and* if this new orec is locked, then we
+  // will miss it here; but the d_inev thread will spinloop waiting for
+  // us to be done.  So even if we commit, the d_inev thread will just
+  // wait and load the new committed value.
+
+  // while we are in this function, the d_inev thread is prevented from
+  // going too far with the commitTransaction() code because d_inev_checking
+  // is greater than 1; it will just tx_spinloop(9).  (And of course it
+  // cannot abort.)
+
+  d_inev = thread_descriptor_inev;
+  if (!bool_cas(&d_inev->reads.locked, 0, 1))
+    {
+      d->num_otherinev[1]++;
+      goto give_up_1;
+    }
+
+  for (i=d_inev->reads.size; i--; )
+    {
+      ovt = d_inev->reads.items[i]->v;     // read this orec
+      if (ovt == d->my_lock_word)
+        {
+          d->num_otherinev[2]++;
+          goto give_up_2;
+        }
+    }
+  assert(expected & 1);
+  if (!change_global_timestamp(d, expected, expected + 2))
+    {
+      d->num_otherinev[3]++;
+      goto give_up_2;
+    }
+
+  /* success: scale d_inev forward */
+  d->num_otherinev[0]++;
+  result = expected + 1;
+  assert(d_inev->start_time == result - 2);
+  d_inev->start_time = result;
+  CFENCE;
+
+ give_up_2:
+  d_inev->reads.locked = 0;
+
+ give_up_1:
+  d_inev_checking = 1;
+  return result;
+}
+#endif
+
+void wait_end_inevitability(struct tx_descriptor *d)
+{
+  unsigned long curts;
+  releaseLocksForRetry(d);
+
+  // We are going to wait until the other inevitable transaction
+  // finishes.  XXX We could do better here: we could check if
+  // committing 'd' would create a conflict for the other inevitable
+  // thread 'd_inev' or not.  It requires peeking in 'd_inev' from this
+  // thread (which we never do so far) in order to do something like
+  // 'validate_fast(d_inev); d_inev->start_time = updated;'
+
+  while ((curts = get_global_timestamp(d)) & 1)
+    {
+      // while we're about to wait anyway, we can do a validate_fast
+      if (d->start_time < curts - 1)
+        {
+          validate_fast(d, 3);
+          d->start_time = curts - 1;
+        }
+      tx_spinloop(4);
+#ifdef USE_PTHREAD_MUTEX
+      pthread_mutex_lock(&mutex_inevitable);
+      pthread_mutex_unlock(&mutex_inevitable);
+#endif
+    }
+  acquireLocks(d);
+}
+
+void commitInevitableTransaction(struct tx_descriptor *d)
+{
+  unsigned long ts;
+  _Bool ok;
+
+#ifdef COMMIT_OTHER_INEV
+  // reset d_inev_checking back from 1 to 0
+  while (!bool_cas(&d_inev_checking, 1, 0))
+    tx_spinloop(9);
+#endif
+  // no-one else can modify global_timestamp if I'm inevitable
+  // and d_inev_checking is 0
+  ts = get_global_timestamp(d);
+  assert(ts & 1);
+  set_global_timestamp(d, ts + 1);
+  d->end_time = ts + 1;
+  assert(d->end_time == (d->start_time + 2));
+
+  // run the redo log, and release the locks
+  tx_redo(d);
+
+#ifdef USE_PTHREAD_MUTEX
+  pthread_mutex_unlock(&mutex_inevitable);
+#endif
+}
+
+/* lazy/lazy read instrumentation */
+void* stm_read_word(void** addr)
+{
+  struct tx_descriptor *d = thread_descriptor;
+
+  // check writeset first
+  wlog_t* found;
+  REDOLOG_FIND(d->redolog, addr, found, goto not_found);
+  return found->val;
+
+ not_found:;
+  // get the orec addr
+  volatile orec_t* o = get_orec((void*)addr);
+  owner_version_t ovt;
+
+#ifdef COMMIT_OTHER_INEV
+  // log orec BEFORE we spinloop waiting for the orec lock to be released,
+  // for can_commit_with_other_inevitable()
+  oreclist_insert(&d->reads, (orec_t*)o);
+#endif
+
+ retry:
+  // read the orec BEFORE we read anything else
+  ovt = o->v;
+  CFENCE;
+
+  // this tx doesn't hold any locks, so if the lock for this addr is held,
+  // there is contention.  A lock is never held for too long, so spinloop
+  // until it is released.
+  if (IS_LOCKED_OR_NEWER(ovt, d->start_time))
+    {
+      if (IS_LOCKED(ovt)) {
+        tx_spinloop(7);
+        goto retry;
+      }
+      // else this location is too new, scale forward
+      owner_version_t newts = get_global_timestamp(d) & ~1;
+#ifdef COMMIT_OTHER_INEV
+      d->reads.size--;   // ignore the newly logged orec
+#endif
+      validate_fast(d, 1);
+#ifdef COMMIT_OTHER_INEV
+      d->reads.size++;
+#endif
+      d->start_time = newts;
+    }
+
+  // orec is unlocked, with ts <= start_time.  read the location
+  void* tmp = *addr;
+
+  // postvalidate AFTER reading addr:
+  CFENCE;
+  if (o->v != ovt)
+    goto retry;       /* oops, try again */
+
+#ifndef COMMIT_OTHER_INEV
+  oreclist_insert(&d->reads, (orec_t*)o);
+#endif
+
+  return tmp;
+}
+
+void stm_write_word(void** addr, void* val)
+{
+  struct tx_descriptor *d = thread_descriptor;
+  redolog_insert(&d->redolog, addr, val);
+}
+
+
+void stm_descriptor_init(void)
+{
+  struct tx_descriptor *d = malloc(sizeof(struct tx_descriptor));
+  memset(d, 0, sizeof(struct tx_descriptor));
+
+  /* initialize 'my_lock_word' to be a unique negative number */
+  d->my_lock_word = (owner_version_t)d;
+  if (!IS_LOCKED(d->my_lock_word))
+    d->my_lock_word = ~d->my_lock_word;
+  assert(IS_LOCKED(d->my_lock_word));
+  d->spinloop_counter = (unsigned int)(d->my_lock_word | 1);
+
+  thread_descriptor = d;
+}
+
+void stm_descriptor_done(void)
+{
+  struct tx_descriptor *d = thread_descriptor;
+  thread_descriptor = NULL;
+
+  int num_aborts = 0, num_spinloops = 0;
+  int i, prevchar;
+  for (i=0; i<ABORT_REASONS; i++)
+    num_aborts += d->num_aborts[i];
+  for (i=0; i<SPINLOOP_REASONS; i++)
+    num_spinloops += d->num_spinloops[i];
+
+  fprintf(stderr, "thread %lx: %d commits, %d aborts ",
+          d->my_lock_word,
+          d->num_commits,
+          num_aborts);
+
+  for (i=0; i<ABORT_REASONS; i++)
+    fprintf(stderr, "%c%d", i == 0 ? '[' : ',',
+            d->num_aborts[i]);
+
+  for (i=1; i<SPINLOOP_REASONS; i++)  /* num_spinloops[0] == num_aborts */
+    fprintf(stderr, "%c%d", i == 1 ? '|' : ',',
+            d->num_spinloops[i]);
+
+#ifdef COMMIT_OTHER_INEV
+  for (i=0; i<OTHERINEV_REASONS; i++)
+    fprintf(stderr, "%c%d", i == 0 ? '|' : ',',
+            d->num_otherinev[i]);
+#endif
+
+  fprintf(stderr, "]\n");
+  free(d);
+}
+
+void* stm_perform_transaction(void*(*callback)(void*), void *arg)
+{
+  void *result;
+  jmp_buf jmpbuf;
+  stm_begin_transaction(&jmpbuf);
+  result = callback(arg);
+  stm_commit_transaction();
+  return result;
+}
+
+void stm_begin_transaction(jmp_buf* buf)
+{
+  struct tx_descriptor *d = thread_descriptor;
+  d->setjmp_buf = buf;
+  d->start_time = d->last_known_global_timestamp & ~1;
+}
+
+long stm_commit_transaction(void)
+{
+  struct tx_descriptor *d = thread_descriptor;
+
+  // if I don't have writes, I'm committed
+  if (!redolog_any_entry(&d->redolog))
+    {
+      if (is_inevitable(d))
+        {
+          unsigned long ts = get_global_timestamp(d);
+          assert(ts & 1);
+          set_global_timestamp(d, ts - 1);
+#ifdef USE_PTHREAD_MUTEX
+          pthread_mutex_unlock(&mutex_inevitable);
+#endif
+        }
+      d->num_commits++;
+      common_cleanup(d);
+      return d->start_time;
+    }
+
+  // bring that variable over to this CPU core (optimization, maybe)
+  global_timestamp;
+
+  // acquire locks
+  acquireLocks(d);
+
+  if (is_inevitable(d))
+    {
+      commitInevitableTransaction(d);
+    }
+  else
+    {
+      while (1)
+        {
+          unsigned long expected = get_global_timestamp(d);
+          if (expected & 1)
+            {
+#ifdef COMMIT_OTHER_INEV
+              // there is another inevitable transaction running.
+              expected = can_commit_with_other_inevitable(d, expected);
+              if (expected != 0)
+                {
+                  d->end_time = expected;
+                  break;
+                }
+#endif
+              // wait until it is done.  hopefully we can then proceed
+              // without conflicts.
+              wait_end_inevitability(d);
+              continue;
+            }
+          if (change_global_timestamp(d, expected, expected + 2))
+            {
+              d->end_time = expected + 2;
+              break;
+            }
+        }
+
+      // validate (but skip validation if nobody else committed)
+      if (d->end_time != (d->start_time + 2))
+        validate(d);
+
+      // run the redo log, and release the locks
+      tx_redo(d);
+    }
+
+  // remember that this was a commit
+  d->num_commits++;
+
+  // reset all lists
+  common_cleanup(d);
+  return d->end_time;
+}
+
+void stm_try_inevitable(void)
+{
+  /* when a transaction is inevitable, its start_time is equal to
+     global_timestamp and global_timestamp cannot be incremented
+     by another thread.  We set the lowest bit in global_timestamp
+     to 1. */
+  struct tx_descriptor *d = thread_descriptor;
+
+  if (is_inevitable(d))
+    return;  /* I am already inevitable */
+
+  while (1)
+    {
+      unsigned long curtime = get_global_timestamp(d);
+      if (d->start_time != (curtime & ~1))
+        {                             /* scale forward */
+          validate_fast(d, 2);
+          d->start_time = curtime & ~1;
+        }
+#ifdef USE_PTHREAD_MUTEX
+      pthread_mutex_lock(&mutex_inevitable);
+#endif
+      if (curtime & 1)   /* there is, or was, already an inevitable thread */
+        {
+          /* should we spinloop here, or abort (and likely come back
+             in try_inevitable() very soon)?  unclear.  For now
+             let's try to spinloop, after the waiting done by
+             acquiring the mutex */
+#ifdef USE_PTHREAD_MUTEX
+          pthread_mutex_unlock(&mutex_inevitable);
+#endif
+          tx_spinloop(6);
+          continue;
+        }
+      if (change_global_timestamp(d, curtime, curtime + 1))
+        break;
+#ifdef USE_PTHREAD_MUTEX
+      pthread_mutex_unlock(&mutex_inevitable);
+#endif
+    }
+  d->setjmp_buf = NULL;   /* inevitable from now on */
+#ifdef COMMIT_OTHER_INEV
+  thread_descriptor_inev = d;
+  CFENCE;
+  d_inev_checking = 1;
+#endif
+}
+
+void stm_begin_inevitable_transaction(void)
+{
+  struct tx_descriptor *d = thread_descriptor;
+  unsigned long curtime;
+
+ retry:
+#ifdef USE_PTHREAD_MUTEX
+  pthread_mutex_lock(&mutex_inevitable);   /* possibly waiting here */
+#endif
+
+  while (1)
+    {
+      curtime = global_timestamp;
+      if (curtime & 1)
+        {
+#ifdef USE_PTHREAD_MUTEX
+          pthread_mutex_unlock(&mutex_inevitable);
+#endif
+          tx_spinloop(5);
+          goto retry;
+        }
+      if (bool_cas(&global_timestamp, curtime, curtime + 1))
+        break;
+    }
+  d->setjmp_buf = NULL;
+  d->start_time = curtime;
+#ifdef COMMIT_OTHER_INEV
+  thread_descriptor_inev = d;
+  CFENCE;
+  d_inev_checking = 1;
+#endif
+}
diff --git a/pypy/translator/stm/src_stm/et.h b/pypy/translator/stm/src_stm/et.h
new file mode 100644
--- /dev/null
+++ b/pypy/translator/stm/src_stm/et.h
@@ -0,0 +1,25 @@
+/*** Extendable Timestamps
+ *
+ * This is a C version of rstm_r5/stm/et.hpp.
+ * See http://www.cs.rochester.edu/research/synchronization/rstm/api.shtml
+ *
+ */
+
+#ifndef _ET_H
+#define _ET_H
+
+#include <setjmp.h>
+
+
+void stm_descriptor_init(void);
+void stm_descriptor_done(void);
+void* stm_perform_transaction(void*(*)(void*), void*);
+void stm_begin_transaction(jmp_buf* buf);
+long stm_commit_transaction(void);
+void* stm_read_word(void** addr);
+void stm_write_word(void** addr, void* val);
+void stm_try_inevitable(void);
+void stm_begin_inevitable_transaction(void);
+
+
+#endif  /* _ET_H */
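
For orientation, a single-threaded C sketch of driving this API, mirroring what test_rffi_stm.py exercises through the rffi bindings (illustration only; counter_slot, increment and main are made-up names):

    #include <stdio.h>
    #include "src_stm/et.h"

    static void *counter_slot = NULL;

    static void *increment(void *arg)
    {
        /* runs inside the transaction set up by stm_perform_transaction */
        void *old = stm_read_word(&counter_slot);
        stm_write_word(&counter_slot, (char *)old + 1);
        return old;
    }

    int main(void)
    {
        stm_descriptor_init();                 /* per-thread setup        */
        stm_perform_transaction(increment, NULL);
        stm_descriptor_done();                 /* prints per-thread stats */
        return 0;
    }
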
diff --git a/pypy/translator/stm/src_stm/lists.c b/pypy/translator/stm/src_stm/lists.c
new file mode 100644
--- /dev/null
+++ b/pypy/translator/stm/src_stm/lists.c
@@ -0,0 +1,241 @@
+/* -*- c-basic-offset: 2 -*- */
+
+#include <limits.h>
+
+/************************************************************/
+
+/* The redolog_xx functions are implemented as a tree, supporting
+   very high performance in REDOLOG_FIND in the common case where
+   there are no or few elements in the tree, but scaling correctly
+   if the number of items becomes large. */
+
+#define TREE_BITS   4
+#define TREE_ARITY  (1 << TREE_BITS)
+
+#define TREE_DEPTH_MAX   ((sizeof(void*)*8 - 2 + TREE_BITS-1) / TREE_BITS)
+/* sizeof(void*)*8 = total number of bits
+   2 = bits that we ignore anyway (2 or 3, conservatively 2)
+   (x + TREE_BITS-1) / TREE_BITS = divide by TREE_BITS, rounding up
+*/
+
+#define TREE_MASK   ((TREE_ARITY - 1) * sizeof(void*))
+
+typedef struct {
+  void** addr;
+  void* val;
+  owner_version_t p;   // the previous version number (if locked)
+} wlog_t;
+
+typedef struct {
+  char *items[TREE_ARITY];
+} wlog_node_t;
+
+struct RedoLog {
+  char *raw_start, *raw_current, *raw_end;
+  wlog_node_t toplevel;
+};
+
+static void _redolog_clear_node(wlog_node_t *node)
+{
+  memset(node, 0, sizeof(wlog_node_t));
+}
+
+static void redolog_clear(struct RedoLog *redolog)
+{
+  if (redolog->raw_current != redolog->raw_start)
+    {
+      _redolog_clear_node(&redolog->toplevel);
+      redolog->raw_current = redolog->raw_start;
+    }
+}
+
+static int redolog_any_entry(struct RedoLog *redolog)
+{
+  return redolog->raw_current != redolog->raw_start;
+}
+
+#define _REDOLOG_LOOP(redolog, item, INITIAL, _PLUS_)                   \
+{                                                                       \
+  struct { char **next; char **end; } _stack[TREE_DEPTH_MAX], *_stackp; \
+  char **_next, **_end, *_entry;                                        \
+  /* initialization */                                                  \
+  _stackp = _stack;      /* empty stack */                              \
+  _next = (redolog).toplevel.items + INITIAL;                           \
+  _end = _next _PLUS_ TREE_ARITY;                                       \
+  /* loop */                                                            \
+  while (1)                                                             \
+    {                                                                   \
+      if (_next == _end)                                                \
+        {                                                               \
+          if (_stackp == _stack)                                        \
+            break;   /* done */                                         \
+          /* finished with this level, go to the next one */            \
+          _stackp--;                                                    \
+          _next = _stackp->next;                                        \
+          _end = _stackp->end;                                          \
+          continue;                                                     \
+        }                                                               \
+      _entry = *_next;                                                  \
+      _next = _next _PLUS_ 1;                                           \
+      if (_entry == NULL)   /* empty entry */                           \
+        continue;                                                       \
+      if (((long)_entry) & 1)                                           \
+        {  /* points to a further level: enter it */                    \
+          _stackp->next = _next;                                        \
+          _stackp->end = _end;                                          \
+          _stackp++;                                                    \
+          _next = ((wlog_node_t *)(_entry - 1))->items + INITIAL;       \
+          _end = _next _PLUS_ TREE_ARITY;                               \
+          continue;                                                     \
+        }                                                               \
+      /* points to a wlog_t item */                                     \
+      item = (wlog_t *)_entry;
+
+#define REDOLOG_LOOP_FORWARD(redolog, item)                             \
+                       _REDOLOG_LOOP(redolog, item, 0, +)
+#define REDOLOG_LOOP_BACKWARD(redolog, item)                            \
+                       _REDOLOG_LOOP(redolog, item, (TREE_ARITY-1), -)
+#define REDOLOG_LOOP_END     } }
+
+#define REDOLOG_FIND(redolog, addr1, result, goto_not_found)    \
+{                                                               \
+  unsigned long _key = (unsigned long)(addr1);                  \
+  char *_p = (char *)((redolog).toplevel.items);                \
+  char *_entry = *(char **)(_p + (_key & TREE_MASK));           \
+  if (_entry == NULL)                                           \
+    goto_not_found;    /* common case, hopefully */             \
+  result = _redolog_find(_entry, addr1);                        \
+  if (result == NULL || result->addr != (addr1))                \
+    goto_not_found;                                             \
+}
+
+static wlog_t *_redolog_find(char *entry, void** addr)
+{
+  unsigned long key = (unsigned long)addr;
+  while (((long)entry) & 1)
+    {   /* points to a further level */
+      key >>= TREE_BITS;
+      entry = *(char **)((entry - 1) + (key & TREE_MASK));
+    }
+  return (wlog_t *)entry;   /* may be NULL */
+}
+
+static void redolog_insert(struct RedoLog *redolog, void** addr, void* val);
+
+static void _redolog_grow(struct RedoLog *redolog, long extra)
+{
+  struct RedoLog newredolog;
+  wlog_t *item, *newitem;
+  long alloc = redolog->raw_end - redolog->raw_start;
+  long newalloc = (alloc + extra + (alloc >> 2) + 31) & ~15;
+  //fprintf(stderr, "growth: %ld\n", newalloc);
+  char *newitems = malloc(newalloc);
+  newredolog.raw_start = newitems;
+  newredolog.raw_current = newitems;
+  newredolog.raw_end = newitems + newalloc;
+  _redolog_clear_node(&newredolog.toplevel);
+  REDOLOG_LOOP_FORWARD(*redolog, item)
+    {
+      assert(item->p == -1);
+      redolog_insert(&newredolog, item->addr, item->val);
+    } REDOLOG_LOOP_END;
+  free(redolog->raw_start);
+  *redolog = newredolog;
+}
+
+static char *_redolog_grab(struct RedoLog *redolog, long size)
+{
+  char *result;
+  result = redolog->raw_current;
+  redolog->raw_current += size;
+  if (redolog->raw_current > redolog->raw_end)
+    {
+      _redolog_grow(redolog, size);
+      return NULL;
+    }
+  return result;
+}
+
+static void redolog_insert(struct RedoLog *redolog, void** addr, void* val)
+{
+ retry:;
+  wlog_t *wlog;
+  unsigned long key = (unsigned long)addr;
+  int shift = 0;
+  char *p = (char *)(redolog->toplevel.items);
+  char *entry;
+  while (1)
+    {
+      p += (key >> shift) & TREE_MASK;
+      shift += TREE_BITS;
+      entry = *(char **)p;
+      if (entry == NULL)
+        break;
+      else if (((long)entry) & 1)
+        {   /* points to a further level */
+          p = entry - 1;
+        }
+      else
+        {
+          wlog_t *wlog1 = (wlog_t *)entry;
+          if (wlog1->addr == addr)
+            {
+              /* overwrite and that's it */
+              wlog1->val = val;
+              return;
+            }
+          /* collision: there is already a different wlog here */
+          wlog_node_t *node = (wlog_node_t *)
+                _redolog_grab(redolog, sizeof(wlog_node_t));
+          if (node == NULL) goto retry;
+          _redolog_clear_node(node);
+          unsigned long key1 = (unsigned long)(wlog1->addr);
+          char *p1 = (char *)(node->items);
+          *(wlog_t **)(p1 + ((key1 >> shift) & TREE_MASK)) = wlog1;
+          *(char **)p = ((char *)node) + 1;
+          p = p1;
+        }
+    }
+  wlog = (wlog_t *)_redolog_grab(redolog, sizeof(wlog_t));
+  if (wlog == NULL) goto retry;
+  wlog->addr = addr;
+  wlog->val = val;
+  wlog->p = -1;
+  *(char **)p = (char *)wlog;
+}
+
+/************************************************************/
+
+/* The oreclist_xx functions are implemented as an array that grows
+   as needed. */
+
+struct OrecList {
+  long size, alloc;
+  unsigned long locked;
+  orec_t **items;
+};
+
+static void _oreclist_grow(struct OrecList *oreclist)
+{
+  long newalloc = oreclist->alloc + (oreclist->alloc >> 1) + 16;
+  orec_t **newitems = malloc(newalloc * sizeof(orec_t *));
+  long i;
+  for (i=0; i<oreclist->size; i++)
+    newitems[i] = oreclist->items[i];
+  while (!bool_cas(&oreclist->locked, 0, 1))
+    /* rare case */ ;
+  free(oreclist->items);
+  oreclist->items = newitems;
+  oreclist->alloc = newalloc;
+  CFENCE;
+  oreclist->locked = 0;
+}
+
+static void oreclist_insert(struct OrecList *oreclist, orec_t *newitem)
+{
+  if (oreclist->size == oreclist->alloc)
+    _oreclist_grow(oreclist);
+  oreclist->items[oreclist->size++] = newitem;
+}
+
+/************************************************************/
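
The redo log is consulted with the REDOLOG_FIND macro before falling back to a real memory read, exactly as stm_read_word() does in et.c.  A hypothetical standalone helper showing that pattern (assuming, as in et.c, that lists.c is included after orec_t and owner_version_t are defined):

    static void *redolog_lookup_or(struct RedoLog *log, void **addr,
                                   void *fallback)
    {
        wlog_t *found;
        REDOLOG_FIND(*log, addr, found, goto not_found);
        return found->val;      /* a write was buffered: return it        */
     not_found:
        return fallback;        /* not written in this transaction so far */
    }
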
diff --git a/pypy/translator/stm/test/__init__.py b/pypy/translator/stm/test/__init__.py
new file mode 100644
diff --git a/pypy/translator/stm/test/test_basic.py b/pypy/translator/stm/test/test_basic.py
new file mode 100644
--- /dev/null
+++ b/pypy/translator/stm/test/test_basic.py
@@ -0,0 +1,2 @@
+
+
diff --git a/pypy/translator/stm/test/test_rffi_stm.py b/pypy/translator/stm/test/test_rffi_stm.py
new file mode 100644
--- /dev/null
+++ b/pypy/translator/stm/test/test_rffi_stm.py
@@ -0,0 +1,14 @@
+from pypy.translator.stm._rffi_stm import *
+from pypy.rpython.annlowlevel import llhelper
+
+def test_descriptor():
+    descriptor_init()
+    descriptor_done()
+
+def test_perform_transaction():
+    def callback1(x):
+        return lltype.nullptr(rffi.VOIDP.TO)
+    descriptor_init()
+    perform_transaction(llhelper(CALLBACK, callback1),
+                        lltype.nullptr(rffi.VOIDP.TO))
+    descriptor_done()

