[pypy-commit] pypy jit-multilabel: hg merge default

Sat Dec 17 13:13:11 CET 2011

Author: Hakan Ardo <hakan at debian.org>
Branch: jit-multilabel
Changeset: r50607:2d900cd7e4e3
Date: 2011-12-17 12:49 +0100
http://bitbucket.org/pypy/pypy/changeset/2d900cd7e4e3/

Log:	hg merge default

diff --git a/lib-python/modified-2.7/ctypes/test/test_callbacks.py b/lib-python/modified-2.7/ctypes/test/test_callbacks.py
--- a/lib-python/modified-2.7/ctypes/test/test_callbacks.py
+++ b/lib-python/modified-2.7/ctypes/test/test_callbacks.py
@@ -1,5 +1,6 @@
 import unittest
 from ctypes import *
+from ctypes.test import xfail
 import _ctypes_test
 
 class Callbacks(unittest.TestCase):
@@ -98,6 +99,7 @@
 ##        self.check_type(c_char_p, "abc")
 ##        self.check_type(c_char_p, "def")
 
+    @xfail
     def test_pyobject(self):
         o = ()
         from sys import getrefcount as grc
diff --git a/lib-python/modified-2.7/ctypes/test/test_libc.py b/lib-python/modified-2.7/ctypes/test/test_libc.py
--- a/lib-python/modified-2.7/ctypes/test/test_libc.py
+++ b/lib-python/modified-2.7/ctypes/test/test_libc.py
@@ -25,7 +25,10 @@
         lib.my_qsort(chars, len(chars)-1, sizeof(c_char), comparefunc(sort))
         self.assertEqual(chars.raw, "   ,,aaaadmmmnpppsss\x00")
 
-    def test_no_more_xfail(self):
+    def SKIPPED_test_no_more_xfail(self):
+        # We decided to not explicitly support the whole ctypes-2.7
+        # and instead go for a case-by-case, demand-driven approach.
+        # So this test is skipped instead of failing.
         import socket
         import ctypes.test
         self.assertTrue(not hasattr(ctypes.test, 'xfail'),
diff --git a/pypy/doc/config/objspace.std.withspecialisedtuple.txt b/pypy/doc/config/objspace.std.withspecialisedtuple.txt
new file mode 100644
--- /dev/null
+++ b/pypy/doc/config/objspace.std.withspecialisedtuple.txt
@@ -0,0 +1,3 @@
+Use "specialized tuples", a custom implementation for some common kinds
+of tuples.  Currently limited to tuples of length 2, in three variants:
+(int, int), (float, float), and a generic (object, object).
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -190,8 +190,8 @@
     def is_w(self, space, w_other):
         return self is w_other
 
-    def unique_id(self, space):
-        return space.wrap(compute_unique_id(self))
+    def immutable_unique_id(self, space):
+        return None
 
     def str_w(self, space):
         w_msg = typed_unwrap_error_msg(space, "string", self)
@@ -706,7 +706,10 @@
         return w_two.is_w(self, w_one)
 
     def id(self, w_obj):
-        return w_obj.unique_id(self)
+        w_result = w_obj.immutable_unique_id(self)
+        if w_result is None:
+            w_result = self.wrap(compute_unique_id(w_obj))
+        return w_result
 
     def hash_w(self, w_obj):
         """shortcut for space.int_w(space.hash(w_obj))"""
diff --git a/pypy/interpreter/typedef.py b/pypy/interpreter/typedef.py
--- a/pypy/interpreter/typedef.py
+++ b/pypy/interpreter/typedef.py
@@ -54,7 +54,11 @@
 #  Hash support
 
 def default_identity_hash(space, w_obj):
-    return space.wrap(compute_identity_hash(w_obj))
+    w_unique_id = w_obj.immutable_unique_id(space)
+    if w_unique_id is None:     # common case
+        return space.wrap(compute_identity_hash(w_obj))
+    else:
+        return space.hash(w_unique_id)
 
 # ____________________________________________________________
 #
diff --git a/pypy/jit/backend/llgraph/llimpl.py b/pypy/jit/backend/llgraph/llimpl.py
--- a/pypy/jit/backend/llgraph/llimpl.py
+++ b/pypy/jit/backend/llgraph/llimpl.py
@@ -328,6 +328,14 @@
     _variables.append(v)
     return r
 
+def compile_started_vars(clt):
+    if not hasattr(clt, '_debug_argtypes'):    # only when compiling the loop
+        argtypes = [v.concretetype for v in _variables]
+        try:
+            clt._debug_argtypes = argtypes
+        except AttributeError:    # when 'clt' is actually a translated
+            pass                  # GcStruct
+
 def compile_add(loop, opnum):
     loop = _from_opaque(loop)
     loop.operations.append(Operation(opnum))
@@ -355,11 +363,13 @@
 
 TARGET_TOKENS = weakref.WeakKeyDictionary()
 
-def compile_add_target_token(loop, descr):
+def compile_add_target_token(loop, descr, clt):
+    # here, 'clt' is the compiled_loop_token of the original loop that
+    # we are compiling
     loop = _from_opaque(loop)
     op = loop.operations[-1]
     descrobj = _normalize(descr)
-    TARGET_TOKENS[descrobj] = loop, len(loop.operations), op.args
+    TARGET_TOKENS[descrobj] = loop, len(loop.operations), op.args, clt
 
 def compile_add_var(loop, intvar):
     loop = _from_opaque(loop)
@@ -395,17 +405,25 @@
     _variables.append(v)
     return r
 
-def compile_add_jump_target(loop, targettoken):
+def compile_add_jump_target(loop, targettoken, source_clt):
     loop = _from_opaque(loop)
     descrobj = _normalize(targettoken)
-    loop_target, target_opindex, target_inputargs = TARGET_TOKENS[descrobj]
+    (loop_target, target_opindex, target_inputargs, target_clt
+        ) = TARGET_TOKENS[descrobj]
+    #
+    try:
+        assert source_clt._debug_argtypes == target_clt._debug_argtypes
+    except AttributeError:   # when translated
+        pass
     #
     op = loop.operations[-1]
     op.jump_target = loop_target
     op.jump_target_opindex = target_opindex
     op.jump_target_inputargs = target_inputargs
     assert op.opnum == rop.JUMP
-    assert len(op.args) == len(target_inputargs)
+    assert [v.concretetype for v in op.args] == (
+           [v.concretetype for v in target_inputargs])
+    #
     if loop_target == loop:
         log.info("compiling new loop")
     else:
@@ -987,6 +1005,7 @@
         self._may_force = self.opindex
         try:
             inpargs = _from_opaque(ctl.compiled_version).inputargs
+            assert len(inpargs) == len(args)
             for i, inparg in enumerate(inpargs):
                 TYPE = inparg.concretetype
                 if TYPE is lltype.Signed:
@@ -1816,6 +1835,7 @@
 setannotation(compile_start_int_var, annmodel.SomeInteger())
 setannotation(compile_start_ref_var, annmodel.SomeInteger())
 setannotation(compile_start_float_var, annmodel.SomeInteger())
+setannotation(compile_started_vars, annmodel.s_None)
 setannotation(compile_add, annmodel.s_None)
 setannotation(compile_add_descr, annmodel.s_None)
 setannotation(compile_add_descr_arg, annmodel.s_None)
diff --git a/pypy/jit/backend/llgraph/runner.py b/pypy/jit/backend/llgraph/runner.py
--- a/pypy/jit/backend/llgraph/runner.py
+++ b/pypy/jit/backend/llgraph/runner.py
@@ -138,11 +138,12 @@
         clt = original_loop_token.compiled_loop_token
         clt.loop_and_bridges.append(c)
         clt.compiling_a_bridge()
-        self._compile_loop_or_bridge(c, inputargs, operations)
+        self._compile_loop_or_bridge(c, inputargs, operations, clt)
         old, oldindex = faildescr._compiled_fail
         llimpl.compile_redirect_fail(old, oldindex, c)
 
-    def compile_loop(self, inputargs, operations, jitcell_token, log=True, name=''):
+    def compile_loop(self, inputargs, operations, jitcell_token,
+                     log=True, name=''):
         """In a real assembler backend, this should assemble the given
         list of operations.  Here we just generate a similar CompiledLoop
         instance.  The code here is RPython, whereas the code in llimpl
@@ -153,14 +154,14 @@
         clt.loop_and_bridges = [c]
         clt.compiled_version = c
         jitcell_token.compiled_loop_token = clt
-        self._compile_loop_or_bridge(c, inputargs, operations)
+        self._compile_loop_or_bridge(c, inputargs, operations, clt)
 
     def free_loop_and_bridges(self, compiled_loop_token):
         for c in compiled_loop_token.loop_and_bridges:
             llimpl.mark_as_free(c)
         model.AbstractCPU.free_loop_and_bridges(self, compiled_loop_token)
 
-    def _compile_loop_or_bridge(self, c, inputargs, operations):
+    def _compile_loop_or_bridge(self, c, inputargs, operations, clt):
         var2index = {}
         for box in inputargs:
             if isinstance(box, history.BoxInt):
@@ -172,10 +173,11 @@
                 var2index[box] = llimpl.compile_start_float_var(c)
             else:
                 raise Exception("box is: %r" % (box,))
-        self._compile_operations(c, operations, var2index)
+        llimpl.compile_started_vars(clt)
+        self._compile_operations(c, operations, var2index, clt)
         return c
 
-    def _compile_operations(self, c, operations, var2index):
+    def _compile_operations(self, c, operations, var2index, clt):
         for op in operations:
             llimpl.compile_add(c, op.getopnum())
             descr = op.getdescr()
@@ -187,7 +189,7 @@
                 assert op.getopnum() != rop.JUMP
                 llimpl.compile_add_loop_token(c, descr)
             if isinstance(descr, history.TargetToken) and op.getopnum() == rop.LABEL:
-                llimpl.compile_add_target_token(c, descr)
+                llimpl.compile_add_target_token(c, descr, clt)
             if self.is_oo and isinstance(descr, (OODescr, MethDescr)):
                 # hack hack, not rpython
                 c._obj.externalobj.operations[-1].setdescr(descr)
@@ -241,7 +243,7 @@
         assert op.is_final()
         if op.getopnum() == rop.JUMP:
             targettoken = op.getdescr()
-            llimpl.compile_add_jump_target(c, targettoken)
+            llimpl.compile_add_jump_target(c, targettoken, clt)
         elif op.getopnum() == rop.FINISH:
             faildescr = op.getdescr()
             index = self.get_fail_descr_number(faildescr)
@@ -260,23 +262,28 @@
         self.latest_frame = frame
         return fail_index
 
-    def execute_token(self, loop_token):
-        """Calls the fake 'assembler' generated for the given loop.
-        Returns the descr of the last executed operation: either the one
-        attached to the failing guard, or the one attached to the FINISH.
-        Use set_future_value_xxx() before, and get_latest_value_xxx() after.
-        """
-        fail_index = self._execute_token(loop_token)
-        return self.get_fail_descr_from_number(fail_index)
-
-    def set_future_value_int(self, index, intvalue):
-        llimpl.set_future_value_int(index, intvalue)
-
-    def set_future_value_ref(self, index, objvalue):
-        llimpl.set_future_value_ref(index, objvalue)
-
-    def set_future_value_float(self, index, floatvalue):
-        llimpl.set_future_value_float(index, floatvalue)
+    def make_execute_token(self, *argtypes):
+        nb_args = len(argtypes)
+        unroll_argtypes = unrolling_iterable(list(enumerate(argtypes)))
+        #
+        def execute_token(loop_token, *args):
+            assert len(args) == nb_args
+            for index, TYPE in unroll_argtypes:
+                x = args[index]
+                assert TYPE == lltype.typeOf(x)
+                if TYPE == lltype.Signed:
+                    llimpl.set_future_value_int(index, x)
+                elif TYPE == llmemory.GCREF:
+                    llimpl.set_future_value_ref(index, x)
+                elif TYPE == longlong.FLOATSTORAGE:
+                    llimpl.set_future_value_float(index, x)
+                else:
+                    assert 0
+            #
+            fail_index = self._execute_token(loop_token)
+            return self.get_fail_descr_from_number(fail_index)
+        #
+        return execute_token
 
     def get_latest_value_int(self, index):
         return llimpl.frame_int_getvalue(self.latest_frame, index)
diff --git a/pypy/jit/backend/llsupport/regalloc.py b/pypy/jit/backend/llsupport/regalloc.py
--- a/pypy/jit/backend/llsupport/regalloc.py
+++ b/pypy/jit/backend/llsupport/regalloc.py
@@ -69,6 +69,8 @@
         self.bindings[box] = loc
         #
         index = self.get_loc_index(loc)
+        if index < 0:
+            return
         endindex = index + self.frame_size(box.type)
         while len(self.used) < endindex:
             self.used.append(False)
@@ -91,6 +93,8 @@
         #
         size = self.frame_size(box.type)
         baseindex = self.get_loc_index(loc)
+        if baseindex < 0:
+            return
         for i in range(size):
             index = baseindex + i
             assert 0 <= index < len(self.used)
@@ -98,7 +102,8 @@
 
     def try_to_reuse_location(self, box, loc):
         index = self.get_loc_index(loc)
-        assert index >= 0
+        if index < 0:
+            return False
         size = self.frame_size(box.type)
         for i in range(size):
             while (index + i) >= len(self.used):
diff --git a/pypy/jit/backend/model.py b/pypy/jit/backend/model.py
--- a/pypy/jit/backend/model.py
+++ b/pypy/jit/backend/model.py
@@ -1,5 +1,6 @@
 from pypy.rlib.debug import debug_start, debug_print, debug_stop
 from pypy.jit.metainterp import history
+from pypy.rpython.lltypesystem import lltype
 
 
 class AbstractCPU(object):
@@ -84,24 +85,21 @@
         """Print a disassembled version of looptoken to stdout"""
         raise NotImplementedError
 
-    def execute_token(self, looptoken):
-        """Execute the generated code referenced by the looptoken.
+    def execute_token(self, looptoken, *args):
+        """NOT_RPYTHON (for tests only)
+        Execute the generated code referenced by the looptoken.
         Returns the descr of the last executed operation: either the one
         attached to the failing guard, or the one attached to the FINISH.
-        Use set_future_value_xxx() before, and get_latest_value_xxx() after.
+        Use get_latest_value_xxx() afterwards to read the result(s).
         """
-        raise NotImplementedError
+        argtypes = [lltype.typeOf(x) for x in args]
+        execute = self.make_execute_token(*argtypes)
+        return execute(looptoken, *args)
 
-    def set_future_value_int(self, index, intvalue):
-        """Set the value for the index'th argument for the loop to run."""
-        raise NotImplementedError
-
-    def set_future_value_float(self, index, floatvalue):
-        """Set the value for the index'th argument for the loop to run."""
-        raise NotImplementedError
-
-    def set_future_value_ref(self, index, objvalue):
-        """Set the value for the index'th argument for the loop to run."""
+    def make_execute_token(self, *argtypes):
+        """Must make and return an execute_token() function that will be
+        called with the given argtypes.
+        """
         raise NotImplementedError
 
     def get_latest_value_int(self, index):
diff --git a/pypy/jit/backend/test/calling_convention_test.py b/pypy/jit/backend/test/calling_convention_test.py
--- a/pypy/jit/backend/test/calling_convention_test.py
+++ b/pypy/jit/backend/test/calling_convention_test.py
@@ -40,17 +40,18 @@
         local_floats = list(floats)
         local_ints = list(ints)
         expected_result = 0.0
+        arguments = []
         for i in range(len(args)):
             x = args[i]
             if x[0] == 'f':
                 x = local_floats.pop()
                 t = longlong.getfloatstorage(x)
-                self.cpu.set_future_value_float(i, t)
+                arguments.append(t)
             else:
                 x = local_ints.pop()
-                self.cpu.set_future_value_int(i, x)
+                arguments.append(x)
             expected_result += x
-        return expected_result
+        return arguments, expected_result
 
     @classmethod
     def get_funcbox(cls, cpu, func_ptr):
@@ -110,9 +111,9 @@
             looptoken = JitCellToken()
             done_number = self.cpu.get_fail_descr_number(loop.operations[-1].getdescr())
             self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-            expected_result = self._prepare_args(args, floats, ints)
+            argvals, expected_result = self._prepare_args(args, floats, ints)
 
-            res = self.cpu.execute_token(looptoken)
+            res = self.cpu.execute_token(looptoken, *argvals)
             x = longlong.getrealfloat(cpu.get_latest_value_float(0))
             assert abs(x - expected_result) < 0.0001
 
@@ -258,8 +259,8 @@
             done_number = self.cpu.get_fail_descr_number(called_loop.operations[-1].getdescr())
             self.cpu.compile_loop(called_loop.inputargs, called_loop.operations, called_looptoken)
 
-            expected_result = self._prepare_args(args, floats, ints)
-            res = cpu.execute_token(called_looptoken)
+            argvals, expected_result = self._prepare_args(args, floats, ints)
+            res = cpu.execute_token(called_looptoken, *argvals)
             assert res.identifier == 3
             t = longlong.getrealfloat(cpu.get_latest_value_float(0))
             assert abs(t - expected_result) < 0.0001
@@ -288,8 +289,8 @@
                 self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
 
                 # prepare call to called_loop
-                self._prepare_args(args, floats, ints)
-                res = cpu.execute_token(othertoken)
+                argvals, _ = self._prepare_args(args, floats, ints)
+                res = cpu.execute_token(othertoken, *argvals)
                 x = longlong.getrealfloat(cpu.get_latest_value_float(0))
                 assert res.identifier == 4
                 assert abs(x - expected_result) < 0.0001
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -34,20 +34,17 @@
                                                                 descr)
         looptoken = JitCellToken()
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        j = 0
+        args = []
         for box in inputargs:
             if isinstance(box, BoxInt):
-                self.cpu.set_future_value_int(j, box.getint())
-                j += 1
+                args.append(box.getint())
             elif isinstance(box, (BoxPtr, BoxObj)):
-                self.cpu.set_future_value_ref(j, box.getref_base())
-                j += 1
+                args.append(box.getref_base())
             elif isinstance(box, BoxFloat):
-                self.cpu.set_future_value_float(j, box.getfloatstorage())
-                j += 1
+                args.append(box.getfloatstorage())
             else:
                 raise NotImplementedError(box)
-        res = self.cpu.execute_token(looptoken)
+        res = self.cpu.execute_token(looptoken, *args)
         if res is operations[-1].getdescr():
             self.guard_failed = False
         else:
@@ -108,8 +105,7 @@
         inputargs = [i0]
         looptoken = JitCellToken()
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         res = self.cpu.get_latest_value_int(0)
         assert res == 3
         assert fail.identifier == 1
@@ -131,8 +127,7 @@
         operations[3].setfailargs([i1])
 
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 10
@@ -156,8 +151,7 @@
         operations[4].setfailargs([None, None, i1, None])
 
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        self.cpu.set_future_value_int(0, 44)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 44)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(2)
         assert res == 10
@@ -222,8 +216,7 @@
 
         self.cpu.compile_bridge(faildescr1, [i1b], bridge, looptoken)
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 20
@@ -264,8 +257,7 @@
 
         self.cpu.compile_bridge(faildescr1, [i1b], bridge, looptoken)
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 20
@@ -288,8 +280,7 @@
         operations[3].setfailargs([None, i1, None])
         self.cpu.compile_loop(inputargs, operations, looptoken)
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail is faildescr1
 
         count = self.cpu.get_latest_value_count()
@@ -311,8 +302,7 @@
             ResOperation(rop.FINISH, [i0], None, descr=faildescr)
             ]
         self.cpu.compile_loop([i0], operations, looptoken)
-        self.cpu.set_future_value_int(0, 99)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 99)
         assert fail is faildescr
         res = self.cpu.get_latest_value_int(0)
         assert res == 99
@@ -343,8 +333,7 @@
                 ]
             self.cpu.compile_loop([f0], operations, looptoken)
             value = longlong.getfloatstorage(-61.25)
-            self.cpu.set_future_value_float(0, value)
-            fail = self.cpu.execute_token(looptoken)
+            fail = self.cpu.execute_token(looptoken, value)
             assert fail is faildescr
             res = self.cpu.get_latest_value_float(0)
             assert longlong.getrealfloat(res) == -61.25
@@ -379,9 +368,7 @@
             ]
         operations[-2].setfailargs([t, z])
         cpu.compile_loop([x, y], operations, looptoken)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 10)
-        res = self.cpu.execute_token(looptoken)
+        res = self.cpu.execute_token(looptoken, 0, 10)
         assert self.cpu.get_latest_value_int(0) == 0
         assert self.cpu.get_latest_value_int(1) == 55
 
@@ -442,9 +429,7 @@
             for x, y, z in testcases:
                 excvalue = self.cpu.grab_exc_value()
                 assert not excvalue
-                self.cpu.set_future_value_int(0, x)
-                self.cpu.set_future_value_int(1, y)
-                fail = self.cpu.execute_token(looptoken)
+                fail = self.cpu.execute_token(looptoken, x, y)
                 if (z == boom) ^ reversed:
                     assert fail.identifier == 1
                 else:
@@ -1129,17 +1114,7 @@
                     assert 0
             values[index_counter] = 11
             #
-            for i, (box, val) in enumerate(zip(inputargs, values)):
-                if isinstance(box, BoxInt):
-                    self.cpu.set_future_value_int(i, val)
-                elif isinstance(box, BoxPtr):
-                    self.cpu.set_future_value_ref(i, val)
-                elif isinstance(box, BoxFloat):
-                    self.cpu.set_future_value_float(i, val)
-                else:
-                    assert 0
-            #
-            fail = self.cpu.execute_token(looptoken)
+            fail = self.cpu.execute_token(looptoken, *values)
             assert fail.identifier == 15
             #
             dstvalues = values[:]
@@ -1191,10 +1166,11 @@
 
         self.cpu.compile_bridge(faildescr1, fboxes2, bridge, looptoken)
 
+        args = []
         for i in range(len(fboxes)):
             x = 13.5 + 6.73 * i
-            self.cpu.set_future_value_float(i, longlong.getfloatstorage(x))
-        fail = self.cpu.execute_token(looptoken)
+            args.append(longlong.getfloatstorage(x))
+        fail = self.cpu.execute_token(looptoken, *args)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(res) == 8.5
@@ -1244,14 +1220,12 @@
                         if test1 == -42 or combinaison[0] == 'b':
                             for test2 in [-65, -42, -11]:
                                 if test2 == -42 or combinaison[1] == 'b':
-                                    n = 0
+                                    args = []
                                     if combinaison[0] == 'b':
-                                        cpu.set_future_value_int(n, test1)
-                                        n += 1
+                                        args.append(test1)
                                     if combinaison[1] == 'b':
-                                        cpu.set_future_value_int(n, test2)
-                                        n += 1
-                                    fail = cpu.execute_token(looptoken)
+                                        args.append(test2)
+                                    fail = cpu.execute_token(looptoken, *args)
                                     #
                                     expected = compare(test1, test2)
                                     expected ^= guard_case
@@ -1303,16 +1277,14 @@
                         if test1 == -4.5 or combinaison[0] == 'b':
                             for test2 in [-6.5, -4.5, -2.5, nan]:
                                 if test2 == -4.5 or combinaison[1] == 'b':
-                                    n = 0
+                                    args = []
                                     if combinaison[0] == 'b':
-                                        cpu.set_future_value_float(
-                                            n, longlong.getfloatstorage(test1))
-                                        n += 1
+                                        args.append(
+                                            longlong.getfloatstorage(test1))
                                     if combinaison[1] == 'b':
-                                        cpu.set_future_value_float(
-                                            n, longlong.getfloatstorage(test2))
-                                        n += 1
-                                    fail = cpu.execute_token(looptoken)
+                                        args.append(
+                                            longlong.getfloatstorage(test2))
+                                    fail = cpu.execute_token(looptoken, *args)
                                     #
                                     expected = compare(test1, test2)
                                     expected ^= guard_case
@@ -1356,15 +1328,16 @@
         #
         self.cpu.compile_loop(inputargs, operations, looptoken)
         #
-        for i, box in enumerate(inputargs):
+        args = []
+        for box in inputargs:
             if isinstance(box, BoxInt):
-                self.cpu.set_future_value_int(i, box.getint())
+                args.append(box.getint())
             elif isinstance(box, BoxFloat):
-                self.cpu.set_future_value_float(i, box.getfloatstorage())
+                args.append(box.getfloatstorage())
             else:
                 assert 0
         #
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, *args)
         assert fail.identifier == 1
 
     def test_nan_and_infinity(self):
@@ -1427,10 +1400,9 @@
                         unique_testcase_list = list(set(testcase))
                         self.cpu.compile_loop(unique_testcase_list, operations,
                                               looptoken)
-                        for i, box in enumerate(unique_testcase_list):
-                            self.cpu.set_future_value_float(
-                                i, box.getfloatstorage())
-                        fail = self.cpu.execute_token(looptoken)
+                        args = [box.getfloatstorage()
+                                for box in unique_testcase_list]
+                        fail = self.cpu.execute_token(looptoken, *args)
                         if fail.identifier != 5 - (expected_id^expected):
                             if fail.identifier == 4:
                                 msg = "was taken"
@@ -1699,14 +1671,12 @@
         loop = parse(ops, self.cpu, namespace=locals())
         looptoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        self.cpu.set_future_value_int(0, 1)
-        self.cpu.execute_token(looptoken)
+        self.cpu.execute_token(looptoken, 1)
         assert self.cpu.get_latest_value_int(0) == 0
         assert self.cpu.get_latest_value_ref(1) == xptr
         excvalue = self.cpu.grab_exc_value()
         assert not excvalue
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.execute_token(looptoken)
+        self.cpu.execute_token(looptoken, 0)
         assert self.cpu.get_latest_value_int(0) == 1
         excvalue = self.cpu.grab_exc_value()
         assert not excvalue
@@ -1725,8 +1695,7 @@
         loop = parse(ops, self.cpu, namespace=locals())
         looptoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        self.cpu.set_future_value_int(0, 1)
-        self.cpu.execute_token(looptoken)
+        self.cpu.execute_token(looptoken, 1)
         assert self.cpu.get_latest_value_int(0) == 1
         excvalue = self.cpu.grab_exc_value()
         assert excvalue == yptr
@@ -1744,13 +1713,11 @@
         loop = parse(ops, self.cpu, namespace=locals())
         looptoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        self.cpu.set_future_value_int(0, 1)
-        self.cpu.execute_token(looptoken)
+        self.cpu.execute_token(looptoken, 1)
         assert self.cpu.get_latest_value_int(0) == 1
         excvalue = self.cpu.grab_exc_value()
         assert excvalue == xptr
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.execute_token(looptoken)
+        self.cpu.execute_token(looptoken, 0)
         assert self.cpu.get_latest_value_int(0) == 0
         excvalue = self.cpu.grab_exc_value()
         assert not excvalue
@@ -1922,16 +1889,12 @@
         ops[2].setfailargs([i1, i0])
         looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1], ops, looptoken)
-        self.cpu.set_future_value_int(0, 20)
-        self.cpu.set_future_value_int(1, 0)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 20, 0)
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == 20
         assert values == []
 
-        self.cpu.set_future_value_int(0, 10)
-        self.cpu.set_future_value_int(1, 1)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 10, 1)
         assert fail.identifier == 1
         assert self.cpu.get_latest_value_int(0) == 1
         assert self.cpu.get_latest_value_int(1) == 10
@@ -1967,16 +1930,12 @@
         ops[2].setfailargs([i1, i2, i0])
         looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1], ops, looptoken)
-        self.cpu.set_future_value_int(0, 20)
-        self.cpu.set_future_value_int(1, 0)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 20, 0)
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == 42
         assert values == []
 
-        self.cpu.set_future_value_int(0, 10)
-        self.cpu.set_future_value_int(1, 1)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 10, 1)
         assert fail.identifier == 1
         assert self.cpu.get_latest_value_int(0) == 1
         assert self.cpu.get_latest_value_int(1) == 42
@@ -2013,17 +1972,13 @@
         ops[2].setfailargs([i1, f2, i0])
         looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1], ops, looptoken)
-        self.cpu.set_future_value_int(0, 20)
-        self.cpu.set_future_value_int(1, 0)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 20, 0)
         assert fail.identifier == 0
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 42.5
         assert values == []
 
-        self.cpu.set_future_value_int(0, 10)
-        self.cpu.set_future_value_int(1, 1)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 10, 1)
         assert fail.identifier == 1
         assert self.cpu.get_latest_value_int(0) == 1
         x = self.cpu.get_latest_value_float(1)
@@ -2058,8 +2013,7 @@
         ops[1].setfailargs([i1, i2])
         looptoken = JitCellToken()
         self.cpu.compile_loop([i1], ops, looptoken)
-        self.cpu.set_future_value_int(0, ord('G'))
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, ord('G'))
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == ord('g')
 
@@ -2118,12 +2072,12 @@
         ops[1].setfailargs([])
         looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1, i2, i3], ops, looptoken)
-        self.cpu.set_future_value_int(0, rffi.cast(lltype.Signed, raw))
-        self.cpu.set_future_value_int(1, 2)
-        self.cpu.set_future_value_int(2, 4)
-        self.cpu.set_future_value_int(3, rffi.cast(lltype.Signed, fn))
+        args = [rffi.cast(lltype.Signed, raw),
+                2,
+                4,
+                rffi.cast(lltype.Signed, fn)]
         assert glob.lst == []
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, *args)
         assert fail.identifier == 0
         assert len(glob.lst) > 0
         lltype.free(raw, flavor='raw')
@@ -2176,9 +2130,8 @@
         self.cpu.compile_loop([i1, i2], ops, looptoken)
 
         buffer = lltype.malloc(rffi.CCHARP.TO, buflen, flavor='raw')
-        self.cpu.set_future_value_int(0, buflen)
-        self.cpu.set_future_value_int(1, rffi.cast(lltype.Signed, buffer))
-        fail = self.cpu.execute_token(looptoken)
+        args = [buflen, rffi.cast(lltype.Signed, buffer)]
+        fail = self.cpu.execute_token(looptoken, *args)
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == len(cwd)
         assert rffi.charp2strn(buffer, buflen) == cwd
@@ -2197,9 +2150,7 @@
         looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1], ops, looptoken)
 
-        self.cpu.set_future_value_int(0, -42)
-        self.cpu.set_future_value_int(1, 9)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, -42, 9)
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == -42
         print 'step 1 ok'
@@ -2208,9 +2159,7 @@
         # mark as failing
         self.cpu.invalidate_loop(looptoken)
 
-        self.cpu.set_future_value_int(0, -42)
-        self.cpu.set_future_value_int(1, 9)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, -42, 9)
         assert fail is faildescr
         assert self.cpu.get_latest_value_int(0) == 9
         print 'step 2 ok'
@@ -2226,9 +2175,7 @@
         ops[0].setfailargs([])
         self.cpu.compile_bridge(faildescr, [i2], ops, looptoken)
 
-        self.cpu.set_future_value_int(0, -42)
-        self.cpu.set_future_value_int(1, 9)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, -42, 9)
         assert fail.identifier == 3
         assert self.cpu.get_latest_value_int(0) == 9
         print 'step 3 ok'
@@ -2237,9 +2184,7 @@
         # mark as failing again
         self.cpu.invalidate_loop(looptoken)
 
-        self.cpu.set_future_value_int(0, -42)
-        self.cpu.set_future_value_int(1, 9)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, -42, 9)
         assert fail is faildescr2
         print 'step 4 ok'
         print '-'*79
@@ -2448,9 +2393,8 @@
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
             lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
             EffectInfo.MOST_GENERAL)
-        for i in range(10):
-            self.cpu.set_future_value_int(i, i+1)
-        res = self.cpu.execute_token(looptoken)
+        args = [i+1 for i in range(10)]
+        res = self.cpu.execute_token(looptoken, *args)
         assert self.cpu.get_latest_value_int(0) == 55
         ops = '''
         [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]
@@ -2462,9 +2406,8 @@
         loop = parse(ops, namespace=locals())
         othertoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
-        for i in range(10):
-            self.cpu.set_future_value_int(i, i+1)
-        res = self.cpu.execute_token(othertoken)
+        args = [i+1 for i in range(10)]
+        res = self.cpu.execute_token(othertoken, *args)
         assert self.cpu.get_latest_value_int(0) == 13
         assert called
 
@@ -2499,9 +2442,9 @@
         looptoken = JitCellToken()
         looptoken.outermost_jitdriver_sd = FakeJitDriverSD()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.2))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(2.3))
-        res = self.cpu.execute_token(looptoken)
+        args = [longlong.getfloatstorage(1.2),
+                longlong.getfloatstorage(2.3)]
+        res = self.cpu.execute_token(looptoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 1.2 + 2.3
         ops = '''
@@ -2513,9 +2456,9 @@
         loop = parse(ops, namespace=locals())
         othertoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.2))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(3.2))
-        res = self.cpu.execute_token(othertoken)
+        args = [longlong.getfloatstorage(1.2),
+                longlong.getfloatstorage(3.2)]
+        res = self.cpu.execute_token(othertoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 13.5
         assert called
@@ -2526,9 +2469,9 @@
         try:
             othertoken = JitCellToken()
             self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
-            self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.2))
-            self.cpu.set_future_value_float(1, longlong.getfloatstorage(3.2))
-            res = self.cpu.execute_token(othertoken)
+            args = [longlong.getfloatstorage(1.2),
+                    longlong.getfloatstorage(3.2)]
+            res = self.cpu.execute_token(othertoken, *args)
             x = self.cpu.get_latest_value_float(0)
             assert longlong.getrealfloat(x) == 1.2 + 3.2
             assert not called
@@ -2589,9 +2532,9 @@
         looptoken = JitCellToken()
         looptoken.outermost_jitdriver_sd = FakeJitDriverSD()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.25))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(2.35))
-        res = self.cpu.execute_token(looptoken)
+        args = [longlong.getfloatstorage(1.25),
+                longlong.getfloatstorage(2.35)]
+        res = self.cpu.execute_token(looptoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 1.25 + 2.35
         assert not called
@@ -2607,9 +2550,9 @@
         self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
 
         # normal call_assembler: goes to looptoken
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.25))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(3.25))
-        res = self.cpu.execute_token(othertoken)
+        args = [longlong.getfloatstorage(1.25),
+                longlong.getfloatstorage(3.25)]
+        res = self.cpu.execute_token(othertoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 13.5
         assert called
@@ -2629,10 +2572,9 @@
         self.cpu.redirect_call_assembler(looptoken, looptoken2)
 
         # now, our call_assembler should go to looptoken2
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(6.0))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(1.5))
-                                                       # 6.0-1.5 == 1.25+3.25
-        res = self.cpu.execute_token(othertoken)
+        args = [longlong.getfloatstorage(6.0),
+                longlong.getfloatstorage(1.5)]         # 6.0-1.5 == 1.25+3.25
+        res = self.cpu.execute_token(othertoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 13.5
         assert called
@@ -2986,8 +2928,7 @@
         looptoken = JitCellToken()
         self.cpu.compile_loop(inputargs, operations, looptoken)
         # overflowing value:
-        self.cpu.set_future_value_int(0, sys.maxint // 4 + 1)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, sys.maxint // 4 + 1)
         assert fail.identifier == excdescr.identifier
 
     def test_compile_loop_with_target(self):
@@ -3014,8 +2955,7 @@
         operations[6].setfailargs([i1])
 
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 10
@@ -3027,8 +2967,7 @@
             ]
         self.cpu.compile_bridge(faildescr, inputargs, operations, looptoken)
         
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 3
         res = self.cpu.get_latest_value_int(0)
         assert res == -10
@@ -3108,13 +3047,13 @@
         self.cpu.compile_bridge(faildescr1, inputargs, operations, looptoken1)
 
         looptoken2 = JitCellToken()
-        inputargs = []
+        inputargs = [BoxInt()]
         operations = [
             ResOperation(rop.JUMP, [ConstInt(0)], None, descr=targettoken1),
             ]
         self.cpu.compile_loop(inputargs, operations, looptoken2)
 
-        fail = self.cpu.execute_token(looptoken2)
+        fail = self.cpu.execute_token(looptoken2, -9)
         assert fail.identifier == 42
 
 
diff --git a/pypy/jit/backend/test/test_random.py b/pypy/jit/backend/test/test_random.py
--- a/pypy/jit/backend/test/test_random.py
+++ b/pypy/jit/backend/test/test_random.py
@@ -6,6 +6,7 @@
 from pypy.jit.metainterp.history import BoxInt, ConstInt, JitCellToken
 from pypy.jit.metainterp.history import BoxPtr, ConstPtr, TargetToken
 from pypy.jit.metainterp.history import BoxFloat, ConstFloat, Const
+from pypy.jit.metainterp.history import INT, FLOAT
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.executor import execute_nonspec
 from pypy.jit.metainterp.resoperation import opname
@@ -616,8 +617,13 @@
             return self.loop._jitcelltoken
         if not hasattr(self, '_initialjumploop_celltoken'):
             self._initialjumploop_celltoken = JitCellToken()
-            self.cpu.compile_loop(self.startvars[:],
-                                  [ResOperation(rop.JUMP, self.startvars[:], None,
+            args = []
+            for box in self.startvars:
+                if box not in self.loop.inputargs:
+                    box = box.constbox()
+                args.append(box)
+            self.cpu.compile_loop(self.loop.inputargs,
+                                  [ResOperation(rop.JUMP, args, None,
                                                 descr=self.loop._targettoken)],
                                   self._initialjumploop_celltoken)
         return self._initialjumploop_celltoken
@@ -649,14 +655,8 @@
         exc = cpu.grab_exc_value()
         assert not exc
 
-        for i, box in enumerate(self.startvars):
-            if isinstance(box, BoxInt):
-                cpu.set_future_value_int(i, box.value)
-            elif isinstance(box, BoxFloat):
-                cpu.set_future_value_float(i, box.value)
-            else:
-                raise NotImplementedError(box)
-        fail = cpu.execute_token(self.runjitcelltoken())
+        arguments = [box.value for box in self.loop.inputargs]
+        fail = cpu.execute_token(self.runjitcelltoken(), *arguments)
         assert fail is self.should_fail_by.getdescr()
         for i, v in enumerate(self.get_fail_args()):
             if isinstance(v, (BoxFloat, ConstFloat)):
@@ -717,10 +717,21 @@
             # to build_bridge().)
 
             # First make up the other loop...
-            subset = bridge_builder.subset_of_intvars(r)
-            subset = [i for i in subset if i in fail_args]
-            if len(subset) == 0:
-                return False
+            #
+            # New restriction: must have the same argument count and types
+            # as the original loop
+            subset = []
+            for box in self.loop.inputargs:
+                srcbox = r.choice(fail_args)
+                if srcbox.type != box.type:
+                    if box.type == INT:
+                        srcbox = ConstInt(r.random_integer())
+                    elif box.type == FLOAT:
+                        srcbox = ConstFloat(r.random_float_storage())
+                    else:
+                        raise AssertionError(box.type)
+                subset.append(srcbox)
+            #
             args = [x.clonebox() for x in subset]
             rl = RandomLoop(self.builder.cpu, self.builder.fork,
                                      r, args)
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -38,6 +38,7 @@
 from pypy.jit.backend.x86.jump import remap_frame_layout
 from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.codewriter import longlong
+from pypy.rlib.rarithmetic import intmask
 
 # darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
 # better safe than sorry
@@ -309,12 +310,11 @@
                 mc.MOVSD_sx(8*i, i)     # xmm0 to xmm7
         #
         if IS_X86_32:
-            mc.LEA_rb(eax.value, +8)
             stack_size += 2*WORD
             mc.PUSH_r(eax.value)        # alignment
-            mc.PUSH_r(eax.value)
+            mc.PUSH_r(esp.value)
         elif IS_X86_64:
-            mc.LEA_rb(edi.value, +16)
+            mc.MOV_rr(edi.value, esp.value)
         #
         # esp is now aligned to a multiple of 16 again
         mc.CALL(imm(slowpathaddr))
@@ -325,7 +325,7 @@
         jnz_location = mc.get_relative_pos()
         #
         if IS_X86_32:
-            mc.ADD_ri(esp.value, 2*WORD)
+            mc.ADD_ri(esp.value, 2*WORD)    # cancel the two PUSHes above
         elif IS_X86_64:
             # restore the registers
             for i in range(7, -1, -1):
@@ -421,10 +421,8 @@
 
     def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
         '''adds the following attributes to looptoken:
-               _x86_loop_code       (an integer giving an address)
-               _x86_bootstrap_code  (an integer giving an address)
-               _x86_direct_bootstrap_code  ( "    "     "    "   )
-               _x86_arglocs
+               _x86_function_addr   (address of the generated func, as an int)
+               _x86_loop_code       (debug: addr of the start of the ResOps)
                _x86_debug_checksum
         '''
         # XXX this function is too longish and contains some code
@@ -445,12 +443,12 @@
             operations = self._inject_debugging_code(looptoken, operations)
 
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
-        arglocs, operations = regalloc.prepare_loop(inputargs, operations,
-                                                    looptoken, clt.allgcrefs)
-        looptoken._x86_arglocs = arglocs
-
-        bootstrappos = self.mc.get_relative_pos()
-        stackadjustpos = self._assemble_bootstrap_code(inputargs, arglocs)
+        #
+        self._call_header_with_stack_check()
+        stackadjustpos = self._patchable_stackadjust()
+        clt._debug_nbargs = len(inputargs)
+        operations = regalloc.prepare_loop(inputargs, operations,
+                                           looptoken, clt.allgcrefs)
         looppos = self.mc.get_relative_pos()
         looptoken._x86_loop_code = looppos
         clt.frame_depth = -1     # temporarily
@@ -458,19 +456,17 @@
         frame_depth, param_depth = self._assemble(regalloc, operations)
         clt.frame_depth = frame_depth
         clt.param_depth = param_depth
-
-        directbootstrappos = self.mc.get_relative_pos()
-        self._assemble_bootstrap_direct_call(arglocs, looppos,
-                                             frame_depth+param_depth)
+        #
+        size_excluding_failure_stuff = self.mc.get_relative_pos()
         self.write_pending_failure_recoveries()
-        fullsize = self.mc.get_relative_pos()
+        full_size = self.mc.get_relative_pos()
         #
         rawstart = self.materialize_loop(looptoken)
         debug_start("jit-backend-addr")
         debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
             looptoken.number, loopname,
             rawstart + looppos,
-            rawstart + directbootstrappos,
+            rawstart + size_excluding_failure_stuff,
             rawstart))
         debug_stop("jit-backend-addr")
         self._patch_stackadjust(rawstart + stackadjustpos,
@@ -481,18 +477,17 @@
         if not we_are_translated():
             # used only by looptoken.dump() -- useful in tests
             looptoken._x86_rawstart = rawstart
-            looptoken._x86_fullsize = fullsize
+            looptoken._x86_fullsize = full_size
             looptoken._x86_ops_offset = ops_offset
+        looptoken._x86_function_addr = rawstart
 
-        looptoken._x86_bootstrap_code = rawstart + bootstrappos
-        looptoken._x86_direct_bootstrap_code = rawstart + directbootstrappos
         self.fixup_target_tokens(rawstart)
         self.teardown()
         # oprofile support
         if self.cpu.profile_agent is not None:
             name = "Loop # %s: %s" % (looptoken.number, loopname)
             self.cpu.profile_agent.native_code_written(name,
-                                                       rawstart, fullsize)
+                                                       rawstart, full_size)
         return ops_offset
 
     def assemble_bridge(self, faildescr, inputargs, operations,
@@ -802,152 +797,21 @@
             self.mc.MOV_ri(ebx.value, rst)           # MOV ebx, rootstacktop
             self.mc.SUB_mi8((ebx.value, 0), 2*WORD)  # SUB [ebx], 2*WORD
 
-    def _assemble_bootstrap_direct_call(self, arglocs, jmppos, stackdepth):
-        if IS_X86_64:
-            return self._assemble_bootstrap_direct_call_64(arglocs, jmppos, stackdepth)
-        # XXX pushing ebx esi and edi is a bit pointless, since we store
-        #     all regsiters anyway, for the case of guard_not_forced
-        # XXX this can be improved greatly. Right now it'll behave like
-        #     a normal call
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header_with_stack_check()
-        self.mc.LEA_rb(esp.value, self._get_offset_of_ebp_from_esp(stackdepth))
-        offset = 2 * WORD
-        tmp = eax
-        xmmtmp = xmm0
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is not None:
-                if isinstance(loc, RegLoc):
-                    assert not loc.is_xmm
-                    self.mc.MOV_rb(loc.value, offset)
-                else:
-                    self.mc.MOV_rb(tmp.value, offset)
-                    self.mc.MOV(loc, tmp)
-                offset += WORD
-            loc = floatlocs[i]
-            if loc is not None:
-                if isinstance(loc, RegLoc):
-                    assert loc.is_xmm
-                    self.mc.MOVSD_xb(loc.value, offset)
-                else:
-                    self.mc.MOVSD_xb(xmmtmp.value, offset)
-                    assert isinstance(loc, StackLoc)
-                    self.mc.MOVSD_bx(loc.value, xmmtmp.value)
-                offset += 2 * WORD
-        endpos = self.mc.get_relative_pos() + 5
-        self.mc.JMP_l(jmppos - endpos)
-        assert endpos == self.mc.get_relative_pos()
-
-    def _assemble_bootstrap_direct_call_64(self, arglocs, jmppos, stackdepth):
-        # XXX: Very similar to _emit_call_64
-
-        src_locs = []
-        dst_locs = []
-        xmm_src_locs = []
-        xmm_dst_locs = []
-        get_from_stack = []
-
-        # In reverse order for use with pop()
-        unused_gpr = [r9, r8, ecx, edx, esi, edi]
-        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
-
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header_with_stack_check()
-        self.mc.LEA_rb(esp.value, self._get_offset_of_ebp_from_esp(stackdepth))
-
-        # The lists are padded with Nones
-        assert len(nonfloatlocs) == len(floatlocs)
-
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is not None:
-                if len(unused_gpr) > 0:
-                    src_locs.append(unused_gpr.pop())
-                    dst_locs.append(loc)
-                else:
-                    get_from_stack.append((loc, False))
-
-            floc = floatlocs[i]
-            if floc is not None:
-                if len(unused_xmm) > 0:
-                    xmm_src_locs.append(unused_xmm.pop())
-                    xmm_dst_locs.append(floc)
-                else:
-                    get_from_stack.append((floc, True))
-
-        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs, X86_64_XMM_SCRATCH_REG)
-
-        for i in range(len(get_from_stack)):
-            loc, is_xmm = get_from_stack[i]
-            if is_xmm:
-                self.mc.MOVSD_xb(X86_64_XMM_SCRATCH_REG.value, (2 + i) * WORD)
-                self.mc.MOVSD(loc, X86_64_XMM_SCRATCH_REG)
-            else:
-                self.mc.MOV_rb(X86_64_SCRATCH_REG.value, (2 + i) * WORD)
-                # XXX: We're assuming that "loc" won't require regloc to
-                # clobber the scratch register
-                self.mc.MOV(loc, X86_64_SCRATCH_REG)
-
-        endpos = self.mc.get_relative_pos() + 5
-        self.mc.JMP_l(jmppos - endpos)
-        assert endpos == self.mc.get_relative_pos()
-
     def redirect_call_assembler(self, oldlooptoken, newlooptoken):
         # some minimal sanity checking
-        oldnonfloatlocs, oldfloatlocs = oldlooptoken._x86_arglocs
-        newnonfloatlocs, newfloatlocs = newlooptoken._x86_arglocs
-        assert len(oldnonfloatlocs) == len(newnonfloatlocs)
-        assert len(oldfloatlocs) == len(newfloatlocs)
+        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
+        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
+        assert old_nbargs == new_nbargs
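-        # we overwrite the instructions at the old _x86_direct_bootstrap_code
-        # to start with a JMP to the new _x86_direct_bootstrap_code.
+        # we overwrite the instructions at the old _x86_function_addr
+        # to start with a JMP to the new _x86_function_addr.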
         # Ideally we should rather patch all existing CALLs, but well.
-        oldadr = oldlooptoken._x86_direct_bootstrap_code
-        target = newlooptoken._x86_direct_bootstrap_code
+        oldadr = oldlooptoken._x86_function_addr
+        target = newlooptoken._x86_function_addr
         mc = codebuf.MachineCodeBlockWrapper()
         mc.JMP(imm(target))
+        assert mc.get_relative_pos() <= 13  # keep in sync with prepare_loop()
         mc.copy_to_raw_memory(oldadr)
 
-    def _assemble_bootstrap_code(self, inputargs, arglocs):
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header()
-        stackadjustpos = self._patchable_stackadjust()
-        tmp = eax
-        xmmtmp = xmm0
-        self.mc.begin_reuse_scratch_register()
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is None:
-                continue
-            if isinstance(loc, RegLoc):
-                target = loc
-            else:
-                target = tmp
-            if inputargs[i].type == REF:
-                adr = self.fail_boxes_ptr.get_addr_for_num(i)
-                self.mc.MOV(target, heap(adr))
-                self.mc.MOV(heap(adr), imm0)
-            else:
-                adr = self.fail_boxes_int.get_addr_for_num(i)
-                self.mc.MOV(target, heap(adr))
-            if target is not loc:
-                assert isinstance(loc, StackLoc)
-                self.mc.MOV_br(loc.value, target.value)
-        for i in range(len(floatlocs)):
-            loc = floatlocs[i]
-            if loc is None:
-                continue
-            adr = self.fail_boxes_float.get_addr_for_num(i)
-            if isinstance(loc, RegLoc):
-                self.mc.MOVSD(loc, heap(adr))
-            else:
-                self.mc.MOVSD(xmmtmp, heap(adr))
-                assert isinstance(loc, StackLoc)
-                self.mc.MOVSD_bx(loc.value, xmmtmp.value)
-        self.mc.end_reuse_scratch_register()
-        return stackadjustpos
-
     def dump(self, text):
         if not self.verbose:
             return
@@ -974,7 +838,7 @@
         if isinstance(loc, RegLoc) and loc.is_xmm:
             self.mc.SUB_ri(esp.value, 8)   # = size of doubles
             self.mc.MOVSD_sx(0, loc.value)
-        elif WORD == 4 and isinstance(loc, StackLoc) and loc.width == 8:
+        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
             # XXX evil trick
             self.mc.PUSH_b(get_ebp_ofs(loc.position))
             self.mc.PUSH_b(get_ebp_ofs(loc.position + 1))
@@ -985,13 +849,25 @@
         if isinstance(loc, RegLoc) and loc.is_xmm:
             self.mc.MOVSD_xs(loc.value, 0)
             self.mc.ADD_ri(esp.value, 8)   # = size of doubles
-        elif WORD == 4 and isinstance(loc, StackLoc) and loc.width == 8:
+        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
             # XXX evil trick
             self.mc.POP_b(get_ebp_ofs(loc.position + 1))
             self.mc.POP_b(get_ebp_ofs(loc.position))
         else:
             self.mc.POP(loc)
 
+    def regalloc_immedmem2mem(self, from_loc, to_loc):
+        # move a ConstFloatLoc directly to a StackLoc, as two MOVs
+        # (even on x86-64, because the immediates are encoded as 32 bits)
+        assert isinstance(from_loc, ConstFloatLoc)
+        assert isinstance(to_loc,   StackLoc)
+        low_part  = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[0]
+        high_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[1]
+        low_part  = intmask(low_part)
+        high_part = intmask(high_part)
+        self.mc.MOV_bi(to_loc.value,     low_part)
+        self.mc.MOV_bi(to_loc.value + 4, high_part)
+
     def regalloc_perform(self, op, arglocs, resloc):
         genop_list[op.getopnum()](self, op, arglocs, resloc)
 
@@ -1143,18 +1019,18 @@
                     self.mc.MOVSD_sx(p, loc.value)
                 else:
                     self.mc.MOV_sr(p, loc.value)
-            p += round_up_to_4(loc.width)
+            p += loc.get_width()
         p = 0
         for i in range(start, n):
             loc = arglocs[i]
             if not isinstance(loc, RegLoc):
-                if loc.width == 8:
+                if loc.get_width() == 8:
                     self.mc.MOVSD(xmm0, loc)
                     self.mc.MOVSD_sx(p, xmm0.value)
                 else:
                     self.mc.MOV(tmp, loc)
                     self.mc.MOV_sr(p, tmp.value)
-            p += round_up_to_4(loc.width)
+            p += loc.get_width()
         self._regalloc.reserve_param(p//WORD)
         # x is a location
         self.mc.CALL(x)
@@ -1891,10 +1767,10 @@
     DESCR_INT       = 0x01
     DESCR_FLOAT     = 0x02
     DESCR_SPECIAL   = 0x03
-    # XXX: 4*8 works on i386, should we optimize for that case?
-    CODE_FROMSTACK  = 4*16
+    CODE_FROMSTACK  = 4 * (8 + 8*IS_X86_64)
     CODE_STOP       = 0 | DESCR_SPECIAL
     CODE_HOLE       = 4 | DESCR_SPECIAL
+    CODE_INPUTARG   = 8 | DESCR_SPECIAL
 
     def write_failure_recovery_description(self, mc, failargs, locs):
         for i in range(len(failargs)):
@@ -1910,7 +1786,11 @@
                     raise AssertionError("bogus kind")
                 loc = locs[i]
                 if isinstance(loc, StackLoc):
-                    n = self.CODE_FROMSTACK//4 + loc.position
+                    pos = loc.position
+                    if pos < 0:
+                        mc.writechar(chr(self.CODE_INPUTARG))
+                        pos = ~pos
+                    n = self.CODE_FROMSTACK//4 + pos
                 else:
                     assert isinstance(loc, RegLoc)
                     n = loc.value
@@ -1930,6 +1810,7 @@
         descr_to_box_type = [REF, INT, FLOAT]
         bytecode = rffi.cast(rffi.UCHARP, bytecode)
         arglocs = []
+        code_inputarg = False
         while 1:
             # decode the next instruction from the bytecode
             code = rffi.cast(lltype.Signed, bytecode[0])
@@ -1948,11 +1829,17 @@
                             break
                 kind = code & 3
                 code = (code - self.CODE_FROMSTACK) >> 2
+                if code_inputarg:
+                    code = ~code
+                    code_inputarg = False
                 loc = X86FrameManager.frame_pos(code, descr_to_box_type[kind])
             elif code == self.CODE_STOP:
                 break
             elif code == self.CODE_HOLE:
                 continue
+            elif code == self.CODE_INPUTARG:
+                code_inputarg = True
+                continue
             else:
                 # 'code' identifies a register
                 kind = code & 3
@@ -1968,6 +1855,7 @@
     def grab_frame_values(self, bytecode, frame_addr, allregisters):
         # no malloc allowed here!!
         self.fail_ebp = allregisters[16 + ebp.value]
+        code_inputarg = False
         num = 0
         value_hi = 0
         while 1:
@@ -1988,6 +1876,9 @@
                 # load the value from the stack
                 kind = code & 3
                 code = (code - self.CODE_FROMSTACK) >> 2
+                if code_inputarg:
+                    code = ~code
+                    code_inputarg = False
                 stackloc = frame_addr + get_ebp_ofs(code)
                 value = rffi.cast(rffi.LONGP, stackloc)[0]
                 if kind == self.DESCR_FLOAT and WORD == 4:
@@ -2000,6 +1891,9 @@
                     if code == self.CODE_HOLE:
                         num += 1
                         continue
+                    if code == self.CODE_INPUTARG:
+                        code_inputarg = True
+                        continue
                     assert code == self.CODE_STOP
                     break
                 code >>= 2
@@ -2104,9 +1998,9 @@
         # returns in eax the fail_index
 
         # now we return from the complete frame, which starts from
-        # _assemble_bootstrap_code().  The LEA in _call_footer below throws
-        # away most of the frame, including all the PUSHes that we did just
-        # above.
+        # _call_header_with_stack_check().  The LEA in _call_footer below
+        # throws away most of the frame, including all the PUSHes that we
+        # did just above.
 
         self._call_footer()
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
@@ -2189,7 +2083,7 @@
                         argtypes=op.getdescr().get_arg_types(),
                         callconv=op.getdescr().get_call_conv())
 
-        if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.width == 8:
+        if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.type == FLOAT:
             # a float or a long long return
             if op.getdescr().get_return_type() == 'L':
                 self.mc.MOV_br(resloc.value, eax.value)      # long long
@@ -2354,10 +2248,10 @@
         self.mc.MOV_bi(FORCE_INDEX_OFS, fail_index)
         descr = op.getdescr()
         assert isinstance(descr, JitCellToken)
-        assert len(arglocs) - 2 == len(descr._x86_arglocs[0])
+        assert len(arglocs) - 2 == descr.compiled_loop_token._debug_nbargs
         #
-        # Write a call to the direct_bootstrap_code of the target assembler
-        self._emit_call(fail_index, imm(descr._x86_direct_bootstrap_code),
+        # Write a call to the target assembler
+        self._emit_call(fail_index, imm(descr._x86_function_addr),
                         arglocs, 2, tmp=eax)
         if op.result is None:
             assert result_loc is None
@@ -2588,6 +2482,14 @@
                     self.gcrootmap_retaddr_forced = -1
 
     def closing_jump(self, target_token):
+        # The backend's logic assumes that the target code is in a piece of
+        # assembler that was also called with the same number of arguments,
+        # so that the locations [ebp+8..] of the input arguments are valid
+        # stack locations both before and after the jump.
+        my_nbargs = self.current_clt._debug_nbargs
+        target_nbargs = target_token._x86_clt._debug_nbargs
+        assert my_nbargs == target_nbargs
+        #
         target = target_token._x86_loop_code
         if target_token in self.target_tokens_currently_compiling:
             curpos = self.mc.get_relative_pos() + 5
@@ -2666,11 +2568,6 @@
         num = getattr(rop, opname.upper())
         genop_list[num] = value
 
-def round_up_to_4(size):
-    if size < 4:
-        return 4
-    return size
-
 # XXX: ri386 migration shims:
 def addr_add(reg_or_imm1, reg_or_imm2, offset=0, scale=0):
     return AddressLoc(reg_or_imm1, reg_or_imm2, scale, offset)
diff --git a/pypy/jit/backend/x86/jump.py b/pypy/jit/backend/x86/jump.py
--- a/pypy/jit/backend/x86/jump.py
+++ b/pypy/jit/backend/x86/jump.py
@@ -1,6 +1,6 @@
 import sys
 from pypy.tool.pairtype import extendabletype
-from pypy.jit.backend.x86.regloc import ImmedLoc, StackLoc
+from pypy.jit.backend.x86.regloc import ImmediateAssemblerLocation, StackLoc
 
 def remap_frame_layout(assembler, src_locations, dst_locations, tmpreg):
     pending_dests = len(dst_locations)
@@ -12,7 +12,7 @@
         srccount[key] = 0
     for i in range(len(dst_locations)):
         src = src_locations[i]
-        if isinstance(src, ImmedLoc):
+        if isinstance(src, ImmediateAssemblerLocation):
             continue
         key = src._getregkey()
         if key in srccount:
@@ -31,7 +31,7 @@
                 srccount[key] = -1       # means "it's done"
                 pending_dests -= 1
                 src = src_locations[i]
-                if not isinstance(src, ImmedLoc):
+                if not isinstance(src, ImmediateAssemblerLocation):
                     key = src._getregkey()
                     if key in srccount:
                         srccount[key] -= 1
@@ -66,6 +66,13 @@
 
 def _move(assembler, src, dst, tmpreg):
     if dst.is_memory_reference() and src.is_memory_reference():
+        if isinstance(src, ImmediateAssemblerLocation):
+            assembler.regalloc_immedmem2mem(src, dst)
+            return
+        if tmpreg is None:
+            assembler.regalloc_push(src)
+            assembler.regalloc_pop(dst)
+            return
         assembler.regalloc_mov(src, tmpreg)
         src = tmpreg
     assembler.regalloc_mov(src, dst)
@@ -87,7 +94,7 @@
         dstloc = dst_locations2[i]
         if isinstance(loc, StackLoc):
             key = loc._getregkey()
-            if (key in dst_keys or (loc.width > WORD and
+            if (key in dst_keys or (loc.get_width() > WORD and
                                     (key + WORD) in dst_keys)):
                 assembler.regalloc_push(loc)
                 extrapushes.append(dstloc)
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -28,7 +28,7 @@
 class X86RegisterManager(RegisterManager):
 
     box_types = [INT, REF]
-    all_regs = [eax, ecx, edx, ebx, esi, edi]
+    all_regs = [ecx, eax, edx, ebx, esi, edi]
     no_lower_byte_regs = [esi, edi]
     save_around_call_regs = [eax, edx, ecx]
     frame_reg = ebp
@@ -60,7 +60,7 @@
 
 class X86_64_RegisterManager(X86RegisterManager):
     # r11 omitted because it's used as scratch
-    all_regs = [eax, ecx, edx, ebx, esi, edi, r8, r9, r10, r12, r13, r14, r15]
+    all_regs = [ecx, eax, edx, ebx, esi, edi, r8, r9, r10, r12, r13, r14, r15]
     no_lower_byte_regs = []
     save_around_call_regs = [eax, ecx, edx, esi, edi, r8, r9, r10]
 
@@ -130,9 +130,9 @@
     @staticmethod
     def frame_pos(i, box_type):
         if IS_X86_32 and box_type == FLOAT:
-            return StackLoc(i, get_ebp_ofs(i+1), 2, box_type)
+            return StackLoc(i, get_ebp_ofs(i+1), box_type)
         else:
-            return StackLoc(i, get_ebp_ofs(i), 1, box_type)
+            return StackLoc(i, get_ebp_ofs(i), box_type)
     @staticmethod
     def frame_size(box_type):
         if IS_X86_32 and box_type == FLOAT:
@@ -165,6 +165,7 @@
         self.jump_target_descr = None
         self.close_stack_struct = 0
         self.final_jump_op = None
+        self.min_bytes_before_label = 0
 
     def _prepare(self, inputargs, operations, allgcrefs):
         self.fm = X86FrameManager()
@@ -173,22 +174,26 @@
         operations = cpu.gc_ll_descr.rewrite_assembler(cpu, operations,
                                                        allgcrefs)
         # compute longevity of variables
-        longevity, useful = self._compute_vars_longevity(inputargs, operations)
-        self.longevity = longevity
-        self.rm = gpr_reg_mgr_cls(longevity,
+        self._compute_vars_longevity(inputargs, operations)
+        self.rm = gpr_reg_mgr_cls(self.longevity,
                                   frame_manager = self.fm,
                                   assembler = self.assembler)
-        self.xrm = xmm_reg_mgr_cls(longevity, frame_manager = self.fm,
+        self.xrm = xmm_reg_mgr_cls(self.longevity, frame_manager = self.fm,
                                    assembler = self.assembler)
-        return operations, useful
+        return operations
 
     def prepare_loop(self, inputargs, operations, looptoken, allgcrefs):
-        operations, useful = self._prepare(inputargs, operations, allgcrefs)
-        return self._process_inputargs(inputargs, useful), operations
+        operations = self._prepare(inputargs, operations, allgcrefs)
+        self._set_initial_bindings(inputargs)
+        # note: we need to make a copy of inputargs because possibly_free_vars
+        # is also used on op args, which is a non-resizable list
+        self.possibly_free_vars(list(inputargs))
+        self.min_bytes_before_label = 13
+        return operations
 
     def prepare_bridge(self, prev_depths, inputargs, arglocs, operations,
                        allgcrefs):
-        operations, _ = self._prepare(inputargs, operations, allgcrefs)
+        operations = self._prepare(inputargs, operations, allgcrefs)
         self._update_bindings(arglocs, inputargs)
         self.param_depth = prev_depths[1]
         return operations
@@ -196,46 +201,56 @@
     def reserve_param(self, n):
         self.param_depth = max(self.param_depth, n)
 
-    def _process_inputargs(self, inputargs, useful):
-        # XXX we can sort out here by longevity if we need something
-        # more optimal
-        floatlocs = [None] * len(inputargs)
-        nonfloatlocs = [None] * len(inputargs)
-        # Don't use all_regs[0] for passing arguments around a loop.
-        # Must be kept in sync with consider_jump().
-        # XXX this should probably go to llsupport/regalloc.py
-        xmmtmp = self.xrm.free_regs.pop(0)
-        tmpreg = self.rm.free_regs.pop(0)
-        assert tmpreg == X86RegisterManager.all_regs[0]
-        assert xmmtmp == X86XMMRegisterManager.all_regs[0]
-        for i in range(len(inputargs)):
-            arg = inputargs[i]
-            assert not isinstance(arg, Const)
-            reg = None
-            if self.longevity[arg][1] > -1 and arg in useful:
-                if arg.type == FLOAT:
-                    # xxx is it really a good idea?  at the first CALL they
-                    # will all be flushed anyway
-                    reg = self.xrm.try_allocate_reg(arg)
+    def _set_initial_bindings(self, inputargs):
+        if IS_X86_64:
+            inputargs = self._set_initial_bindings_regs_64(inputargs)
+        #                   ...
+        # stack layout:     arg2
+        #                   arg1
+        #                   arg0
+        #                   return address
+        #                   saved ebp        <-- ebp points here
+        #                   ...
+        cur_frame_pos = - 1 - FRAME_FIXED_SIZE
+        assert get_ebp_ofs(cur_frame_pos-1) == 2*WORD
+        assert get_ebp_ofs(cur_frame_pos-2) == 3*WORD
+        #
+        for box in inputargs:
+            assert isinstance(box, Box)
+            #
+            if IS_X86_32 and box.type == FLOAT:
+                cur_frame_pos -= 2
+            else:
+                cur_frame_pos -= 1
+            loc = self.fm.frame_pos(cur_frame_pos, box.type)
+            self.fm.set_binding(box, loc)
+
+    def _set_initial_bindings_regs_64(self, inputargs):
+        # In reverse order for use with pop()
+        unused_gpr = [r9, r8, ecx, edx, esi, edi]
+        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
+        #
+        pass_on_stack = []
+        #
+        for box in inputargs:
+            assert isinstance(box, Box)
+            #
+            if box.type == FLOAT:
+                if len(unused_xmm) > 0:
+                    ask = unused_xmm.pop()
+                    got = self.xrm.try_allocate_reg(box, selected_reg=ask)
+                    assert ask == got
                 else:
-                    reg = self.rm.try_allocate_reg(arg)
-            if reg:
-                loc = reg
+                    pass_on_stack.append(box)
             else:
-                loc = self.fm.loc(arg)
-            if arg.type == FLOAT:
-                floatlocs[i] = loc
-            else:
-                nonfloatlocs[i] = loc
-            # otherwise we have it saved on stack, so no worry
-        self.rm.free_regs.insert(0, tmpreg)
-        self.xrm.free_regs.insert(0, xmmtmp)
-        assert tmpreg not in nonfloatlocs
-        assert xmmtmp not in floatlocs
-        # note: we need to make a copy of inputargs because possibly_free_vars
-        # is also used on op args, which is a non-resizable list
-        self.possibly_free_vars(list(inputargs))
-        return nonfloatlocs, floatlocs
+                if len(unused_gpr) > 0:
+                    ask = unused_gpr.pop()
+                    got = self.rm.try_allocate_reg(box, selected_reg=ask)
+                    assert ask == got
+                else:
+                    pass_on_stack.append(box)
+        #
+        return pass_on_stack
 
     def possibly_free_var(self, var):
         if var.type == FLOAT:
@@ -446,8 +461,15 @@
             i += 1
         assert not self.rm.reg_bindings
         assert not self.xrm.reg_bindings
+        self.flush_loop()
         self.assembler.mc.mark_op(None) # end of the loop
 
+    def flush_loop(self):
+        # rare case: if the loop is too short, pad with NOPs
+        mc = self.assembler.mc
+        while mc.get_relative_pos() < self.min_bytes_before_label:
+            mc.NOP()
+
     def _compute_vars_longevity(self, inputargs, operations):
         # compute a dictionary that maps variables to index in
         # operations that is a "last-time-seen"
@@ -458,7 +480,7 @@
         # only to guard operations or to jump or to finish
         produced = {}
         last_used = {}
-        useful = {}
+        last_real_usage = {}
         for i in range(len(operations)-1, -1, -1):
             op = operations[i]
             if op.result:
@@ -469,10 +491,13 @@
             opnum = op.getopnum()
             for j in range(op.numargs()):
                 arg = op.getarg(j)
-                if opnum != rop.JUMP and opnum != rop.FINISH:
-                    useful[arg] = None
-                if isinstance(arg, Box) and arg not in last_used:
+                if not isinstance(arg, Box):
+                    continue
+                if arg not in last_used:
                     last_used[arg] = i
+                if opnum != rop.JUMP and opnum != rop.LABEL:
+                    if arg not in last_real_usage:
+                        last_real_usage[arg] = i
             if op.is_guard():
                 for arg in op.getfailargs():
                     if arg is None: # hole
@@ -480,7 +505,8 @@
                     assert isinstance(arg, Box)
                     if arg not in last_used:
                         last_used[arg] = i
-
+        self.last_real_usage = last_real_usage
+        #
         longevity = {}
         for arg in produced:
             if arg in last_used:
@@ -496,7 +522,7 @@
                 longevity[arg] = (0, last_used[arg])
                 del last_used[arg]
         assert len(last_used) == 0
-        return longevity, useful
+        self.longevity = longevity
 
     def loc(self, v):
         if v is None: # xxx kludgy
@@ -1344,51 +1370,51 @@
         #   we would like the boxes to be after the jump.
 
     def _compute_hint_frame_locations_from_descr(self, descr):
-        nonfloatlocs, floatlocs = descr._x86_arglocs
+        arglocs = descr._x86_arglocs
         jump_op = self.final_jump_op
-        assert len(nonfloatlocs) == jump_op.numargs()
+        assert len(arglocs) == jump_op.numargs()
         for i in range(jump_op.numargs()):
             box = jump_op.getarg(i)
             if isinstance(box, Box):
-                loc = nonfloatlocs[i]
+                loc = arglocs[i]
                 if isinstance(loc, StackLoc):
-                    assert box.type != FLOAT
                     self.fm.hint_frame_locations[box] = loc
-                else:
-                    loc = floatlocs[i]
-                    if isinstance(loc, StackLoc):
-                        assert box.type == FLOAT
-                        self.fm.hint_frame_locations[box] = loc
 
     def consider_jump(self, op):
         assembler = self.assembler
         assert self.jump_target_descr is None
         descr = op.getdescr()
         assert isinstance(descr, TargetToken)
-        nonfloatlocs, floatlocs = descr._x86_arglocs
+        arglocs = descr._x86_arglocs
         self.jump_target_descr = descr
-        # compute 'tmploc' to be all_regs[0] by spilling what is there
-        box = TempBox()
-        box1 = TempBox()
-        tmpreg = X86RegisterManager.all_regs[0]
-        tmploc = self.rm.force_allocate_reg(box, selected_reg=tmpreg)
-        xmmtmp = X86XMMRegisterManager.all_regs[0]
-        self.xrm.force_allocate_reg(box1, selected_reg=xmmtmp)
         # Part about non-floats
-        # XXX we don't need a copy, we only just the original list
-        src_locations1 = [self.loc(op.getarg(i)) for i in range(op.numargs())
-                         if op.getarg(i).type != FLOAT]
-        assert tmploc not in nonfloatlocs
-        dst_locations1 = [loc for loc in nonfloatlocs if loc is not None]
+        src_locations1 = []
+        dst_locations1 = []
         # Part about floats
-        src_locations2 = [self.loc(op.getarg(i)) for i in range(op.numargs())
-                         if op.getarg(i).type == FLOAT]
-        dst_locations2 = [loc for loc in floatlocs if loc is not None]
+        src_locations2 = []
+        dst_locations2 = []
+        # Build the four lists
+        for i in range(op.numargs()):
+            box = op.getarg(i)
+            src_loc = self.loc(box)
+            dst_loc = arglocs[i]
+            if box.type != FLOAT:
+                src_locations1.append(src_loc)
+                dst_locations1.append(dst_loc)
+            else:
+                src_locations2.append(src_loc)
+                dst_locations2.append(dst_loc)
+        # Do we have a temp var?
+        if IS_X86_64:
+            tmpreg = X86_64_SCRATCH_REG
+            xmmtmp = X86_64_XMM_SCRATCH_REG
+        else:
+            tmpreg = None
+            xmmtmp = None
+        # Do the remapping
         remap_frame_layout_mixed(assembler,
-                                 src_locations1, dst_locations1, tmploc,
+                                 src_locations1, dst_locations1, tmpreg,
                                  src_locations2, dst_locations2, xmmtmp)
-        self.rm.possibly_free_var(box)
-        self.xrm.possibly_free_var(box1)
         self.possibly_free_vars_for_op(op)
         assembler.closing_jump(self.jump_target_descr)
 
@@ -1440,23 +1466,20 @@
         self.rm.force_allocate_frame_reg(op.result)
 
     def consider_label(self, op):
-        # XXX big refactoring needed?
         descr = op.getdescr()
         assert isinstance(descr, TargetToken)
         inputargs = op.getarglist()
-        floatlocs = [None] * len(inputargs)
-        nonfloatlocs = [None] * len(inputargs)
+        arglocs = [None] * len(inputargs)
         #
-        # we need to make sure that the tmpreg and xmmtmp are free
-        tmpreg = X86RegisterManager.all_regs[0]
-        tmpvar = TempBox()
-        self.rm.force_allocate_reg(tmpvar, selected_reg=tmpreg)
-        self.rm.possibly_free_var(tmpvar)
-        #
-        xmmtmp = X86XMMRegisterManager.all_regs[0]
-        tmpvar = TempBox()
-        self.xrm.force_allocate_reg(tmpvar, selected_reg=xmmtmp)
-        self.xrm.possibly_free_var(tmpvar)
+        # we call force_spill_var() on the boxes that will not really be
+        # used any more in the loop, but that are kept alive anyway
+        # by appearing in the arguments of a later LABEL or JUMP, or in
+        # the fail_args of some guard
+        position = self.rm.position
+        for arg in inputargs:
+            assert isinstance(arg, Box)
+            if self.last_real_usage.get(arg, -1) <= position:
+                self.force_spill_var(arg)
         #
         # we need to make sure that no variable is stored in ebp
         for arg in inputargs:
@@ -1467,16 +1490,18 @@
         #
         for i in range(len(inputargs)):
             arg = inputargs[i]
-            assert not isinstance(arg, Const)
+            assert isinstance(arg, Box)
             loc = self.loc(arg)
-            assert not (loc is tmpreg or loc is xmmtmp or loc is ebp)
-            if arg.type == FLOAT:
-                floatlocs[i] = loc
-            else:
-                nonfloatlocs[i] = loc
+            assert loc is not ebp
+            arglocs[i] = loc
             if isinstance(loc, RegLoc):
                 self.fm.mark_as_free(arg)
-        descr._x86_arglocs = nonfloatlocs, floatlocs
+        #
+        # if we are too close to the start of the loop, the label's target may
+        # get overridden by redirect_call_assembler().  (rare case)
+        self.flush_loop()
+        #
+        descr._x86_arglocs = arglocs
         descr._x86_loop_code = self.assembler.mc.get_relative_pos()
         descr._x86_clt = self.assembler.current_clt
         self.assembler.target_tokens_currently_compiling[descr] = None
@@ -1490,23 +1515,6 @@
         if jump_op is not None and jump_op.getdescr() is descr:
             self._compute_hint_frame_locations_from_descr(descr)
 
-##        from pypy.rpython.annlowlevel import llhelper
-##        def fn(addr):
-##            print '...label:', hex(addr), nonfloatlocs
-##        FUNC = lltype.Ptr(lltype.FuncType([lltype.Signed], lltype.Void))
-##        ll_disp = llhelper(FUNC, fn)
-##        faddr = rffi.cast(lltype.Signed, ll_disp)
-##        for i in range(16):
-##            self.assembler.mc.PUSH_r(i)
-##        self.assembler.mc.CALL_l(0)
-##        self.assembler.mc.POP(edi)
-##        self.assembler.mc.MOV(r11, imm(faddr))
-##        self.assembler.mc.CALL(r11)
-##        for i in range(15, -1, -1):
-##            if i == esp.value:
-##                i -= 1
-##            self.assembler.mc.POP_r(i)
-
     def not_implemented_op(self, op):
         not_implemented("not implemented operation: %s" % op.getopname())
 
diff --git a/pypy/jit/backend/x86/regloc.py b/pypy/jit/backend/x86/regloc.py
--- a/pypy/jit/backend/x86/regloc.py
+++ b/pypy/jit/backend/x86/regloc.py
@@ -16,8 +16,7 @@
 #
 
 class AssemblerLocation(object):
-    # XXX: Is adding "width" here correct?
-    _attrs_ = ('value', 'width', '_location_code')
+    _attrs_ = ('value', '_location_code')
     _immutable_ = True
     def _getregkey(self):
         return self.value
@@ -28,6 +27,9 @@
     def location_code(self):
         return self._location_code
 
+    def get_width(self):
+        raise NotImplementedError
+
     def value_r(self): return self.value
     def value_b(self): return self.value
     def value_s(self): return self.value
@@ -43,14 +45,21 @@
     _immutable_ = True
     _location_code = 'b'
 
-    def __init__(self, position, ebp_offset, num_words, type):
-        assert ebp_offset < 0   # so no confusion with RegLoc.value
+    def __init__(self, position, ebp_offset, type):
+        # _getregkey() returns self.value; the value returned must not
+        # conflict with RegLoc._getregkey().  It doesn't, but only somewhat
+        # by chance, so the assert below fails if a conflict ever appears.
+        assert not (0 <= ebp_offset < 8 + 8 * IS_X86_64)
         self.position = position
         self.value = ebp_offset
-        self.width = num_words * WORD
         # One of INT, REF, FLOAT
         self.type = type
 
+    def get_width(self):
+        if self.type == FLOAT:
+            return 8
+        return WORD
+
     def __repr__(self):
         return '%d(%%ebp)' % (self.value,)
 
@@ -64,10 +73,8 @@
         self.value = regnum
         self.is_xmm = is_xmm
         if self.is_xmm:
-            self.width = 8
             self._location_code = 'x'
         else:
-            self.width = WORD
             self._location_code = 'r'
     def __repr__(self):
         if self.is_xmm:
@@ -75,6 +82,11 @@
         else:
             return rx86.R.names[self.value]
 
+    def get_width(self):
+        if self.is_xmm:
+            return 8
+        return WORD
+
     def lowest8bits(self):
         assert not self.is_xmm
         return RegLoc(rx86.low_byte(self.value), False)
@@ -92,9 +104,11 @@
         else:
             return eax
 
-class ImmedLoc(AssemblerLocation):
+class ImmediateAssemblerLocation(AssemblerLocation):
     _immutable_ = True
-    width = WORD
+
+class ImmedLoc(ImmediateAssemblerLocation):
+    _immutable_ = True
     _location_code = 'i'
 
     def __init__(self, value):
@@ -105,6 +119,9 @@
     def getint(self):
         return self.value
 
+    def get_width(self):
+        return WORD
+
     def __repr__(self):
         return "ImmedLoc(%d)" % (self.value)
 
@@ -117,7 +134,6 @@
 class AddressLoc(AssemblerLocation):
     _immutable_ = True
 
-    width = WORD
     # The address is base_loc + (scaled_loc << scale) + static_offset
     def __init__(self, base_loc, scaled_loc, scale=0, static_offset=0):
         assert 0 <= scale < 4
@@ -146,6 +162,9 @@
         info = getattr(self, attr, '?')
         return '<AddressLoc %r: %s>' % (self._location_code, info)
 
+    def get_width(self):
+        return WORD
+
     def value_a(self):
         return self.loc_a
 
@@ -180,32 +199,34 @@
             raise AssertionError(self._location_code)
         return result
 
-class ConstFloatLoc(AssemblerLocation):
-    # XXX: We have to use this class instead of just AddressLoc because
-    # we want a width of 8  (... I think.  Check this!)
+class ConstFloatLoc(ImmediateAssemblerLocation):
     _immutable_ = True
-    width = 8
     _location_code = 'j'
 
     def __init__(self, address):
         self.value = address
 
+    def get_width(self):
+        return 8
+
     def __repr__(self):
         return '<ConstFloatLoc @%s>' % (self.value,)
 
 if IS_X86_32:
-    class FloatImmedLoc(AssemblerLocation):
+    class FloatImmedLoc(ImmediateAssemblerLocation):
         # This stands for an immediate float.  It cannot be directly used in
         # any assembler instruction.  Instead, it is meant to be decomposed
         # in two 32-bit halves.  On 64-bit, FloatImmedLoc() is a function
         # instead; see below.
         _immutable_ = True
-        width = 8
         _location_code = '#'     # don't use me
 
         def __init__(self, floatstorage):
             self.aslonglong = floatstorage
 
+        def get_width(self):
+            return 8
+
         def low_part(self):
             return intmask(self.aslonglong)
 
diff --git a/pypy/jit/backend/x86/runner.py b/pypy/jit/backend/x86/runner.py
--- a/pypy/jit/backend/x86/runner.py
+++ b/pypy/jit/backend/x86/runner.py
@@ -3,6 +3,7 @@
 from pypy.rpython.lltypesystem.lloperation import llop
 from pypy.rpython.llinterp import LLInterpreter
 from pypy.rlib.objectmodel import we_are_translated
+from pypy.jit.codewriter import longlong
 from pypy.jit.metainterp import history, compile
 from pypy.jit.backend.x86.assembler import Assembler386
 from pypy.jit.backend.x86.arch import FORCE_INDEX_OFS
@@ -21,7 +22,6 @@
     supports_floats = True
     supports_singlefloats = True
 
-    BOOTSTRAP_TP = lltype.FuncType([], lltype.Signed)
     dont_keepalive_stuff = False # for tests
     with_threads = False
 
@@ -91,15 +91,6 @@
         return self.assembler.assemble_bridge(faildescr, inputargs, operations,
                                               original_loop_token, log=log)
 
-    def set_future_value_int(self, index, intvalue):
-        self.assembler.fail_boxes_int.setitem(index, intvalue)
-
-    def set_future_value_float(self, index, floatvalue):
-        self.assembler.fail_boxes_float.setitem(index, floatvalue)
-
-    def set_future_value_ref(self, index, ptrvalue):
-        self.assembler.fail_boxes_ptr.setitem(index, ptrvalue)
-
     def get_latest_value_int(self, index):
         return self.assembler.fail_boxes_int.getitem(index)
 
@@ -122,27 +113,28 @@
         # the FORCE_TOKEN operation and this helper both return 'ebp'.
         return self.assembler.fail_ebp
 
-    def execute_token(self, executable_token):
-        addr = executable_token._x86_bootstrap_code
-        #llop.debug_print(lltype.Void, ">>>> Entering", addr)
-        func = rffi.cast(lltype.Ptr(self.BOOTSTRAP_TP), addr)
-        fail_index = self._execute_call(func)
-        #llop.debug_print(lltype.Void, "<<<< Back")
-        return self.get_fail_descr_from_number(fail_index)
-
-    def _execute_call(self, func):
-        # help flow objspace
-        prev_interpreter = None
-        if not self.translate_support_code:
-            prev_interpreter = LLInterpreter.current_interpreter
-            LLInterpreter.current_interpreter = self.debug_ll_interpreter
-        res = 0
-        try:
-            res = func()
-        finally:
+    def make_execute_token(self, *ARGS):
+        FUNCPTR = lltype.Ptr(lltype.FuncType(ARGS, lltype.Signed))
+        #
+        def execute_token(executable_token, *args):
+            clt = executable_token.compiled_loop_token
+            assert len(args) == clt._debug_nbargs
+            #
+            addr = executable_token._x86_function_addr
+            func = rffi.cast(FUNCPTR, addr)
+            #llop.debug_print(lltype.Void, ">>>> Entering", addr)
+            prev_interpreter = None   # help flow space
             if not self.translate_support_code:
-                LLInterpreter.current_interpreter = prev_interpreter
-        return res
+                prev_interpreter = LLInterpreter.current_interpreter
+                LLInterpreter.current_interpreter = self.debug_ll_interpreter
+            try:
+                fail_index = func(*args)
+            finally:
+                if not self.translate_support_code:
+                    LLInterpreter.current_interpreter = prev_interpreter
+            #llop.debug_print(lltype.Void, "<<<< Back")
+            return self.get_fail_descr_from_number(fail_index)
+        return execute_token
 
     def cast_ptr_to_int(x):
         adr = llmemory.cast_ptr_to_adr(x)
diff --git a/pypy/jit/backend/x86/test/test_assembler.py b/pypy/jit/backend/x86/test/test_assembler.py
--- a/pypy/jit/backend/x86/test/test_assembler.py
+++ b/pypy/jit/backend/x86/test/test_assembler.py
@@ -46,12 +46,13 @@
             xmm2]
     assert len(failargs) == len(locs)
     assembler.write_failure_recovery_description(mc, failargs, locs)
-    nums = [Assembler386.DESCR_INT   + 4*(16+0),
-            Assembler386.DESCR_REF   + 4*(16+1),
-            Assembler386.DESCR_FLOAT + 4*(16+10),
-            Assembler386.DESCR_INT   + 4*(16+100),
-            Assembler386.DESCR_REF   + 4*(16+101),
-            Assembler386.DESCR_FLOAT + 4*(16+110),
+    base = 8 + 8*IS_X86_64
+    nums = [Assembler386.DESCR_INT   + 4*(base+0),
+            Assembler386.DESCR_REF   + 4*(base+1),
+            Assembler386.DESCR_FLOAT + 4*(base+10),
+            Assembler386.DESCR_INT   + 4*(base+100),
+            Assembler386.DESCR_REF   + 4*(base+101),
+            Assembler386.DESCR_FLOAT + 4*(base+110),
             Assembler386.CODE_HOLE,
             Assembler386.CODE_HOLE,
             Assembler386.DESCR_INT   + 4*ebx.value,
diff --git a/pypy/jit/backend/x86/test/test_jump.py b/pypy/jit/backend/x86/test/test_jump.py
--- a/pypy/jit/backend/x86/test/test_jump.py
+++ b/pypy/jit/backend/x86/test/test_jump.py
@@ -71,6 +71,18 @@
                              ('mov', eax, s24),
                              ('mov', s12, edi)]
 
+def test_no_tmp_reg():
+    assembler = MockAssembler()
+    s8 = frame_pos(0, INT)
+    s12 = frame_pos(13, INT)
+    s20 = frame_pos(20, INT)
+    s24 = frame_pos(221, INT)
+    remap_frame_layout(assembler, [s8, eax, s12], [s20, s24, edi], None)
+    assert assembler.ops == [('push', s8),
+                             ('pop', s20),
+                             ('mov', eax, s24),
+                             ('mov', s12, edi)]
+
 def test_reordering():
     assembler = MockAssembler()
     s8 = frame_pos(8, INT)
@@ -237,7 +249,7 @@
         while len(result) < count:
             x = fn()
             keys = [x._getregkey()]
-            if isinstance(x, StackLoc) and x.width > WORD:
+            if isinstance(x, StackLoc) and x.get_width() > WORD:
                 keys.append(keys[0] + WORD)
             for key in keys:
                 if key in seen:
@@ -255,7 +267,7 @@
         for i, loc in enumerate(locations):
             if isinstance(loc, RegLoc):
                 if loc.is_xmm:
-                    if loc.width > WORD:
+                    if loc.get_width() > WORD:
                         newvalue = ('value-xmm-%d' % i,
                                     'value-xmm-hiword-%d' % i)
                     else:
@@ -264,8 +276,8 @@
                 else:
                     regs1[loc.value] = 'value-int-%d' % i
             elif isinstance(loc, StackLoc):
-                stack[loc.value] = 'value-width%d-%d' % (loc.width, i)
-                if loc.width > WORD:
+                stack[loc.value] = 'value-width%d-%d' % (loc.get_width(), i)
+                if loc.get_width() > WORD:
                     stack[loc.value+WORD] = 'value-hiword-%d' % i
             else:
                 assert isinstance(loc, ImmedLoc)
@@ -287,7 +299,7 @@
         #
         def read(loc, expected_width=None):
             if expected_width is not None:
-                assert loc.width == expected_width
+                assert loc.get_width() == expected_width
             if isinstance(loc, RegLoc):
                 if loc.is_xmm:
                     return regs2[loc.value]
@@ -295,7 +307,7 @@
                     return regs1[loc.value]
             if isinstance(loc, StackLoc):
                 got = stack[loc.value]
-                if loc.width > WORD:
+                if loc.get_width() > WORD:
                     got = (got, stack[loc.value+WORD])
                 return got
             if isinstance(loc, ImmedLoc):
@@ -309,7 +321,7 @@
                 else:
                     regs1[loc.value] = newvalue
             elif isinstance(loc, StackLoc):
-                if loc.width > WORD:
+                if loc.get_width() > WORD:
                     newval1, newval2 = newvalue
                     stack[loc.value] = newval1
                     stack[loc.value+WORD] = newval2
diff --git a/pypy/jit/backend/x86/test/test_recompilation.py b/pypy/jit/backend/x86/test/test_recompilation.py
--- a/pypy/jit/backend/x86/test/test_recompilation.py
+++ b/pypy/jit/backend/x86/test/test_recompilation.py
@@ -19,8 +19,7 @@
         finish(i3, descr=fdescr2)
         '''
         bridge = self.attach_bridge(ops, loop, -2)
-        self.cpu.set_future_value_int(0, 0)
-        fail = self.run(loop)
+        fail = self.run(loop, 0)
         assert fail.identifier == 2
         assert self.getint(0) == 21
     
@@ -55,8 +54,7 @@
         assert descr._x86_bridge_param_depth == 0
         # the force_spill() forces the stack to grow
         assert new > previous
-        self.cpu.set_future_value_int(0, 0)
-        fail = self.run(loop)
+        fail = self.run(loop, 0)
         assert fail.identifier == 2
         assert self.getint(0) == 21
         assert self.getint(1) == 22
@@ -71,20 +69,19 @@
         i2 = int_lt(i1, 20)
         guard_true(i2, descr=fdescr1) [i1]
         jump(i1, i10, i11, i12, i13, i14, i15, i16, descr=targettoken)
-        ''', [0])
+        ''', [0, 0, 0, 0, 0, 0, 0, 0])
         other_loop = self.interpret('''
-        [i3]
+        [i3, i10, i11, i12, i13, i14, i15, i16]
         label(i3, descr=targettoken2)
         guard_false(i3, descr=fdescr2) [i3]
         jump(i3, descr=targettoken2)
-        ''', [1])
+        ''', [1, 0, 0, 0, 0, 0, 0, 0])
         ops = '''
         [i3]
         jump(i3, 1, 2, 3, 4, 5, 6, 7, descr=targettoken)
         '''
         bridge = self.attach_bridge(ops, other_loop, 1)
-        self.cpu.set_future_value_int(0, 1)
-        fail = self.run(other_loop)
+        fail = self.run(other_loop, 1, 0, 0, 0, 0, 0, 0, 0)
         assert fail.identifier == 1
 
     def test_bridge_jumps_to_self_deeper(self):
@@ -100,7 +97,7 @@
         i5 = int_lt(i3, 20)
         guard_true(i5) [i99, i3]
         jump(i3, i30, 1, i30, i30, i30, descr=targettoken)
-        ''', [0])
+        ''', [0, 0, 0, 0, 0, 0])
         assert self.getint(0) == 0
         assert self.getint(1) == 1
         ops = '''
@@ -123,10 +120,7 @@
         # the force_spill() forces the stack to grow
         assert guard_op.getdescr()._x86_bridge_frame_depth > loop_frame_depth
         assert guard_op.getdescr()._x86_bridge_param_depth == 0
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 0)
-        self.cpu.set_future_value_int(2, 0)
-        self.run(loop)
+        self.run(loop, 0, 0, 0, 0, 0, 0)
         assert self.getint(0) == 1
         assert self.getint(1) == 20
 
@@ -142,7 +136,7 @@
         i5 = int_lt(i3, 20)
         guard_true(i5) [i99, i3]
         jump(i3, i1, i2, descr=targettoken)
-        ''', [0])
+        ''', [0, 0, 0])
         assert self.getint(0) == 0
         assert self.getint(1) == 1
         ops = '''
@@ -150,10 +144,7 @@
         jump(i3, 0, 1, descr=targettoken)
         '''
         bridge = self.attach_bridge(ops, loop, 5)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 0)
-        self.cpu.set_future_value_int(2, 0)
-        self.run(loop)
+        self.run(loop, 0, 0, 0)
         assert self.getint(0) == 1
         assert self.getint(1) == 20
         
diff --git a/pypy/jit/backend/x86/test/test_regalloc.py b/pypy/jit/backend/x86/test/test_regalloc.py
--- a/pypy/jit/backend/x86/test/test_regalloc.py
+++ b/pypy/jit/backend/x86/test/test_regalloc.py
@@ -142,19 +142,20 @@
         loop = self.parse(ops)
         looptoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        for i, arg in enumerate(args):
+        arguments = []
+        for arg in args:
             if isinstance(arg, int):
-                self.cpu.set_future_value_int(i, arg)
+                arguments.append(arg)
             elif isinstance(arg, float):
                 arg = longlong.getfloatstorage(arg)
-                self.cpu.set_future_value_float(i, arg)
+                arguments.append(arg)
             else:
                 assert isinstance(lltype.typeOf(arg), lltype.Ptr)
                 llgcref = lltype.cast_opaque_ptr(llmemory.GCREF, arg)
-                self.cpu.set_future_value_ref(i, llgcref)
+                arguments.append(llgcref)
         loop._jitcelltoken = looptoken
         if run:
-            self.cpu.execute_token(looptoken)
+            self.cpu.execute_token(looptoken, *arguments)
         return loop
 
     def prepare_loop(self, ops):
@@ -193,8 +194,8 @@
                                 loop._jitcelltoken)
         return bridge
 
-    def run(self, loop):
-        return self.cpu.execute_token(loop._jitcelltoken)
+    def run(self, loop, *arguments):
+        return self.cpu.execute_token(loop._jitcelltoken, *arguments)
 
 class TestRegallocSimple(BaseTestRegalloc):
     def test_simple_loop(self):
@@ -220,7 +221,7 @@
         '''
         loop = self.interpret(ops, [0, 0, 0, 0])
         ops2 = '''
-        [i5]
+        [i5, i6, i7, i8]
         label(i5, descr=targettoken2)
         i1 = int_add(i5, 1)
         i3 = int_add(i1, 1)
@@ -229,14 +230,13 @@
         guard_true(i2) [i4]
         jump(i4, descr=targettoken2)
         '''
-        loop2 = self.interpret(ops2, [0])
+        loop2 = self.interpret(ops2, [0, 0, 0, 0])
         bridge_ops = '''
         [i4]
         jump(i4, i4, i4, i4, descr=targettoken)
         '''
         bridge = self.attach_bridge(bridge_ops, loop2, 5)
-        self.cpu.set_future_value_int(0, 0)
-        self.run(loop2)
+        self.run(loop2, 0, 0, 0, 0)
         assert self.getint(0) == 31
         assert self.getint(1) == 30
         assert self.getint(2) == 30
@@ -274,8 +274,7 @@
         loop = self.interpret(ops, [0])
         assert self.getint(0) == 1
         bridge = self.attach_bridge(bridge_ops, loop, 2)
-        self.cpu.set_future_value_int(0, 0)
-        self.run(loop)
+        self.run(loop, 0)
         assert self.getint(0) == 1
 
     def test_inputarg_unused(self):
@@ -301,9 +300,7 @@
         assert self.getint(0) == 0
         assert self.getint(1) == 10
         bridge = self.attach_bridge(bridge_ops, loop, 0)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 10)
-        self.run(loop)
+        self.run(loop, 0, 10)
         assert self.getint(0) == 0
         assert self.getint(1) == 10
 
@@ -320,9 +317,7 @@
         finish(1, 2)
         '''
         self.attach_bridge(bridge_ops, loop, 0)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 1)
-        self.run(loop)
+        self.run(loop, 0, 1)
 
     def test_spill_for_constant(self):
         ops = '''
@@ -406,7 +401,7 @@
         guard_true(i5) [i2, i1]
         jump(i0, i18, i15, i16, i2, i1, i4, descr=targettoken)
         '''
-        self.interpret(ops, [0, 1, 2, 3])
+        self.interpret(ops, [0, 1, 2, 3, 0, 0, 0])
 
     def test_op_result_unused(self):
         ops = '''
@@ -440,9 +435,7 @@
         finish(i0, i1, i2, i3, i4, i5, i6, i7, i8)
         '''
         self.attach_bridge(bridge_ops, loop, 1)
-        for i in range(9):
-            self.cpu.set_future_value_int(i, i)
-        self.run(loop)
+        self.run(loop, 0, 1, 2, 3, 4, 5, 6, 7, 8)
         assert self.getints(9) == range(9)
 
     def test_loopargs(self):
@@ -452,27 +445,13 @@
         jump(i4, i1, i2, i3)
         """
         regalloc = self.prepare_loop(ops)
-        assert len(regalloc.rm.reg_bindings) == 2
+        if IS_X86_64:
+            assert len(regalloc.rm.reg_bindings) == 4
+            assert len(regalloc.fm.bindings) == 0
+        else:
+            assert len(regalloc.rm.reg_bindings) == 0
+            assert len(regalloc.fm.bindings) == 4
 
-    def test_loopargs_2(self):
-        ops = """
-        [i0, i1, i2, i3]
-        i4 = int_add(i0, i1)
-        finish(i4, i1, i2, i3)
-        """
-        regalloc = self.prepare_loop(ops)
-        assert len(regalloc.rm.reg_bindings) == 2
-
-    def test_loopargs_3(self):
-        ops = """
-        [i0, i1, i2, i3]
-        i4 = int_add(i0, i1)
-        guard_true(i4) [i0, i1, i2, i3, i4]
-        jump(i4, i1, i2, i3)
-        """
-        regalloc = self.prepare_loop(ops)
-        assert len(regalloc.rm.reg_bindings) == 2
-    
 
 class TestRegallocCompOps(BaseTestRegalloc):
     
@@ -640,8 +619,8 @@
         i10 = call(ConstClass(f1ptr), i0, descr=f1_calldescr)
         finish(i10, i1, i2, i3, i4, i5, i6, i7, i8, i9)
         '''
-        loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9, 9])
-        assert self.getints(11) == [5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9]
+        loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9])
+        assert self.getints(10) == [5, 7, 9, 9, 9, 9, 9, 9, 9, 9]
         clt = loop._jitcelltoken.compiled_loop_token
         assert clt.param_depth == self.expected_param_depth(1)
 
@@ -652,8 +631,8 @@
         i11 = call(ConstClass(f2ptr), i10, i1, descr=f2_calldescr)        
         finish(i11, i1,  i2, i3, i4, i5, i6, i7, i8, i9)
         '''
-        loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9, 9])
-        assert self.getints(11) == [5*7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9]
+        loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9])
+        assert self.getints(10) == [5*7, 7, 9, 9, 9, 9, 9, 9, 9, 9]
         clt = loop._jitcelltoken.compiled_loop_token
         assert clt.param_depth == self.expected_param_depth(2)
 
@@ -689,9 +668,7 @@
 
         assert loop.operations[-2].getdescr()._x86_bridge_param_depth == self.expected_param_depth(2)
 
-        self.cpu.set_future_value_int(0, 4)
-        self.cpu.set_future_value_int(1, 7)        
-        self.run(loop)
+        self.run(loop, 4, 7)
         assert self.getint(0) == 5*7
 
     def test_bridge_calls_2(self):
@@ -712,8 +689,6 @@
 
         assert loop.operations[-2].getdescr()._x86_bridge_param_depth == self.expected_param_depth(2)
 
-        self.cpu.set_future_value_int(0, 4)
-        self.cpu.set_future_value_int(1, 7)        
-        self.run(loop)
+        self.run(loop, 4, 7)
         assert self.getint(0) == 29
 
diff --git a/pypy/jit/backend/x86/test/test_regalloc2.py b/pypy/jit/backend/x86/test/test_regalloc2.py
--- a/pypy/jit/backend/x86/test/test_regalloc2.py
+++ b/pypy/jit/backend/x86/test/test_regalloc2.py
@@ -22,8 +22,7 @@
     cpu.setup_once()
     looptoken = JitCellToken()
     cpu.compile_loop(inputargs, operations, looptoken)
-    cpu.set_future_value_int(0, 9)
-    cpu.execute_token(looptoken)
+    cpu.execute_token(looptoken, 9)
     assert cpu.get_latest_value_int(0) == (9 >> 3)
     assert cpu.get_latest_value_int(1) == (~18)
 
@@ -45,8 +44,7 @@
     cpu.setup_once()
     looptoken = JitCellToken()
     cpu.compile_loop(inputargs, operations, looptoken)
-    cpu.set_future_value_int(0, -10)
-    cpu.execute_token(looptoken)
+    cpu.execute_token(looptoken, -10)
     assert cpu.get_latest_value_int(0) == 0
     assert cpu.get_latest_value_int(1) == -1000
     assert cpu.get_latest_value_int(2) == 1
@@ -142,17 +140,7 @@
     cpu.setup_once()
     looptoken = JitCellToken()
     cpu.compile_loop(inputargs, operations, looptoken)
-    cpu.set_future_value_int(0, -13)
-    cpu.set_future_value_int(1, 10)
-    cpu.set_future_value_int(2, 10)
-    cpu.set_future_value_int(3, 8)
-    cpu.set_future_value_int(4, -8)
-    cpu.set_future_value_int(5, -16)
-    cpu.set_future_value_int(6, -18)
-    cpu.set_future_value_int(7, 46)
-    cpu.set_future_value_int(8, -12)
-    cpu.set_future_value_int(9, 26)
-    cpu.execute_token(looptoken)
+    cpu.execute_token(looptoken, -13, 10, 10, 8, -8, -16, -18, 46, -12, 26)
     assert cpu.get_latest_value_int(0) == 0
     assert cpu.get_latest_value_int(1) == 0
     assert cpu.get_latest_value_int(2) == 0
@@ -257,17 +245,7 @@
     cpu.setup_once()
     looptoken = JitCellToken()
     cpu.compile_loop(inputargs, operations, looptoken)
-    cpu.set_future_value_int(0, 17)
-    cpu.set_future_value_int(1, -20)
-    cpu.set_future_value_int(2, -6)
-    cpu.set_future_value_int(3, 6)
-    cpu.set_future_value_int(4, 1)
-    cpu.set_future_value_int(5, 13)
-    cpu.set_future_value_int(6, 13)
-    cpu.set_future_value_int(7, 9)
-    cpu.set_future_value_int(8, 49)
-    cpu.set_future_value_int(9, 8)
-    cpu.execute_token(looptoken)
+    cpu.execute_token(looptoken, 17, -20, -6, 6, 1, 13, 13, 9, 49, 8)
     assert cpu.get_latest_value_int(0) == 0
     assert cpu.get_latest_value_int(1) == 8
     assert cpu.get_latest_value_int(2) == 1
diff --git a/pypy/jit/backend/x86/test/test_runner.py b/pypy/jit/backend/x86/test/test_runner.py
--- a/pypy/jit/backend/x86/test/test_runner.py
+++ b/pypy/jit/backend/x86/test/test_runner.py
@@ -282,11 +282,7 @@
                     ops[-2].setfailargs([i1])
                     looptoken = JitCellToken()
                     self.cpu.compile_loop([b], ops, looptoken)
-                    if op == rop.INT_IS_TRUE:
-                        self.cpu.set_future_value_int(0, b.value)
-                    else:
-                        self.cpu.set_future_value_ref(0, b.value)
-                    self.cpu.execute_token(looptoken)
+                    self.cpu.execute_token(looptoken, b.value)
                     result = self.cpu.get_latest_value_int(0)
                     if guard == rop.GUARD_FALSE:
                         assert result == execute(self.cpu, None,
@@ -332,9 +328,8 @@
                     inputargs = [i for i in (a, b) if isinstance(i, Box)]
                     looptoken = JitCellToken()
                     self.cpu.compile_loop(inputargs, ops, looptoken)
-                    for i, box in enumerate(inputargs):
-                        self.cpu.set_future_value_int(i, box.value)
-                    self.cpu.execute_token(looptoken)
+                    inputvalues = [box.value for box in inputargs]
+                    self.cpu.execute_token(looptoken, *inputvalues)
                     result = self.cpu.get_latest_value_int(0)
                     expected = execute(self.cpu, None, op, None, a, b).value
                     if guard == rop.GUARD_FALSE:
@@ -400,8 +395,7 @@
         assert address >= loopaddress + loopsize
         assert size >= 10 # randomish number
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 20
@@ -507,9 +501,7 @@
             looptoken = JitCellToken()
             self.cpu.compile_loop([i1, i2], ops, looptoken)
 
-            self.cpu.set_future_value_int(0, 123450)
-            self.cpu.set_future_value_int(1, 123408)
-            fail = self.cpu.execute_token(looptoken)
+            fail = self.cpu.execute_token(looptoken, 123450, 123408)
             assert fail.identifier == 0
             assert self.cpu.get_latest_value_int(0) == 42
             assert self.cpu.get_latest_value_int(1) == 42
@@ -541,8 +533,7 @@
             self.cpu.assembler.set_debug(True)
             looptoken = JitCellToken()
             self.cpu.compile_loop(ops.inputargs, ops.operations, looptoken)
-            self.cpu.set_future_value_int(0, 0)
-            self.cpu.execute_token(looptoken)
+            self.cpu.execute_token(looptoken, 0)
             # check debugging info
             struct = self.cpu.assembler.loop_run_counters[0]
             assert struct.i == 10
@@ -565,7 +556,6 @@
         self.cpu.assembler.set_debug(True)
         looptoken = JitCellToken()
         self.cpu.compile_loop(ops.inputargs, ops.operations, looptoken)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.execute_token(looptoken)
+        self.cpu.execute_token(looptoken, 0)
         assert looptoken._x86_debug_checksum == sum([op.getopnum()
                                                      for op in ops.operations])
diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -498,27 +498,29 @@
         else:
             log.WARNING('ignoring hint %r at %r' % (hints, self.graph))
 
+    def _rewrite_raw_malloc(self, op, name, args):
+        d = op.args[1].value.copy()
+        d.pop('flavor')
+        add_memory_pressure = d.pop('add_memory_pressure', False)
+        zero = d.pop('zero', False)
+        track_allocation = d.pop('track_allocation', True)
+        if d:
+            raise UnsupportedMallocFlags(d)
+        TYPE = op.args[0].value
+        if zero:
+            name += '_zero'
+        if add_memory_pressure:
+            name += '_add_memory_pressure'
+        if not track_allocation:
+            name += '_no_track_allocation'
+        return self._do_builtin_call(op, name, args,
+                                     extra = (TYPE,),
+                                     extrakey = TYPE)
+
     def rewrite_op_malloc_varsize(self, op):
         if op.args[1].value['flavor'] == 'raw':
-            d = op.args[1].value.copy()
-            d.pop('flavor')
-            add_memory_pressure = d.pop('add_memory_pressure', False)
-            zero = d.pop('zero', False)
-            track_allocation = d.pop('track_allocation', True)
-            if d:
-                raise UnsupportedMallocFlags(d)
-            ARRAY = op.args[0].value
-            name = 'raw_malloc'
-            if zero:
-                name += '_zero'
-            if add_memory_pressure:
-                name += '_add_memory_pressure'
-            if not track_allocation:
-                name += '_no_track_allocation'
-            return self._do_builtin_call(op, name,
-                                         [op.args[2]],
-                                         extra = (ARRAY,),
-                                         extrakey = ARRAY)
+            return self._rewrite_raw_malloc(op, 'raw_malloc_varsize',
+                                            [op.args[2]])
         if op.args[0].value == rstr.STR:
             return SpaceOperation('newstr', [op.args[2]], op.result)
         elif op.args[0].value == rstr.UNICODE:
@@ -531,11 +533,18 @@
                                   op.result)
 
     def rewrite_op_free(self, op):
-        flags = op.args[1].value
-        assert flags['flavor'] == 'raw'
-        ARRAY = op.args[0].concretetype.TO
-        return self._do_builtin_call(op, 'raw_free', [op.args[0]],
-                                     extra = (ARRAY,), extrakey = ARRAY)
+        d = op.args[1].value.copy()
+        assert d['flavor'] == 'raw'
+        d.pop('flavor')
+        track_allocation = d.pop('track_allocation', True)
+        if d:
+            raise UnsupportedMallocFlags(d)
+        STRUCT = op.args[0].concretetype.TO
+        name = 'raw_free'
+        if not track_allocation:
+            name += '_no_track_allocation'
+        return self._do_builtin_call(op, name, [op.args[0]],
+                                     extra = (STRUCT,), extrakey = STRUCT)
 
     def rewrite_op_getarrayitem(self, op):
         ARRAY = op.args[0].concretetype.TO
@@ -736,6 +745,9 @@
         return [op0, op1]
 
     def rewrite_op_malloc(self, op):
+        if op.args[1].value['flavor'] == 'raw':
+            return self._rewrite_raw_malloc(op, 'raw_malloc_fixedsize', [])
+        #
         assert op.args[1].value == {'flavor': 'gc'}
         STRUCT = op.args[0].value
         vtable = heaptracker.get_vtable_for_gcstruct(self.cpu, STRUCT)
diff --git a/pypy/jit/codewriter/support.py b/pypy/jit/codewriter/support.py
--- a/pypy/jit/codewriter/support.py
+++ b/pypy/jit/codewriter/support.py
@@ -599,26 +599,75 @@
             return p
         return _ll_0_alloc_with_del
 
-    def build_raw_malloc_builder(zero=False, add_memory_pressure=False, track_allocation=True):
-        def build_ll_1_raw_malloc(ARRAY):
-            def _ll_1_raw_malloc(n):
-                return lltype.malloc(ARRAY, n, flavor='raw', zero=zero, add_memory_pressure=add_memory_pressure)
-            return _ll_1_raw_malloc
-        return build_ll_1_raw_malloc
+    def build_raw_malloc_varsize_builder(zero=False,
+                                         add_memory_pressure=False,
+                                         track_allocation=True):
+        def build_ll_1_raw_malloc_varsize(ARRAY):
+            def _ll_1_raw_malloc_varsize(n):
+                return lltype.malloc(ARRAY, n, flavor='raw', zero=zero,
+                                     add_memory_pressure=add_memory_pressure,
+                                     track_allocation=track_allocation)
+            return _ll_1_raw_malloc_varsize
+        return build_ll_1_raw_malloc_varsize
 
-    build_ll_1_raw_malloc = build_raw_malloc_builder()
-    build_ll_1_raw_malloc_zero = build_raw_malloc_builder(zero=True)
-    build_ll_1_raw_malloc_zero_add_memory_pressure = build_raw_malloc_builder(zero=True, add_memory_pressure=True)
-    build_ll_1_raw_malloc_add_memory_pressure = build_raw_malloc_builder(add_memory_pressure=True)
-    build_ll_1_raw_malloc_no_track_allocation = build_raw_malloc_builder(track_allocation=False)
-    build_ll_1_raw_malloc_zero_no_track_allocation = build_raw_malloc_builder(zero=True, track_allocation=False)
-    build_ll_1_raw_malloc_zero_add_memory_pressure_no_track_allocation = build_raw_malloc_builder(zero=True, add_memory_pressure=True, track_allocation=False)
-    build_ll_1_raw_malloc_add_memory_pressure_no_track_allocation = build_raw_malloc_builder(add_memory_pressure=True, track_allocation=False)
+    build_ll_1_raw_malloc_varsize = (
+        build_raw_malloc_varsize_builder())
+    build_ll_1_raw_malloc_varsize_zero = (
+        build_raw_malloc_varsize_builder(zero=True))
+    build_ll_1_raw_malloc_varsize_zero_add_memory_pressure = (
+        build_raw_malloc_varsize_builder(zero=True, add_memory_pressure=True))
+    build_ll_1_raw_malloc_varsize_add_memory_pressure = (
+        build_raw_malloc_varsize_builder(add_memory_pressure=True))
+    build_ll_1_raw_malloc_varsize_no_track_allocation = (
+        build_raw_malloc_varsize_builder(track_allocation=False))
+    build_ll_1_raw_malloc_varsize_zero_no_track_allocation = (
+        build_raw_malloc_varsize_builder(zero=True, track_allocation=False))
+    build_ll_1_raw_malloc_varsize_zero_add_memory_pressure_no_track_allocation = (
+        build_raw_malloc_varsize_builder(zero=True, add_memory_pressure=True, track_allocation=False))
+    build_ll_1_raw_malloc_varsize_add_memory_pressure_no_track_allocation = (
+        build_raw_malloc_varsize_builder(add_memory_pressure=True, track_allocation=False))
 
-    def build_ll_1_raw_free(ARRAY):
-        def _ll_1_raw_free(p):
-            lltype.free(p, flavor='raw')
-        return _ll_1_raw_free
+    def build_raw_malloc_fixedsize_builder(zero=False,
+                                           add_memory_pressure=False,
+                                           track_allocation=True):
+        def build_ll_0_raw_malloc_fixedsize(STRUCT):
+            def _ll_0_raw_malloc_fixedsize():
+                return lltype.malloc(STRUCT, flavor='raw', zero=zero,
+                                     add_memory_pressure=add_memory_pressure,
+                                     track_allocation=track_allocation)
+            return _ll_0_raw_malloc_fixedsize
+        return build_ll_0_raw_malloc_fixedsize
+
+    build_ll_0_raw_malloc_fixedsize = (
+        build_raw_malloc_fixedsize_builder())
+    build_ll_0_raw_malloc_fixedsize_zero = (
+        build_raw_malloc_fixedsize_builder(zero=True))
+    build_ll_0_raw_malloc_fixedsize_zero_add_memory_pressure = (
+        build_raw_malloc_fixedsize_builder(zero=True, add_memory_pressure=True))
+    build_ll_0_raw_malloc_fixedsize_add_memory_pressure = (
+        build_raw_malloc_fixedsize_builder(add_memory_pressure=True))
+    build_ll_0_raw_malloc_fixedsize_no_track_allocation = (
+        build_raw_malloc_fixedsize_builder(track_allocation=False))
+    build_ll_0_raw_malloc_fixedsize_zero_no_track_allocation = (
+        build_raw_malloc_fixedsize_builder(zero=True, track_allocation=False))
+    build_ll_0_raw_malloc_fixedsize_zero_add_memory_pressure_no_track_allocation = (
+        build_raw_malloc_fixedsize_builder(zero=True, add_memory_pressure=True, track_allocation=False))
+    build_ll_0_raw_malloc_fixedsize_add_memory_pressure_no_track_allocation = (
+        build_raw_malloc_fixedsize_builder(add_memory_pressure=True, track_allocation=False))
+
+    def build_raw_free_builder(track_allocation=True):
+        def build_ll_1_raw_free(ARRAY):
+            def _ll_1_raw_free(p):
+                lltype.free(p, flavor='raw',
+                            track_allocation=track_allocation)
+            return _ll_1_raw_free
+        return build_ll_1_raw_free
+
+    build_ll_1_raw_free = (
+        build_raw_free_builder())
+    build_ll_1_raw_free_no_track_allocation = (
+        build_raw_free_builder(track_allocation=False))
+
 
 class OOtypeHelpers:
 
diff --git a/pypy/jit/codewriter/test/test_codewriter.py b/pypy/jit/codewriter/test/test_codewriter.py
--- a/pypy/jit/codewriter/test/test_codewriter.py
+++ b/pypy/jit/codewriter/test/test_codewriter.py
@@ -217,7 +217,7 @@
     cw.make_jitcodes(verbose=True)
     #
     s = jitdriver_sd.mainjitcode.dump()
-    assert 'residual_call_ir_i $<* fn _ll_1_raw_malloc__Signed>' in s
+    assert 'residual_call_ir_i $<* fn _ll_1_raw_malloc_varsize__Signed>' in s
     assert 'setarrayitem_raw_i' in s
     assert 'getarrayitem_raw_i' in s
     assert 'residual_call_ir_v $<* fn _ll_1_raw_free__arrayPtr>' in s
diff --git a/pypy/jit/codewriter/test/test_jtransform.py b/pypy/jit/codewriter/test/test_jtransform.py
--- a/pypy/jit/codewriter/test/test_jtransform.py
+++ b/pypy/jit/codewriter/test/test_jtransform.py
@@ -550,7 +550,7 @@
     tr = Transformer(FakeCPU(), FakeResidualCallControl())
     op0, op1 = tr.rewrite_operation(op)
     assert op0.opname == 'residual_call_ir_i'
-    assert op0.args[0].value == 'raw_malloc'    # pseudo-function as a str
+    assert op0.args[0].value == 'raw_malloc_varsize' # pseudo-function as a str
     assert op1.opname == '-live-'
     assert op1.args == []
 
@@ -564,7 +564,7 @@
     tr = Transformer(FakeCPU(), FakeResidualCallControl())
     op0, op1 = tr.rewrite_operation(op)
     assert op0.opname == 'residual_call_ir_i'
-    assert op0.args[0].value == 'raw_malloc_zero'    # pseudo-function as a str
+    assert op0.args[0].value == 'raw_malloc_varsize_zero'  # pseudo-fn as a str
     assert op1.opname == '-live-'
     assert op1.args == []
 
@@ -578,6 +578,35 @@
     tr = Transformer(FakeCPU(), FakeResidualCallControl())
     py.test.raises(UnsupportedMallocFlags, tr.rewrite_operation, op)
 
+def test_raw_malloc_fixedsize():
+    S = lltype.Struct('dummy', ('x', lltype.Signed))
+    v = varoftype(lltype.Ptr(S))
+    flags = Constant({'flavor': 'raw', 'zero': True}, lltype.Void)
+    op = SpaceOperation('malloc', [Constant(S, lltype.Void), flags], v)
+    tr = Transformer(FakeCPU(), FakeResidualCallControl())
+    op0, op1 = tr.rewrite_operation(op)
+    assert op0.opname == 'residual_call_r_i'
+    assert op0.args[0].value == 'raw_malloc_fixedsize_zero' #pseudo-fn as a str
+    assert op1.opname == '-live-'
+    assert op1.args == []
+
+def test_raw_free():
+    S = lltype.Struct('dummy', ('x', lltype.Signed))
+    for flag in [True, False]:
+        flags = Constant({'flavor': 'raw', 'track_allocation': flag},
+                         lltype.Void)
+        op = SpaceOperation('free', [varoftype(lltype.Ptr(S)), flags],
+                            varoftype(lltype.Void))
+        tr = Transformer(FakeCPU(), FakeResidualCallControl())
+        op0, op1 = tr.rewrite_operation(op)
+        assert op0.opname == 'residual_call_ir_v'
+        if flag:
+            pseudo_op_name = 'raw_free'
+        else:
+            pseudo_op_name = 'raw_free_no_track_allocation'
+        assert op0.args[0].value == pseudo_op_name   # pseudo-function as a str
+        assert op1.opname == '-live-'
+
 def test_rename_on_links():
     v1 = Variable()
     v2 = Variable(); v2.concretetype = llmemory.Address
diff --git a/pypy/jit/metainterp/blackhole.py b/pypy/jit/metainterp/blackhole.py
--- a/pypy/jit/metainterp/blackhole.py
+++ b/pypy/jit/metainterp/blackhole.py
@@ -1504,7 +1504,6 @@
                         all_virtuals=None):
     from pypy.jit.metainterp.resume import blackhole_from_resumedata
     #debug_start('jit-blackhole')
-    metainterp_sd.profiler.start_blackhole()
     blackholeinterp = blackhole_from_resumedata(
         metainterp_sd.blackholeinterpbuilder,
         jitdriver_sd,
@@ -1518,10 +1517,9 @@
     current_exc = blackholeinterp._prepare_resume_from_failure(
         resumedescr.guard_opnum, dont_change_position)
 
-    try:
-        _run_forever(blackholeinterp, current_exc)
-    finally:
-        metainterp_sd.profiler.end_blackhole()
+    #try:
+    _run_forever(blackholeinterp, current_exc)
+    #finally:
         #debug_stop('jit-blackhole')
 
 def convert_and_run_from_pyjitpl(metainterp, raising_exception=False):
@@ -1529,7 +1527,6 @@
     # 'metainterp.framestack'.
     #debug_start('jit-blackhole')
     metainterp_sd = metainterp.staticdata
-    metainterp_sd.profiler.start_blackhole()
     nextbh = None
     for frame in metainterp.framestack:
         curbh = metainterp_sd.blackholeinterpbuilder.acquire_interp()
@@ -1546,8 +1543,7 @@
         firstbh.exception_last_value = current_exc
         current_exc = lltype.nullptr(rclass.OBJECTPTR.TO)
     #
-    try:
-        _run_forever(firstbh, current_exc)
-    finally:
-        metainterp_sd.profiler.end_blackhole()
+    #try:
+    _run_forever(firstbh, current_exc)
+    #finally:
         #debug_stop('jit-blackhole')
diff --git a/pypy/jit/metainterp/compile.py b/pypy/jit/metainterp/compile.py
--- a/pypy/jit/metainterp/compile.py
+++ b/pypy/jit/metainterp/compile.py
@@ -11,7 +11,7 @@
 from pypy.jit.metainterp.resoperation import ResOperation, rop, get_deep_immutable_oplist
 from pypy.jit.metainterp.history import TreeLoop, Box, History, JitCellToken, TargetToken
 from pypy.jit.metainterp.history import AbstractFailDescr, BoxInt
-from pypy.jit.metainterp.history import BoxPtr, BoxObj, BoxFloat, Const
+from pypy.jit.metainterp.history import BoxPtr, BoxObj, BoxFloat, Const, ConstInt
 from pypy.jit.metainterp import history
 from pypy.jit.metainterp.typesystem import llhelper, oohelper
 from pypy.jit.metainterp.optimize import InvalidLoop
@@ -254,7 +254,44 @@
     record_loop_or_bridge(metainterp_sd, loop)
     return target_token
 
+def patch_new_loop_to_load_virtualizable_fields(loop, jitdriver_sd):
+    vinfo = jitdriver_sd.virtualizable_info
+    extra_ops = []
+    inputargs = loop.inputargs
+    vable_box = inputargs[jitdriver_sd.index_of_virtualizable]
+    i = jitdriver_sd.num_red_args
+    loop.inputargs = inputargs[:i]
+    for descr in vinfo.static_field_descrs:
+        assert i < len(inputargs)
+        box = inputargs[i]
+        extra_ops.append(
+            ResOperation(rop.GETFIELD_GC, [vable_box], box, descr))
+        i += 1
+    arrayindex = 0
+    for descr in vinfo.array_field_descrs:
+        vable = vable_box.getref_base()
+        arraylen = vinfo.get_array_length(vable, arrayindex)
+        arraybox = BoxPtr()
+        extra_ops.append(
+            ResOperation(rop.GETFIELD_GC, [vable_box], arraybox, descr))
+        arraydescr = vinfo.array_descrs[arrayindex]
+        assert i + arraylen <= len(inputargs)
+        for index in range(arraylen):
+            box = inputargs[i]
+            extra_ops.append(
+                ResOperation(rop.GETARRAYITEM_GC,
+                             [arraybox, ConstInt(index)],
+                             box, descr=arraydescr))
+            i += 1
+        arrayindex += 1
+    assert i == len(inputargs)
+    loop.operations = extra_ops + loop.operations
+
 def send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop, type):
+    vinfo = jitdriver_sd.virtualizable_info
+    if vinfo is not None:
+        patch_new_loop_to_load_virtualizable_fields(loop, jitdriver_sd)
+
     original_jitcell_token = loop.original_jitcell_token
     jitdriver_sd.on_compile(metainterp_sd.logger_ops, original_jitcell_token,
                             loop.operations, type, greenkey)
@@ -435,14 +472,14 @@
         if self.must_compile(metainterp_sd, jitdriver_sd):
             self.start_compiling()
             try:
-                return self._trace_and_compile_from_bridge(metainterp_sd,
-                                                           jitdriver_sd)
+                self._trace_and_compile_from_bridge(metainterp_sd,
+                                                    jitdriver_sd)
             finally:
                 self.done_compiling()
         else:
             from pypy.jit.metainterp.blackhole import resume_in_blackhole
             resume_in_blackhole(metainterp_sd, jitdriver_sd, self)
-            assert 0, "unreachable"
+        assert 0, "unreachable"
 
     def _trace_and_compile_from_bridge(self, metainterp_sd, jitdriver_sd):
         # 'jitdriver_sd' corresponds to the outermost one, i.e. the one
@@ -451,7 +488,7 @@
         # jitdrivers.
         from pypy.jit.metainterp.pyjitpl import MetaInterp
         metainterp = MetaInterp(metainterp_sd, jitdriver_sd)
-        return metainterp.handle_guard_failure(self)
+        metainterp.handle_guard_failure(self)
     _trace_and_compile_from_bridge._dont_inline_ = True
 
     def must_compile(self, metainterp_sd, jitdriver_sd):
@@ -767,21 +804,25 @@
         assert exception, "PropagateExceptionDescr: no exception??"
         raise metainterp_sd.ExitFrameWithExceptionRef(cpu, exception)
 
-def compile_tmp_callback(cpu, jitdriver_sd, greenboxes, redboxes,
+def compile_tmp_callback(cpu, jitdriver_sd, greenboxes, redargtypes,
                          memory_manager=None):
     """Make a LoopToken that corresponds to assembler code that just
     calls back the interpreter.  Used temporarily: a fully compiled
     version of the code may end up replacing it.
     """
-    # 'redboxes' is only used to know the types of red arguments.
-    inputargs = [box.clonebox() for box in redboxes]
     jitcell_token = make_jitcell_token(jitdriver_sd)
-    # 'nb_red_args' might be smaller than len(redboxes),
-    # because it doesn't include the virtualizable boxes.
     nb_red_args = jitdriver_sd.num_red_args
+    assert len(redargtypes) == nb_red_args
+    inputargs = []
+    for kind in redargtypes:
+        if   kind == history.INT:   box = BoxInt()
+        elif kind == history.REF:   box = BoxPtr()
+        elif kind == history.FLOAT: box = BoxFloat()
+        else: raise AssertionError
+        inputargs.append(box)
     k = jitdriver_sd.portal_runner_adr
     funcbox = history.ConstInt(heaptracker.adr2int(k))
-    callargs = [funcbox] + greenboxes + inputargs[:nb_red_args]
+    callargs = [funcbox] + greenboxes + inputargs
     #
     result_type = jitdriver_sd.result_type
     if result_type == history.INT:
diff --git a/pypy/jit/metainterp/history.py b/pypy/jit/metainterp/history.py
--- a/pypy/jit/metainterp/history.py
+++ b/pypy/jit/metainterp/history.py
@@ -124,9 +124,6 @@
     def sort_key(self):
         raise NotImplementedError
 
-    def set_future_value(self, cpu, j):
-        raise NotImplementedError
-
     def nonnull(self):
         raise NotImplementedError
 
@@ -289,9 +286,6 @@
     def _get_hash_(self):
         return make_hashable_int(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_int(j, self.value)
-
     def same_constant(self, other):
         if isinstance(other, ConstInt):
             return self.value == other.value
@@ -329,9 +323,6 @@
     def _get_hash_(self):
         return longlong.gethash(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_float(j, self.value)
-
     def same_constant(self, other):
         if isinstance(other, ConstFloat):
             return self.value == other.value
@@ -378,9 +369,6 @@
     def getaddr(self):
         return llmemory.cast_ptr_to_adr(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_ref(j, self.value)
-
     def same_constant(self, other):
         if isinstance(other, ConstPtr):
             return self.value == other.value
@@ -432,9 +420,6 @@
         else:
             return 0
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_ref(j, self.value)
-
 ##    def getaddr(self):
 ##        # so far this is used only when calling
 ##        # CodeWriter.IndirectCallset.bytecode_for_address.  We don't need a
@@ -540,9 +525,6 @@
     def _get_hash_(self):
         return make_hashable_int(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_int(j, self.value)
-
     def nonnull(self):
         return self.value != 0
 
@@ -575,9 +557,6 @@
     def _get_hash_(self):
         return longlong.gethash(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_float(j, self.value)
-
     def nonnull(self):
         return self.value != longlong.ZEROF
 
@@ -620,9 +599,6 @@
         else:
             return 0
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_ref(j, self.value)
-
     def nonnull(self):
         return bool(self.value)
 
@@ -667,19 +643,12 @@
     def nonnull(self):
         return bool(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_ref(j, self.value)
-
     def repr_rpython(self):
         return repr_rpython(self, 'bo')
 
     _getrepr_ = repr_object
 
 
-def set_future_values(cpu, boxes):
-    for j in range(len(boxes)):
-        boxes[j].set_future_value(cpu, j)
-
 # ____________________________________________________________
 
 
@@ -768,10 +737,23 @@
 
 class TargetToken(AbstractDescr):
     def __init__(self, targeting_jitcell_token=None):
-        # The jitcell to which jumps might result in a jump to this label
+        # Warning, two different jitcell_tokens here!
+        #
+        # * 'targeting_jitcell_token' is only useful for the front-end,
+        #   and it means: consider the LABEL that uses this TargetToken.
+        #   At this position, the state is logically the one given
+        #   by targeting_jitcell_token.  So e.g. if we want to enter the
+        #   JIT with some given green args, if the jitcell matches, then
+        #   we can jump to this LABEL.
+        #
+        # * 'original_jitcell_token' is information from the backend's
+        #   point of view: it means that this TargetToken is used in
+        #   a LABEL that belongs to either:
+        #   - a loop; then 'original_jitcell_token' is this loop
+        #   - or a bridge; then 'original_jitcell_token' is the loop
+        #     out of which we made this bridge
+        #
         self.targeting_jitcell_token = targeting_jitcell_token
-        
-        # The jitcell where the trace containing the label with this TargetToken begins
         self.original_jitcell_token = None
 
         self.virtual_state = None
@@ -981,15 +963,19 @@
         self.aborted_keys = []
         self.invalidated_token_numbers = set()    # <- not RPython
         self.jitcell_token_wrefs = []
+        self.jitcell_dicts = []                   # <- not RPython
 
     def clear(self):
         del self.loops[:]
         del self.locations[:]
         del self.aborted_keys[:]
+        del self.jitcell_token_wrefs[:]
         self.invalidated_token_numbers.clear()
         self.compiled_count = 0
         self.enter_count = 0
         self.aborted_count = 0
+        for dict in self.jitcell_dicts:
+            dict.clear()
 
     def add_jitcell_token(self, token):
         assert isinstance(token, JitCellToken)
diff --git a/pypy/jit/metainterp/jitdriver.py b/pypy/jit/metainterp/jitdriver.py
--- a/pypy/jit/metainterp/jitdriver.py
+++ b/pypy/jit/metainterp/jitdriver.py
@@ -11,6 +11,7 @@
     #    self.portal_calldescr  ... pypy.jit.metainterp.warmspot
     #    self.num_green_args    ... pypy.jit.metainterp.warmspot
     #    self.num_red_args      ... pypy.jit.metainterp.warmspot
+    #    self.red_args_types    ... pypy.jit.metainterp.warmspot
     #    self.result_type       ... pypy.jit.metainterp.warmspot
     #    self.virtualizable_info... pypy.jit.metainterp.warmspot
     #    self.greenfield_info   ... pypy.jit.metainterp.warmspot
diff --git a/pypy/jit/metainterp/jitprof.py b/pypy/jit/metainterp/jitprof.py
--- a/pypy/jit/metainterp/jitprof.py
+++ b/pypy/jit/metainterp/jitprof.py
@@ -10,8 +10,6 @@
 counters="""
 TRACING
 BACKEND
-RUNNING
-BLACKHOLE
 OPS
 RECORDED_OPS
 GUARDS
@@ -67,18 +65,6 @@
     def end_backend(self):
         pass
 
-    def start_running(self):
-        pass
-
-    def end_running(self):
-        pass
-
-    def start_blackhole(self):
-        pass
-
-    def end_blackhole(self):
-        pass
-
     def count(self, kind, inc=1):
         pass
 
@@ -134,16 +120,6 @@
     def start_backend(self):   self._start(BACKEND)
     def end_backend(self):     self._end  (BACKEND)
 
-    # Don't record times for 'running' and 'blackhole' because there are
-    # too many of them: calling time.time() is a major blocker.
-    # If you are interested in these numbers, use 'PYPYLOG=file' and
-    # look at the resulting file with pypy/tool/logparser.py.
-    def start_running(self): self.count(RUNNING)
-    def end_running(self):   pass
-
-    def start_blackhole(self): self.count(BLACKHOLE)
-    def end_blackhole(self):   pass
-
     def count(self, kind, inc=1):
         self.counters[kind] += inc        
     
@@ -165,8 +141,6 @@
         calls = self.calls
         self._print_line_time("Tracing", cnt[TRACING],   tim[TRACING])
         self._print_line_time("Backend", cnt[BACKEND],   tim[BACKEND])
-        self._print_intline("Running asm", cnt[RUNNING])
-        self._print_intline("Blackhole", cnt[BLACKHOLE])
         line = "TOTAL:      \t\t%f" % (self.tk - self.starttime, )
         debug_print(line)
         self._print_intline("ops", cnt[OPS])
diff --git a/pypy/jit/metainterp/memmgr.py b/pypy/jit/metainterp/memmgr.py
--- a/pypy/jit/metainterp/memmgr.py
+++ b/pypy/jit/metainterp/memmgr.py
@@ -1,5 +1,5 @@
 import math
-from pypy.rlib.rarithmetic import r_int64
+from pypy.rlib.rarithmetic import r_int64, r_uint
 from pypy.rlib.debug import debug_start, debug_print, debug_stop
 from pypy.rlib.objectmodel import we_are_translated
 
@@ -21,6 +21,7 @@
 #
 
 class MemoryManager(object):
+    NO_NEXT_CHECK = r_int64(2 ** 63 - 1)
 
     def __init__(self):
         self.check_frequency = -1
@@ -36,12 +37,13 @@
         # According to my estimates it's about 5e9 years given 1000 loops
         # per second
         self.current_generation = r_int64(1)
-        self.next_check = r_int64(-1)
+        self.next_check = self.NO_NEXT_CHECK
         self.alive_loops = {}
+        self._cleanup_jitcell_dicts = lambda: None
 
     def set_max_age(self, max_age, check_frequency=0):
         if max_age <= 0:
-            self.next_check = r_int64(-1)
+            self.next_check = self.NO_NEXT_CHECK
         else:
             self.max_age = max_age
             if check_frequency <= 0:
@@ -49,10 +51,11 @@
             self.check_frequency = check_frequency
             self.next_check = self.current_generation + 1
 
-    def next_generation(self):
+    def next_generation(self, do_cleanups_now=True):
         self.current_generation += 1
-        if self.current_generation == self.next_check:
+        if do_cleanups_now and self.current_generation >= self.next_check:
             self._kill_old_loops_now()
+            self._cleanup_jitcell_dicts()
             self.next_check = self.current_generation + self.check_frequency
 
     def keep_loop_alive(self, looptoken):
@@ -81,3 +84,22 @@
             # a single one is not enough for all tests :-(
             rgc.collect(); rgc.collect(); rgc.collect()
         debug_stop("jit-mem-collect")
+
+    def get_current_generation_uint(self):
+        """Return the current generation, possibly truncated to a uint.
+        To use only as an approximation for decaying counters."""
+        return r_uint(self.current_generation)
+
+    def record_jitcell_dict(self, callback):
+        """NOT_RPYTHON.  The given jitcell_dict is a dict that needs
+        occasional clean-ups of old cells.  A cell is old if it never
+        reached the threshold, and its counter decayed to a tiny value."""
+        # note that the various jitcell_dicts have different RPython types,
+        # so we have to make a different function for each one.  These
+        # functions are chained to each other: each calls the previous one.
+        def cleanup_dict():
+            callback()
+            cleanup_previous()
+        #
+        cleanup_previous = self._cleanup_jitcell_dicts
+        self._cleanup_jitcell_dicts = cleanup_dict
diff --git a/pypy/jit/metainterp/optimizeopt/heap.py b/pypy/jit/metainterp/optimizeopt/heap.py
--- a/pypy/jit/metainterp/optimizeopt/heap.py
+++ b/pypy/jit/metainterp/optimizeopt/heap.py
@@ -246,15 +246,16 @@
                 self.force_lazy_setfields_and_arrayitems_for_guard())
             return
         opnum = op.getopnum()
-        if (opnum == rop.SETFIELD_GC or        # handled specially
-            opnum == rop.SETFIELD_RAW or       # no effect on GC struct/array
-            opnum == rop.SETARRAYITEM_GC or    # handled specially
-            opnum == rop.SETARRAYITEM_RAW or   # no effect on GC struct
-            opnum == rop.STRSETITEM or         # no effect on GC struct/array
-            opnum == rop.UNICODESETITEM or     # no effect on GC struct/array
-            opnum == rop.DEBUG_MERGE_POINT or  # no effect whatsoever
-            opnum == rop.COPYSTRCONTENT or     # no effect on GC struct/array
-            opnum == rop.COPYUNICODECONTENT):  # no effect on GC struct/array
+        if (opnum == rop.SETFIELD_GC or          # handled specially
+            opnum == rop.SETFIELD_RAW or         # no effect on GC struct/array
+            opnum == rop.SETARRAYITEM_GC or      # handled specially
+            opnum == rop.SETARRAYITEM_RAW or     # no effect on GC struct
+            opnum == rop.SETINTERIORFIELD_RAW or # no effect on GC struct
+            opnum == rop.STRSETITEM or           # no effect on GC struct/array
+            opnum == rop.UNICODESETITEM or       # no effect on GC struct/array
+            opnum == rop.DEBUG_MERGE_POINT or    # no effect whatsoever
+            opnum == rop.COPYSTRCONTENT or       # no effect on GC struct/array
+            opnum == rop.COPYUNICODECONTENT):    # no effect on GC struct/array
             return
         assert opnum != rop.CALL_PURE
         if (opnum == rop.CALL or
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
@@ -7744,6 +7744,22 @@
         """
         self.optimize_loop(ops, expected)
 
+    def test_setinteriorfield_should_not_clear_cache(self):
+        ops = """
+        [i0, p0]
+        i2 = getfield_gc(p0, descr=adescr)
+        i3 = call(i2, descr=nonwritedescr)
+        setinteriorfield_raw(i0, i2, i3)
+        jump(i0, p0)
+        """
+        expected = """
+        [i0, p0, i2]
+        i3 = call(i2, descr=nonwritedescr)
+        setinteriorfield_raw(i0, i2, i3)
+        jump(i0, p0, i2)
+        """
+        self.optimize_loop(ops, expected)
+
 class TestLLtype(OptimizeOptTest, LLtypeMixin):
     pass
 
diff --git a/pypy/jit/metainterp/pyjitpl.py b/pypy/jit/metainterp/pyjitpl.py
--- a/pypy/jit/metainterp/pyjitpl.py
+++ b/pypy/jit/metainterp/pyjitpl.py
@@ -1810,7 +1810,7 @@
 
     def _interpret(self):
         # Execute the frames forward until we raise a DoneWithThisFrame,
-        # a ExitFrameWithException, or a GenerateMergePoint exception.
+        # an ExitFrameWithException, or a ContinueRunningNormally exception.
         self.staticdata.stats.entered()
         while True:
             self.framestack[-1].run_one_step()
@@ -1858,8 +1858,6 @@
         self.seen_loop_header_for_jdindex = -1
         try:
             self.interpret()
-        except GenerateMergePoint, gmp:
-            return self.designate_target_loop(gmp)
         except SwitchToBlackhole, stb:
             self.run_blackhole_interp_to_cancel_tracing(stb)
         assert False, "should always raise"
@@ -1894,8 +1892,6 @@
             if self.resumekey_original_loop_token is None:   # very rare case
                 raise SwitchToBlackhole(ABORT_BRIDGE)
             self.interpret()
-        except GenerateMergePoint, gmp:
-            return self.designate_target_loop(gmp)
         except SwitchToBlackhole, stb:
             self.run_blackhole_interp_to_cancel_tracing(stb)
         assert False, "should always raise"
@@ -1980,12 +1976,48 @@
         start = len(self.history.operations)
         self.current_merge_points.append((live_arg_boxes, start))
 
-    def designate_target_loop(self, gmp):
-        loop_token = gmp.target_loop_token
+    def _unpack_boxes(self, boxes, start, stop):
+        ints = []; refs = []; floats = []
+        for i in range(start, stop):
+            box = boxes[i]
+            if   box.type == history.INT: ints.append(box.getint())
+            elif box.type == history.REF: refs.append(box.getref_base())
+            elif box.type == history.FLOAT:floats.append(box.getfloatstorage())
+            else: assert 0
+        return ints[:], refs[:], floats[:]
+
+    def raise_continue_running_normally(self, live_arg_boxes, loop_token):
+        self.history.inputargs = None
+        self.history.operations = None
+        # For simplicity, we just raise ContinueRunningNormally here and
+        # ignore the loop_token passed in.  It means that we go back to
+        # interpreted mode, but it should come back very quickly to the
+        # JIT, probably find the same 'loop_token', and execute it.
+        if we_are_translated():
+            num_green_args = self.jitdriver_sd.num_green_args
+            gi, gr, gf = self._unpack_boxes(live_arg_boxes, 0, num_green_args)
+            ri, rr, rf = self._unpack_boxes(live_arg_boxes, num_green_args,
+                                            len(live_arg_boxes))
+            CRN = self.staticdata.ContinueRunningNormally
+            raise CRN(gi, gr, gf, ri, rr, rf)
+        else:
+            # However, in order to keep the existing tests working
+            # (which are based on the assumption that 'loop_token' is
+            # directly used here), a bit of custom non-translatable code...
+            self._nontranslated_run_directly(live_arg_boxes, loop_token)
+            assert 0, "unreachable"
+
+    def _nontranslated_run_directly(self, live_arg_boxes, loop_token):
+        "NOT_RPYTHON"
+        args = []
         num_green_args = self.jitdriver_sd.num_green_args
-        residual_args = gmp.argboxes[num_green_args:]
-        history.set_future_values(self.cpu, residual_args)
-        return loop_token
+        num_red_args = self.jitdriver_sd.num_red_args
+        for box in live_arg_boxes[num_green_args:num_green_args+num_red_args]:
+            if   box.type == history.INT: args.append(box.getint())
+            elif box.type == history.REF: args.append(box.getref_base())
+            elif box.type == history.FLOAT: args.append(box.getfloatstorage())
+            else: assert 0
+        self.jitdriver_sd.warmstate.execute_assembler(loop_token, *args)
 
     def prepare_resume_from_failure(self, opnum, dont_change_position=False):
         frame = self.framestack[-1]
@@ -2054,10 +2086,9 @@
 
 
         if target_token is not None: # raise if it *worked* correctly
-            self.history.inputargs = None
-            self.history.operations = None
             assert isinstance(target_token, TargetToken)
-            raise GenerateMergePoint(live_arg_boxes, target_token.targeting_jitcell_token)
+            jitcell_token = target_token.targeting_jitcell_token
+            self.raise_continue_running_normally(live_arg_boxes, jitcell_token)
 
     def compile_trace(self, live_arg_boxes, start_resumedescr):
         num_green_args = self.jitdriver_sd.num_green_args
@@ -2075,10 +2106,9 @@
         finally:
             self.history.operations.pop()     # remove the JUMP
         if target_token is not None: # raise if it *worked* correctly
-            self.history.inputargs = None
-            self.history.operations = None
             assert isinstance(target_token, TargetToken)
-            raise GenerateMergePoint(live_arg_boxes, target_token.targeting_jitcell_token)
+            jitcell_token = target_token.targeting_jitcell_token
+            self.raise_continue_running_normally(live_arg_boxes, jitcell_token)
 
     def compile_bridge_and_loop(self, original_boxes, live_arg_boxes, start,
                                 bridge_arg_boxes, start_resumedescr):
@@ -2114,10 +2144,8 @@
         except RetraceLoop:
             assert False
         assert target_loop_token is not None
-
-        self.history.inputargs = None
-        self.history.operations = None
-        raise GenerateMergePoint(live_arg_boxes, old_loop_tokens[0])
+        self.raise_continue_running_normally(live_arg_boxes,
+                                             old_loop_tokens[0])
 
     def compile_done_with_this_frame(self, exitbox):
         self.gen_store_back_in_virtualizable()
@@ -2395,22 +2423,6 @@
                                             abox, ConstInt(j), itembox)
             assert i + 1 == len(self.virtualizable_boxes)
 
-    def gen_load_from_other_virtualizable(self, vinfo, vbox):
-        boxes = []
-        assert vinfo is not None
-        for i in range(vinfo.num_static_extra_boxes):
-            descr = vinfo.static_field_descrs[i]
-            boxes.append(self.execute_and_record(rop.GETFIELD_GC, descr, vbox))
-        virtualizable = vinfo.unwrap_virtualizable_box(vbox)
-        for k in range(vinfo.num_arrays):
-            descr = vinfo.array_field_descrs[k]
-            abox = self.execute_and_record(rop.GETFIELD_GC, descr, vbox)
-            descr = vinfo.array_descrs[k]
-            for j in range(vinfo.get_array_length(virtualizable, k)):
-                boxes.append(self.execute_and_record(rop.GETARRAYITEM_GC, descr,
-                                                     abox, ConstInt(j)))
-        return boxes
-
     def replace_box(self, oldbox, newbox):
         assert isinstance(oldbox, Box)
         for frame in self.framestack:
@@ -2482,25 +2494,13 @@
         greenargs = arglist[1:num_green_args+1]
         args = arglist[num_green_args+1:]
         assert len(args) == targetjitdriver_sd.num_red_args
-        vinfo = targetjitdriver_sd.virtualizable_info
-        if vinfo is not None:
-            index = targetjitdriver_sd.index_of_virtualizable
-            vbox = args[index]
-            args = args + self.gen_load_from_other_virtualizable(vinfo, vbox)
-            # ^^^ and not "+=", which makes 'args' a resizable list
         warmrunnerstate = targetjitdriver_sd.warmstate
-        token = warmrunnerstate.get_assembler_token(greenargs, args)
+        token = warmrunnerstate.get_assembler_token(greenargs)
         op = op.copy_and_change(rop.CALL_ASSEMBLER, args=args, descr=token)
         self.history.operations.append(op)
 
 # ____________________________________________________________
 
-class GenerateMergePoint(JitException):
-    def __init__(self, args, target_loop_token):
-        assert target_loop_token is not None
-        self.argboxes = args
-        self.target_loop_token = target_loop_token
-
 class ChangeFrame(JitException):
     """Raised after we mutated metainterp.framestack, in order to force
     it to reload the current top-of-stack frame that gets interpreted."""
diff --git a/pypy/jit/metainterp/resoperation.py b/pypy/jit/metainterp/resoperation.py
--- a/pypy/jit/metainterp/resoperation.py
+++ b/pypy/jit/metainterp/resoperation.py
@@ -381,11 +381,11 @@
     'GUARD_ISNULL/1d',
     'GUARD_NONNULL_CLASS/2d',
     '_GUARD_FOLDABLE_LAST',
-    'GUARD_NO_EXCEPTION/0d',
-    'GUARD_EXCEPTION/1d',
+    'GUARD_NO_EXCEPTION/0d',    # may be called with an exception currently set
+    'GUARD_EXCEPTION/1d',       # may be called with an exception currently set
     'GUARD_NO_OVERFLOW/0d',
     'GUARD_OVERFLOW/0d',
-    'GUARD_NOT_FORCED/0d',
+    'GUARD_NOT_FORCED/0d',      # may be called with an exception currently set
     'GUARD_NOT_INVALIDATED/0d',
     '_GUARD_LAST', # ----- end of guard operations -----
 
diff --git a/pypy/jit/metainterp/test/support.py b/pypy/jit/metainterp/test/support.py
--- a/pypy/jit/metainterp/test/support.py
+++ b/pypy/jit/metainterp/test/support.py
@@ -4,9 +4,9 @@
 from pypy.rpython.ootypesystem import ootype
 from pypy.jit.backend.llgraph import runner
 from pypy.jit.metainterp.warmspot import ll_meta_interp, get_stats
+from pypy.jit.metainterp.warmstate import unspecialize_value
 from pypy.jit.metainterp.optimizeopt import ALL_OPTS_DICT
 from pypy.jit.metainterp import pyjitpl, history
-from pypy.jit.metainterp.warmstate import set_future_value
 from pypy.jit.codewriter.policy import JitPolicy
 from pypy.jit.codewriter import codewriter, longlong
 from pypy.rlib.rfloat import isnan
@@ -136,11 +136,11 @@
     procedure_token = metainterp.get_procedure_token(args[:num_green_args])
     # a loop was successfully created by _run_with_pyjitpl(); call it
     cpu = metainterp.cpu
+    args1 = []
     for i in range(len(args) - num_green_args):
         x = args[num_green_args + i]
-        typecode = history.getkind(lltype.typeOf(x))
-        set_future_value(cpu, i, x, typecode)
-    faildescr = cpu.execute_token(procedure_token)
+        args1.append(unspecialize_value(x))
+    faildescr = cpu.execute_token(procedure_token, *args1)
     assert faildescr.__class__.__name__.startswith('DoneWithThisFrameDescr')
     if metainterp.jitdriver_sd.result_type == history.INT:
         return cpu.get_latest_value_int(0)
diff --git a/pypy/jit/metainterp/test/test_ajit.py b/pypy/jit/metainterp/test/test_ajit.py
--- a/pypy/jit/metainterp/test/test_ajit.py
+++ b/pypy/jit/metainterp/test/test_ajit.py
@@ -9,7 +9,6 @@
 from pypy.jit.metainterp.test.support import LLJitMixin, OOJitMixin, noConst
 from pypy.jit.metainterp.typesystem import LLTypeHelper, OOTypeHelper
 from pypy.jit.metainterp.warmspot import get_stats
-from pypy.jit.metainterp.warmstate import set_future_value
 from pypy.rlib import rerased
 from pypy.rlib.jit import (JitDriver, we_are_jitted, hint, dont_look_inside,
     loop_invariant, elidable, promote, jit_debug, assert_green,
@@ -2911,6 +2910,27 @@
         res = self.meta_interp(f, [32])
         assert res == f(32)
 
+    def test_decay_counters(self):
+        myjitdriver = JitDriver(greens = ['m'], reds = ['n'])
+        def f(m, n):
+            while n > 0:
+                myjitdriver.jit_merge_point(m=m, n=n)
+                n += m
+                n -= m
+                n -= 1
+        def main():
+            f(5, 7)      # run 7x with m=5           counter[m=5] = 7
+            f(15, 10)    # compiles one loop         counter[m=5] = 3  (automatic decay)
+            f(5, 5)      # run 5x with m=5           counter[m=5] = 8
+        #
+        self.meta_interp(main, [], decay_halflife=1,
+                         function_threshold=0, threshold=9, trace_eagerness=99)
+        self.check_trace_count(1)
+        #
+        self.meta_interp(main, [], decay_halflife=1,
+                         function_threshold=0, threshold=8, trace_eagerness=99)
+        self.check_trace_count(2)
+
 
 class TestOOtype(BasicTests, OOJitMixin):
 
diff --git a/pypy/jit/metainterp/test/test_compile.py b/pypy/jit/metainterp/test/test_compile.py
--- a/pypy/jit/metainterp/test/test_compile.py
+++ b/pypy/jit/metainterp/test/test_compile.py
@@ -55,6 +55,7 @@
         warmstate = FakeState()
         on_compile = staticmethod(lambda *args: None)
         on_compile_bridge = staticmethod(lambda *args: None)
+        virtualizable_info = None
 
 def test_compile_loop():
     cpu = FakeCPU()
@@ -171,23 +172,17 @@
         result_type = INT
     #
     loop_token = compile_tmp_callback(cpu, FakeJitDriverSD(),
-                                      [ConstInt(12), ConstInt(34)],
-                                      [BoxInt(56), ConstInt(78), BoxInt(90)])
+                                      [ConstInt(12), ConstInt(34)], "ii")
     #
     raiseme = None
-    cpu.set_future_value_int(0, -156)
-    cpu.set_future_value_int(1, -178)
-    cpu.set_future_value_int(2, -190)     # passed in, but dropped
-    fail_descr = cpu.execute_token(loop_token)
+    # only two arguments must be passed in
+    fail_descr = cpu.execute_token(loop_token, -156, -178)
     assert fail_descr is FakeJitDriverSD().portal_finishtoken
     #
     EXC = lltype.GcStruct('EXC')
     llexc = lltype.malloc(EXC)
     raiseme = LLException("exception class", llexc)
-    cpu.set_future_value_int(0, -156)
-    cpu.set_future_value_int(1, -178)
-    cpu.set_future_value_int(2, -190)
-    fail_descr = cpu.execute_token(loop_token)
+    fail_descr = cpu.execute_token(loop_token, -156, -178)
     assert isinstance(fail_descr, compile.PropagateExceptionDescr)
     got = cpu.grab_exc_value()
     assert lltype.cast_opaque_ptr(lltype.Ptr(EXC), got) == llexc
@@ -196,10 +191,7 @@
         class ExitFrameWithExceptionRef(Exception):
             pass
     FakeMetaInterpSD.cpu = cpu
-    cpu.set_future_value_int(0, -156)
-    cpu.set_future_value_int(1, -178)
-    cpu.set_future_value_int(2, -190)
-    fail_descr = cpu.execute_token(loop_token)
+    fail_descr = cpu.execute_token(loop_token, -156, -178)
     try:
         fail_descr.handle_fail(FakeMetaInterpSD(), None)
     except FakeMetaInterpSD.ExitFrameWithExceptionRef, e:
diff --git a/pypy/jit/metainterp/test/test_jitprof.py b/pypy/jit/metainterp/test/test_jitprof.py
--- a/pypy/jit/metainterp/test/test_jitprof.py
+++ b/pypy/jit/metainterp/test/test_jitprof.py
@@ -10,7 +10,7 @@
         self.counter = 123456
         Profiler.start(self)
         self.events = []
-        self.times = [0, 0, 0, 0]
+        self.times = [0, 0]
     
     def timer(self):
         self.counter += 1
@@ -24,12 +24,6 @@
         Profiler._end(self, event)
         self.events.append(~event)
 
-    def start_running(self):   self._start(RUNNING)
-    def end_running(self):     self._end(RUNNING)
-
-    def start_blackhole(self): self._start(BLACKHOLE)
-    def end_blackhole(self):   self._end(BLACKHOLE)
-
 class ProfilerMixin(LLJitMixin):
     def meta_interp(self, *args, **kwds):
         kwds = kwds.copy()
@@ -56,14 +50,10 @@
             BACKEND,
             ~ BACKEND,
             ~ TRACING,
-            RUNNING,
-            ~ RUNNING,
-            BLACKHOLE,
-            ~ BLACKHOLE
             ]
         assert profiler.events == expected
-        assert profiler.times == [2, 1, 1, 1]
-        assert profiler.counters == [1, 1, 1, 1, 3, 3, 1, 15, 2, 0, 0, 0, 0,
+        assert profiler.times == [2, 1]
+        assert profiler.counters == [1, 1, 3, 3, 1, 15, 2, 0, 0, 0, 0,
                                      0, 0, 0, 0, 0]
 
     def test_simple_loop_with_call(self):
diff --git a/pypy/jit/metainterp/test/test_rawmem.py b/pypy/jit/metainterp/test/test_rawmem.py
--- a/pypy/jit/metainterp/test/test_rawmem.py
+++ b/pypy/jit/metainterp/test/test_rawmem.py
@@ -8,7 +8,7 @@
         VOID_TP = lltype.Array(lltype.Void, hints={"nolength": True, "uncast_on_llgraph": True})
         class A(object):
             def __init__(self, x):
-                self.storage = rffi.cast(lltype.Ptr(VOID_TP), x)\
+                self.storage = rffi.cast(lltype.Ptr(VOID_TP), x)
 
         def f(n):
             x = lltype.malloc(TP, n, flavor="raw", zero=True)
@@ -19,4 +19,14 @@
             lltype.free(x, flavor="raw")
             return s
         res = self.interp_operations(f, [10])
-        assert res == 1.0
\ No newline at end of file
+
+    def test_fixed_size_malloc(self):
+        TIMEVAL = lltype.Struct('dummy', ('tv_sec', rffi.LONG), ('tv_usec', rffi.LONG))
+        def f():
+            p = lltype.malloc(TIMEVAL, flavor='raw')
+            lltype.free(p, flavor='raw')
+            return 42
+        res = self.interp_operations(f, [])
+        assert res == 42
+        self.check_operations_history({'call': 2, 'guard_no_exception': 1,
+                                       'finish': 1})
diff --git a/pypy/jit/metainterp/test/test_tl.py b/pypy/jit/metainterp/test/test_tl.py
--- a/pypy/jit/metainterp/test/test_tl.py
+++ b/pypy/jit/metainterp/test/test_tl.py
@@ -72,16 +72,16 @@
         res = self.meta_interp(main, [0, 6], listops=True,
                                backendopt=True)
         assert res == 5040
-        self.check_resops({'jump': 1, 'int_le': 2, 'guard_value': 1,
-                           'int_mul': 2, 'guard_false': 2, 'int_sub': 2})
+        self.check_simple_loop({'jump': 1, 'int_le': 1,
+                                'int_mul': 1, 'guard_false': 1, 'int_sub': 1})
 
     def test_tl_2(self):
         main = self._get_main()
         res = self.meta_interp(main, [1, 10], listops=True,
                                backendopt=True)
         assert res == main(1, 10)
-        self.check_resops({'int_le': 2, 'int_sub': 2, 'jump': 1,
-                           'guard_false': 2, 'guard_value': 1})
+        self.check_simple_loop({'int_le': 1, 'int_sub': 1, 'jump': 1,
+                                'guard_false': 1})
 
     def test_tl_call(self, listops=True, policy=None):
         from pypy.jit.tl.tl import interp
diff --git a/pypy/jit/metainterp/test/test_virtualizable.py b/pypy/jit/metainterp/test/test_virtualizable.py
--- a/pypy/jit/metainterp/test/test_virtualizable.py
+++ b/pypy/jit/metainterp/test/test_virtualizable.py
@@ -77,7 +77,7 @@
             return xy.inst_x
         res = self.meta_interp(f, [20])
         assert res == 30
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_simple_loop(setfield_gc=0, getfield_gc=0)
 
     def test_preexisting_access_2(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'xy'],
@@ -102,7 +102,8 @@
         assert f(5) == 185
         res = self.meta_interp(f, [5])
         assert res == 185
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_resops(setfield_gc=0,
+                          getfield_gc=2)  # <= at the header of the loop
 
     def test_two_paths_access(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'xy'],
@@ -124,7 +125,7 @@
             return xy.inst_x
         res = self.meta_interp(f, [18])
         assert res == 10118
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_resops(setfield_gc=0, getfield_gc=2)
 
     def test_synchronize_in_return(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'xy'],
@@ -146,7 +147,7 @@
             return xy.inst_x
         res = self.meta_interp(f, [18])
         assert res == 10180
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_resops(setfield_gc=0, getfield_gc=2)
 
     def test_virtualizable_and_greens(self):
         myjitdriver = JitDriver(greens = ['m'], reds = ['n', 'xy'],
@@ -174,7 +175,7 @@
             return res
         res = self.meta_interp(f, [40])
         assert res == 50 * 4
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_resops(setfield_gc=0, getfield_gc=4)
 
     def test_double_frame(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'xy', 'other'],
@@ -197,7 +198,8 @@
             return xy.inst_x
         res = self.meta_interp(f, [20])
         assert res == 134
-        self.check_resops(setfield_gc=2, getfield_gc=1)
+        self.check_simple_loop(setfield_gc=1, getfield_gc=0)
+        self.check_resops(setfield_gc=2, getfield_gc=3)
 
     # ------------------------------
 
@@ -247,8 +249,8 @@
             return xy2.inst_l1[2]
         res = self.meta_interp(f, [16])
         assert res == 3001 + 16 * 80
-        self.check_resops(setarrayitem_gc=0, setfield_gc=0,
-                          getarrayitem_gc=0, getfield_gc=0)
+        self.check_simple_loop(setarrayitem_gc=0, setfield_gc=0,
+                               getarrayitem_gc=0, getfield_gc=0)
 
     def test_synchronize_arrays_in_return(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'xy2'],
@@ -278,7 +280,8 @@
         assert f(18) == 10360
         res = self.meta_interp(f, [18])
         assert res == 10360
-        self.check_resops(setfield_gc=0, getarrayitem_gc=0, getfield_gc=0)
+        self.check_simple_loop(setfield_gc=0, getarrayitem_gc=0,
+                               getfield_gc=0, setarrayitem_gc=0)
 
     def test_array_length(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'xy2'],
@@ -304,8 +307,8 @@
             return xy2.inst_l1[1]
         res = self.meta_interp(f, [18])
         assert res == 2941309 + 18
-        self.check_resops(setfield_gc=0, getarrayitem_gc=0,
-                          arraylen_gc=0, getfield_gc=0)
+        self.check_simple_loop(setfield_gc=0, getarrayitem_gc=0,
+                               arraylen_gc=0, getfield_gc=0)
 
     def test_residual_function(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'xy2'],
@@ -338,8 +341,8 @@
             return xy2.inst_l1[1]
         res = self.meta_interp(f, [18])
         assert res == 2941309 + 18
-        self.check_resops(call=2, setfield_gc=0, getarrayitem_gc=0,
-                          arraylen_gc=2, getfield_gc=0)
+        self.check_simple_loop(call=1, setfield_gc=0, getarrayitem_gc=0,
+                               arraylen_gc=1, getfield_gc=0)
 
     def test_double_frame_array(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'xy2', 'other'],
@@ -375,8 +378,8 @@
         expected = f(20)
         res = self.meta_interp(f, [20], enable_opts='')
         assert res == expected
-        self.check_resops(setarrayitem_gc=1, setfield_gc=0,
-                          getarrayitem_gc=1, arraylen_gc=1, getfield_gc=1)
+        self.check_simple_loop(setarrayitem_gc=1, setfield_gc=0,
+                               getarrayitem_gc=1, arraylen_gc=1, getfield_gc=1)
 
     # ------------------------------
 
@@ -423,7 +426,8 @@
         assert f(18) == 10360
         res = self.meta_interp(f, [18])
         assert res == 10360
-        self.check_resops(setfield_gc=0, getarrayitem_gc=0, getfield_gc=0)
+        self.check_simple_loop(getfield_gc=0, getarrayitem_gc=0,
+                               setfield_gc=0, setarrayitem_gc=0)
 
     # ------------------------------
 
@@ -457,7 +461,7 @@
 
         res = self.meta_interp(f, [10])
         assert res == 55
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_simple_loop(setfield_gc=0, getfield_gc=0)
 
     def test_virtualizable_with_array(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'x', 'frame'],
@@ -491,7 +495,8 @@
 
         res = self.meta_interp(f, [10, 1], listops=True)
         assert res == f(10, 1)
-        self.check_resops(getarrayitem_gc=0)
+        self.check_simple_loop(getfield_gc=0, getarrayitem_gc=0)
+        self.check_resops(getfield_gc=2, getarrayitem_gc=4)
 
     def test_subclass_of_virtualizable(self):
         myjitdriver = JitDriver(greens = [], reds = ['frame'],
@@ -519,7 +524,7 @@
 
         res = self.meta_interp(f, [10])
         assert res == 55
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_simple_loop(setfield_gc=0, getfield_gc=0)
 
     def test_external_pass(self):
         jitdriver = JitDriver(greens = [], reds = ['n', 'z', 'frame'],
@@ -1037,7 +1042,7 @@
 
         res = self.meta_interp(f, [10])
         assert res == 55
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_simple_loop(setfield_gc=0, getfield_gc=0)
 
         from pypy.jit.backend.test.support import BaseCompiledMixin
         if isinstance(self, BaseCompiledMixin):
@@ -1197,7 +1202,8 @@
 
         res = self.meta_interp(f, [10])
         assert res == 155
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_simple_loop(setfield_gc=0, getfield_gc=0)
+        self.check_resops(setfield_gc=0, getfield_gc=2)
 
     def test_blackhole_should_synchronize(self):
         myjitdriver = JitDriver(greens = [], reds = ['frame'],
@@ -1233,7 +1239,8 @@
 
         res = self.meta_interp(f, [10])
         assert res == 155
-        self.check_resops(setfield_gc=0, getfield_gc=0)
+        self.check_simple_loop(setfield_gc=0, getfield_gc=0)
+        self.check_resops(setfield_gc=0, getfield_gc=2)
 
     def test_blackhole_should_not_reenter(self):
         if not self.basic:
diff --git a/pypy/jit/metainterp/test/test_warmspot.py b/pypy/jit/metainterp/test/test_warmspot.py
--- a/pypy/jit/metainterp/test/test_warmspot.py
+++ b/pypy/jit/metainterp/test/test_warmspot.py
@@ -303,18 +303,11 @@
         exc_vtable = lltype.malloc(OBJECT_VTABLE, immortal=True)
         cls.exc_vtable = exc_vtable
 
-        class FakeLoopToken:
+        class FakeFailDescr(object):
             def __init__(self, no):
                 self.no = no
-                self.generation = 0
-
-        class FakeFailDescr(object):
-            def __init__(self, looptoken):
-                assert isinstance(looptoken, FakeLoopToken)
-                self.looptoken = looptoken
-            
             def handle_fail(self, metainterp_sd, jitdrivers_sd):
-                no = self.looptoken.no
+                no = self.no
                 if no == 0:
                     raise metainterp_sd.warmrunnerdesc.DoneWithThisFrameInt(3)
                 if no == 1:
@@ -326,7 +319,7 @@
                     raise metainterp_sd.warmrunnerdesc.ExitFrameWithExceptionRef(
                         metainterp_sd.cpu,
                         lltype.cast_opaque_ptr(llmemory.GCREF, exc))
-                return self.looptoken
+                assert 0
 
         class FakeDescr:
             def as_vtable_size_descr(self):
@@ -353,11 +346,10 @@
             sizeof       = nodescr
 
             def get_fail_descr_from_number(self, no):
-                return FakeFailDescr(FakeLoopToken(no))
+                return FakeFailDescr(no)
 
-            def execute_token(self, token):
-                assert token.no == 2
-                return FakeFailDescr(FakeLoopToken(1))
+            def make_execute_token(self, *ARGS):
+                return "not callable"
 
         driver = JitDriver(reds = ['red'], greens = ['green'])
         
@@ -381,7 +373,6 @@
         [jd] = self.desc.jitdrivers_sd
         assert jd._assembler_call_helper(0, 0) == 3
         assert jd._assembler_call_helper(1, 0) == 10
-        assert jd._assembler_call_helper(2, 0) == 10
         try:
             jd._assembler_call_helper(3, 0)
         except LLException, lle:
diff --git a/pypy/jit/metainterp/test/test_warmstate.py b/pypy/jit/metainterp/test/test_warmstate.py
--- a/pypy/jit/metainterp/test/test_warmstate.py
+++ b/pypy/jit/metainterp/test/test_warmstate.py
@@ -1,3 +1,4 @@
+import math
 from pypy.rpython.test.test_llinterp import interpret
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi
 from pypy.rpython.ootypesystem import ootype
@@ -8,7 +9,7 @@
 from pypy.jit.metainterp.history import BoxInt, BoxFloat, BoxPtr
 from pypy.jit.metainterp.history import ConstInt, ConstFloat, ConstPtr
 from pypy.jit.codewriter import longlong
-from pypy.rlib.rarithmetic import r_singlefloat
+from pypy.rlib.rarithmetic import r_singlefloat, r_uint
 
 def boxfloat(x):
     return BoxFloat(longlong.getfloatstorage(x))
@@ -151,29 +152,6 @@
     assert get_jitcell(False, 42, 0.25) is cell4
     assert cell1 is not cell3 is not cell4 is not cell1
 
-def test_make_set_future_values():
-    future_values = {}
-    class FakeCPU:
-        def set_future_value_int(self, j, value):
-            future_values[j] = "int", value
-        def set_future_value_float(self, j, value):
-            future_values[j] = "float", value
-    class FakeWarmRunnerDesc:
-        cpu = FakeCPU()
-        memory_manager = None
-    class FakeJitDriverSD:
-        _red_args_types = ["int", "float"]
-        virtualizable_info = None
-    #
-    state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
-    set_future_values = state.make_set_future_values()
-    set_future_values(5, 42.5)
-    assert future_values == {
-        0: ("int", 5),
-        1: ("float", longlong.getfloatstorage(42.5)),
-    }
-    assert set_future_values is state.make_set_future_values()
-
 def test_make_unwrap_greenkey():
     class FakeJitDriverSD:
         _green_args_spec = [lltype.Signed, lltype.Float]
@@ -210,6 +188,7 @@
         _confirm_enter_jit_ptr = None
         _can_never_inline_ptr = None
         _should_unroll_one_iteration_ptr = None
+        red_args_types = []
     class FakeCell:
         dont_trace_here = False
     state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
@@ -239,6 +218,7 @@
         _can_never_inline_ptr = None
         _get_jitcell_at_ptr = None
         _should_unroll_one_iteration_ptr = None
+        red_args_types = []
     state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
     state.make_jitdriver_callbacks()
     res = state.get_location_str([ConstInt(5), constfloat(42.5)])
@@ -264,6 +244,7 @@
         _can_never_inline_ptr = None
         _get_jitcell_at_ptr = None
         _should_unroll_one_iteration_ptr = None
+        red_args_types = []
 
     state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
     state.make_jitdriver_callbacks()
@@ -289,8 +270,83 @@
         _can_never_inline_ptr = llhelper(CAN_NEVER_INLINE, can_never_inline)
         _get_jitcell_at_ptr = None
         _should_unroll_one_iteration_ptr = None
+        red_args_types = []
 
     state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
     state.make_jitdriver_callbacks()
     res = state.can_never_inline(5, 42.5)
     assert res is True
+
+def test_decay_counters():
+    cell = JitCell(r_uint(5))
+    cell.counter = 100
+    cell.adjust_counter(r_uint(5), math.log(0.9))
+    assert cell.counter == 100
+    cell.adjust_counter(r_uint(6), math.log(0.9))
+    assert cell.counter == 90
+    cell.adjust_counter(r_uint(9), math.log(0.9))
+    assert cell.counter == int(90 * (0.9**3))
+
+def test_cleanup_jitcell_dict():
+    from pypy.jit.metainterp.memmgr import MemoryManager
+    class FakeWarmRunnerDesc:
+        memory_manager = MemoryManager()
+        class cpu:
+            pass
+    class FakeJitDriverSD:
+        _green_args_spec = [lltype.Signed]
+    #
+    # Test creating tons of jitcells that remain at 0
+    warmstate = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
+    get_jitcell = warmstate._make_jitcell_getter_default()
+    cell1 = get_jitcell(True, -1)
+    assert len(warmstate._jitcell_dict) == 1
+    assert FakeWarmRunnerDesc.memory_manager.current_generation == 1
+    #
+    for i in range(1, 20005):
+        get_jitcell(True, i)     # should trigger a clean-up at 20001
+        assert len(warmstate._jitcell_dict) == (i % 20000) + 1
+    assert FakeWarmRunnerDesc.memory_manager.current_generation == 2
+    #
+    # Same test, with one jitcell that has a counter of BASE instead of 0
+    warmstate = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
+    warmstate.set_param_decay_halflife(2)
+    warmstate.set_param_threshold(5)
+    warmstate.set_param_function_threshold(0)
+    get_jitcell = warmstate._make_jitcell_getter_default()
+    cell2 = get_jitcell(True, -2)
+    cell2.counter = BASE = warmstate.increment_threshold * 3
+    #
+    for i in range(0, 20005):
+        get_jitcell(True, i)
+        assert len(warmstate._jitcell_dict) == (i % 19999) + 2
+    #
+    assert cell2 in warmstate._jitcell_dict.values()
+    assert cell2.counter == int(BASE * math.sqrt(0.5))   # decayed once
+    assert FakeWarmRunnerDesc.memory_manager.current_generation == 3
+    #
+    # Same test, with jitcells that are compiled and freed by the memmgr
+    warmstate = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
+    get_jitcell = warmstate._make_jitcell_getter_default()
+    get_jitcell(True, -1)
+    assert FakeWarmRunnerDesc.memory_manager.current_generation == 3
+    #
+    for i in range(1, 20005):
+        cell = get_jitcell(True, i)
+        cell.counter = -1
+        cell.wref_procedure_token = None    # or a dead weakref, equivalently
+        assert len(warmstate._jitcell_dict) == (i % 20000) + 1
+    assert FakeWarmRunnerDesc.memory_manager.current_generation == 4
+    #
+    # Same test, with counter == -2 (rare case, kept alive)
+    warmstate = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
+    get_jitcell = warmstate._make_jitcell_getter_default()
+    cell = get_jitcell(True, -1)
+    cell.counter = -2
+    assert FakeWarmRunnerDesc.memory_manager.current_generation == 4
+    #
+    for i in range(1, 20005):
+        cell = get_jitcell(True, i)
+        cell.counter = -2
+        assert len(warmstate._jitcell_dict) == i + 1
+    assert FakeWarmRunnerDesc.memory_manager.current_generation == 5
diff --git a/pypy/jit/metainterp/warmspot.py b/pypy/jit/metainterp/warmspot.py
--- a/pypy/jit/metainterp/warmspot.py
+++ b/pypy/jit/metainterp/warmspot.py
@@ -64,9 +64,11 @@
 
 def jittify_and_run(interp, graph, args, repeat=1, graph_and_interp_only=False,
                     backendopt=False, trace_limit=sys.maxint,
+                    threshold=3, trace_eagerness=2,
                     inline=False, loop_longevity=0, retrace_limit=5,
-                    function_threshold=4,
-                    enable_opts=ALL_OPTS_NAMES, max_retrace_guards=15, **kwds):
+                    function_threshold=4, decay_halflife=0,
+                    enable_opts=ALL_OPTS_NAMES, max_retrace_guards=15,
+                    **kwds):
     from pypy.config.config import ConfigError
     translator = interp.typer.annotator.translator
     try:
@@ -83,15 +85,16 @@
         pass
     warmrunnerdesc = WarmRunnerDesc(translator, backendopt=backendopt, **kwds)
     for jd in warmrunnerdesc.jitdrivers_sd:
-        jd.warmstate.set_param_threshold(3)          # for tests
+        jd.warmstate.set_param_threshold(threshold)
         jd.warmstate.set_param_function_threshold(function_threshold)
-        jd.warmstate.set_param_trace_eagerness(2)    # for tests
+        jd.warmstate.set_param_trace_eagerness(trace_eagerness)
         jd.warmstate.set_param_trace_limit(trace_limit)
         jd.warmstate.set_param_inlining(inline)
         jd.warmstate.set_param_loop_longevity(loop_longevity)
         jd.warmstate.set_param_retrace_limit(retrace_limit)
         jd.warmstate.set_param_max_retrace_guards(max_retrace_guards)
         jd.warmstate.set_param_enable_opts(enable_opts)
+        jd.warmstate.set_param_decay_halflife(decay_halflife)
     warmrunnerdesc.finish()
     if graph_and_interp_only:
         return interp, graph
@@ -522,9 +525,9 @@
         greens_v, reds_v = support.decode_hp_hint_args(op)
         ALLARGS = [v.concretetype for v in (greens_v + reds_v)]
         jd._green_args_spec = [v.concretetype for v in greens_v]
-        jd._red_args_types = [history.getkind(v.concretetype) for v in reds_v]
+        jd.red_args_types = [history.getkind(v.concretetype) for v in reds_v]
         jd.num_green_args = len(jd._green_args_spec)
-        jd.num_red_args = len(jd._red_args_types)
+        jd.num_red_args = len(jd.red_args_types)
         RESTYPE = graph.getreturnvar().concretetype
         (jd._JIT_ENTER_FUNCTYPE,
          jd._PTR_JIT_ENTER_FUNCTYPE) = self.cpu.ts.get_FuncType(ALLARGS, lltype.Void)
@@ -771,16 +774,16 @@
 
         def assembler_call_helper(failindex, virtualizableref):
             fail_descr = self.cpu.get_fail_descr_from_number(failindex)
-            while True:
-                if vinfo is not None:
-                    virtualizable = lltype.cast_opaque_ptr(
-                        vinfo.VTYPEPTR, virtualizableref)
-                    vinfo.reset_vable_token(virtualizable)
-                try:
-                    loop_token = fail_descr.handle_fail(self.metainterp_sd, jd)
-                except JitException, e:
-                    return handle_jitexception(e)
-                fail_descr = self.execute_token(loop_token)
+            if vinfo is not None:
+                virtualizable = lltype.cast_opaque_ptr(
+                    vinfo.VTYPEPTR, virtualizableref)
+                vinfo.reset_vable_token(virtualizable)
+            try:
+                fail_descr.handle_fail(self.metainterp_sd, jd)
+            except JitException, e:
+                return handle_jitexception(e)
+            else:
+                assert 0, "should have raised"
 
         jd._assembler_call_helper = assembler_call_helper # for debugging
         jd._assembler_helper_ptr = self.helper_func(
@@ -910,10 +913,3 @@
         graphs = self.translator.graphs
         for graph, block, i in find_force_quasi_immutable(graphs):
             self.replace_force_quasiimmut_with_direct_call(block.operations[i])
-
-    # ____________________________________________________________
-
-    def execute_token(self, loop_token):
-        fail_descr = self.cpu.execute_token(loop_token)
-        self.memory_manager.keep_loop_alive(loop_token)
-        return fail_descr
diff --git a/pypy/jit/metainterp/warmstate.py b/pypy/jit/metainterp/warmstate.py
--- a/pypy/jit/metainterp/warmstate.py
+++ b/pypy/jit/metainterp/warmstate.py
@@ -1,10 +1,10 @@
-import sys, weakref
+import sys, weakref, math
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi
 from pypy.rpython.ootypesystem import ootype
 from pypy.rpython.annlowlevel import hlstr, cast_base_ptr_to_instance
 from pypy.rpython.annlowlevel import cast_object_to_ptr
 from pypy.rlib.objectmodel import specialize, we_are_translated, r_dict
-from pypy.rlib.rarithmetic import intmask
+from pypy.rlib.rarithmetic import intmask, r_uint
 from pypy.rlib.nonconst import NonConstant
 from pypy.rlib.unroll import unrolling_iterable
 from pypy.rlib.jit import PARAMETERS
@@ -12,6 +12,7 @@
 from pypy.rlib.debug import debug_start, debug_stop, debug_print
 from pypy.jit.metainterp import history
 from pypy.jit.codewriter import support, heaptracker, longlong
+from pypy.tool.sourcetools import func_with_new_name
 
 # ____________________________________________________________
 
@@ -142,26 +143,6 @@
     else:
         return rffi.cast(lltype.Signed, x)
 
-@specialize.ll_and_arg(3)
-def set_future_value(cpu, j, value, typecode):
-    if typecode == 'ref':
-        refvalue = cpu.ts.cast_to_ref(value)
-        cpu.set_future_value_ref(j, refvalue)
-    elif typecode == 'int':
-        if isinstance(lltype.typeOf(value), lltype.Ptr):
-            intvalue = llmemory.AddressAsInt(llmemory.cast_ptr_to_adr(value))
-        else:
-            intvalue = lltype.cast_primitive(lltype.Signed, value)
-        cpu.set_future_value_int(j, intvalue)
-    elif typecode == 'float':
-        if lltype.typeOf(value) is lltype.Float:
-            value = longlong.getfloatstorage(value)
-        else:
-            assert longlong.is_longlong(lltype.typeOf(value))
-            value = rffi.cast(lltype.SignedLongLong, value)
-        cpu.set_future_value_float(j, value)
-    else:
-        assert False
 
 class JitCell(BaseJitCell):
     # the counter can mean the following things:
@@ -172,6 +153,25 @@
     dont_trace_here = False
     wref_procedure_token = None
 
+    def __init__(self, generation):
+        # The stored 'counter' value follows an exponential decay model.
+        # Conceptually after every generation, it decays by getting
+        # multiplied by a constant <= 1.0.  In practice, decaying occurs
+        # lazily: the following field records the latest seen generation
+        # number, and adjustment is done by adjust_counter() when needed.
+        self.latest_generation_seen = generation
+
+    def adjust_counter(self, generation, log_decay_factor):
+        if generation != self.latest_generation_seen:
+            # The latest_generation_seen is older than the current generation.
+            # Adjust by multiplying self.counter N times by decay_factor, i.e.
+            # by decay_factor ** N, which is equal to exp(log(decay_factor)*N).
+            assert self.counter >= 0
+            N = generation - self.latest_generation_seen
+            factor = math.exp(log_decay_factor * N)
+            self.counter = int(self.counter * factor)
+            self.latest_generation_seen = generation
+
     def get_procedure_token(self):
         if self.wref_procedure_token is not None:
             token = self.wref_procedure_token()
@@ -191,7 +191,6 @@
 
 class WarmEnterState(object):
     THRESHOLD_LIMIT = sys.maxint // 2
-    default_jitcell_dict = None
 
     def __init__(self, warmrunnerdesc, jitdriver_sd):
         "NOT_RPYTHON"
@@ -232,6 +231,17 @@
     def set_param_inlining(self, value):
         self.inlining = value
 
+    def set_param_decay_halflife(self, value):
+        # Use 0 or -1 to mean "no decay".  Initialize the internal variable
+        # 'log_decay_factor'.  It is chosen such that by multiplying the
+        # counter on loops by 'exp(log_decay_factor)' (<= 1.0) every
+        # generation, then the counter will be divided by two after 'value'
+        # generations have passed.
+        if value <= 0:
+            self.log_decay_factor = 0.0    # log(1.0)
+        else:
+            self.log_decay_factor = math.log(0.5) / value
+
     def set_param_enable_opts(self, value):
         from pypy.jit.metainterp.optimizeopt import ALL_OPTS_DICT, ALL_OPTS_NAMES
 
@@ -297,26 +307,61 @@
         index_of_virtualizable = jitdriver_sd.index_of_virtualizable
         num_green_args = jitdriver_sd.num_green_args
         get_jitcell = self.make_jitcell_getter()
-        set_future_values = self.make_set_future_values()
         self.make_jitdriver_callbacks()
         confirm_enter_jit = self.confirm_enter_jit
+        range_red_args = unrolling_iterable(
+            range(num_green_args, num_green_args + jitdriver_sd.num_red_args))
+        memmgr = self.warmrunnerdesc.memory_manager
+        if memmgr is not None:
+            get_current_generation = memmgr.get_current_generation_uint
+        else:
+            get_current_generation = lambda: r_uint(0)
+        # get a new specialized copy of the method
+        ARGS = []
+        for kind in jitdriver_sd.red_args_types:
+            if kind == 'int':
+                ARGS.append(lltype.Signed)
+            elif kind == 'ref':
+                ARGS.append(llmemory.GCREF)
+            elif kind == 'float':
+                ARGS.append(longlong.FLOATSTORAGE)
+            else:
+                assert 0, kind
+        func_execute_token = self.cpu.make_execute_token(*ARGS)
+
+        def execute_assembler(loop_token, *args):
+            # Call the backend to run the 'loop_token' with the given
+            # input args.
+            fail_descr = func_execute_token(loop_token, *args)
+            #
+            # If we have a virtualizable, we have to reset its
+            # 'vable_token' field afterwards
+            if vinfo is not None:
+                virtualizable = args[index_of_virtualizable]
+                virtualizable = vinfo.cast_gcref_to_vtype(virtualizable)
+                vinfo.reset_vable_token(virtualizable)
+            #
+            # Record in the memmgr that we just ran this loop,
+            # so that it will keep it alive for a longer time
+            warmrunnerdesc.memory_manager.keep_loop_alive(loop_token)
+            #
+            # Handle the failure
+            fail_descr.handle_fail(metainterp_sd, jitdriver_sd)
+            #
+            assert 0, "should have raised"
 
         def maybe_compile_and_run(threshold, *args):
             """Entry point to the JIT.  Called at the point with the
             can_enter_jit() hint.
             """
-            if vinfo is not None:
-                virtualizable = args[num_green_args + index_of_virtualizable]
-                virtualizable = vinfo.cast_to_vtype(virtualizable)
-            else:
-                virtualizable = None
-
             # look for the cell corresponding to the current greenargs
             greenargs = args[:num_green_args]
             cell = get_jitcell(True, *greenargs)
 
             if cell.counter >= 0:
                 # update the profiling counter
+                cell.adjust_counter(get_current_generation(),
+                                    self.log_decay_factor)
                 n = cell.counter + threshold
                 if n <= self.THRESHOLD_LIMIT:       # bound not reached
                     cell.counter = n
@@ -330,42 +375,36 @@
                 # set counter to -2, to mean "tracing in effect"
                 cell.counter = -2
                 try:
-                    procedure_token = metainterp.compile_and_run_once(jitdriver_sd,
-                                                                 *args)
+                    metainterp.compile_and_run_once(jitdriver_sd, *args)
                 finally:
                     if cell.counter == -2:
                         cell.counter = 0
             else:
-                if cell.counter == -2:
+                if cell.counter != -1:
+                    assert cell.counter == -2
                     # tracing already happening in some outer invocation of
                     # this function. don't trace a second time.
                     return
-                assert cell.counter == -1
                 if not confirm_enter_jit(*args):
                     return
+                # machine code was already compiled for these greenargs
                 procedure_token = cell.get_procedure_token()
                 if procedure_token is None:   # it was a weakref that has been freed
                     cell.counter = 0
                     return
-                # machine code was already compiled for these greenargs
-                # get the assembler and fill in the boxes
-                set_future_values(*args[num_green_args:])
-
-            # ---------- execute assembler ----------
-            while True:     # until interrupted by an exception
-                metainterp_sd.profiler.start_running()
-                #debug_start("jit-running")
-                fail_descr = warmrunnerdesc.execute_token(procedure_token)
-                #debug_stop("jit-running")
-                metainterp_sd.profiler.end_running()
-                procedure_token = None     # for test_memmgr
-                if vinfo is not None:
-                    vinfo.reset_vable_token(virtualizable)
-                procedure_token = fail_descr.handle_fail(metainterp_sd,
-                                                         jitdriver_sd)
+                # extract and unspecialize the red arguments to pass to
+                # the assembler
+                execute_args = ()
+                for i in range_red_args:
+                    execute_args += (unspecialize_value(args[i]), )
+                # run it!  this executes until interrupted by an exception
+                execute_assembler(procedure_token, *execute_args)
+            #
+            assert 0, "should not reach this point"
 
         maybe_compile_and_run._dont_inline_ = True
         self.maybe_compile_and_run = maybe_compile_and_run
+        self.execute_assembler = execute_assembler
         return maybe_compile_and_run
 
     # ----------
@@ -415,6 +454,15 @@
         #
         return jit_getter
 
+    def _new_jitcell(self):
+        warmrunnerdesc = self.warmrunnerdesc
+        if (warmrunnerdesc is not None and
+                warmrunnerdesc.memory_manager is not None):
+            gen = warmrunnerdesc.memory_manager.get_current_generation_uint()
+        else:
+            gen = r_uint(0)
+        return JitCell(gen)
+
     def _make_jitcell_getter_default(self):
         "NOT_RPYTHON"
         jitdriver_sd = self.jitdriver_sd
@@ -439,6 +487,49 @@
             return x
         #
         jitcell_dict = r_dict(comparekey, hashkey)
+        try:
+            self.warmrunnerdesc.stats.jitcell_dicts.append(jitcell_dict)
+        except AttributeError:
+            pass
+        #
+        memmgr = self.warmrunnerdesc and self.warmrunnerdesc.memory_manager
+        if memmgr:
+            def _cleanup_dict():
+                minimum = sys.maxint
+                if self.increment_threshold > 0:
+                    minimum = min(minimum, self.increment_threshold)
+                if self.increment_function_threshold > 0:
+                    minimum = min(minimum, self.increment_function_threshold)
+                currentgen = memmgr.get_current_generation_uint()
+                killme = []
+                for key, cell in jitcell_dict.iteritems():
+                    if cell.counter >= 0:
+                        cell.adjust_counter(currentgen, self.log_decay_factor)
+                        if cell.counter < minimum:
+                            killme.append(key)
+                    elif (cell.counter == -1
+                          and cell.get_procedure_token() is None):
+                        killme.append(key)
+                for key in killme:
+                    del jitcell_dict[key]
+            #
+            def _maybe_cleanup_dict():
+                # If no tracing goes on at all because the jitcells are
+                # each time for new greenargs, the dictionary grows forever.
+                # So every one in a (rare) while, we decide to force an
+                # artificial next_generation() and _cleanup_dict().
+                self._trigger_automatic_cleanup += 1
+                if self._trigger_automatic_cleanup > 20000:
+                    self._trigger_automatic_cleanup = 0
+                    memmgr.next_generation(do_cleanups_now=False)
+                    _cleanup_dict()
+            #
+            self._trigger_automatic_cleanup = 0
+            self._jitcell_dict = jitcell_dict       # for tests
+            memmgr.record_jitcell_dict(_cleanup_dict)
+        else:
+            def _maybe_cleanup_dict():
+                pass
         #
         def get_jitcell(build, *greenargs):
             try:
@@ -446,7 +537,8 @@
             except KeyError:
                 if not build:
                     return None
-                cell = JitCell()
+                _maybe_cleanup_dict()
+                cell = self._new_jitcell()
                 jitcell_dict[greenargs] = cell
             return cell
         return get_jitcell
@@ -457,6 +549,10 @@
         get_jitcell_at_ptr = self.jitdriver_sd._get_jitcell_at_ptr
         set_jitcell_at_ptr = self.jitdriver_sd._set_jitcell_at_ptr
         lltohlhack = {}
+        # note that there is no equivalent of record_jitcell_dict()
+        # in the case of custom getters.  We assume that the interpreter
+        # stores the JitCells on some objects that can go away by GC,
+        # like the PyCode objects in PyPy.
         #
         def get_jitcell(build, *greenargs):
             fn = support.maybe_on_top_of_llinterp(rtyper, get_jitcell_at_ptr)
@@ -478,7 +574,7 @@
             if not build:
                 return cell
             if cell is None:
-                cell = JitCell()
+                cell = self._new_jitcell()
                 # <hacks>
                 if we_are_translated():
                     cellref = cast_object_to_ptr(BASEJITCELL, cell)
@@ -502,56 +598,6 @@
 
     # ----------
 
-    def make_set_future_values(self):
-        "NOT_RPYTHON"
-        if hasattr(self, 'set_future_values'):
-            return self.set_future_values
-
-        jitdriver_sd   = self.jitdriver_sd
-        cpu = self.cpu
-        vinfo = jitdriver_sd.virtualizable_info
-        red_args_types = unrolling_iterable(jitdriver_sd._red_args_types)
-        #
-        def set_future_values(*redargs):
-            i = 0
-            for typecode in red_args_types:
-                set_future_value(cpu, i, redargs[i], typecode)
-                i = i + 1
-            if vinfo is not None:
-                set_future_values_from_vinfo(*redargs)
-        #
-        if vinfo is not None:
-            i0 = len(jitdriver_sd._red_args_types)
-            index_of_virtualizable = jitdriver_sd.index_of_virtualizable
-            vable_static_fields = unrolling_iterable(
-                zip(vinfo.static_extra_types, vinfo.static_fields))
-            vable_array_fields = unrolling_iterable(
-                zip(vinfo.arrayitem_extra_types, vinfo.array_fields))
-            getlength = cpu.ts.getlength
-            getarrayitem = cpu.ts.getarrayitem
-            #
-            def set_future_values_from_vinfo(*redargs):
-                i = i0
-                virtualizable = redargs[index_of_virtualizable]
-                virtualizable = vinfo.cast_to_vtype(virtualizable)
-                for typecode, fieldname in vable_static_fields:
-                    x = getattr(virtualizable, fieldname)
-                    set_future_value(cpu, i, x, typecode)
-                    i = i + 1
-                for typecode, fieldname in vable_array_fields:
-                    lst = getattr(virtualizable, fieldname)
-                    for j in range(getlength(lst)):
-                        x = getarrayitem(lst, j)
-                        set_future_value(cpu, i, x, typecode)
-                        i = i + 1
-        else:
-            set_future_values_from_vinfo = None
-        #
-        self.set_future_values = set_future_values
-        return set_future_values
-
-    # ----------
-
     def make_jitdriver_callbacks(self):
         if hasattr(self, 'get_location_str'):
             return
@@ -601,8 +647,9 @@
             jd.on_compile = lambda *args: None
             jd.on_compile_bridge = lambda *args: None
 
-        def get_assembler_token(greenkey, redboxes):
-            # 'redboxes' is only used to know the types of red arguments
+        redargtypes = ''.join([kind[0] for kind in jd.red_args_types])
+
+        def get_assembler_token(greenkey):
             cell = self.jit_cell_at_key(greenkey)
             procedure_token = cell.get_procedure_token()
             if procedure_token is None:
@@ -611,7 +658,7 @@
                     cell.counter = 0      # but was freed in the meantime.
                 memmgr = warmrunnerdesc.memory_manager
                 procedure_token = compile_tmp_callback(cpu, jd, greenkey,
-                                                   redboxes, memmgr)
+                                                       redargtypes, memmgr)
                 cell.set_procedure_token(procedure_token)
             return procedure_token
         self.get_assembler_token = get_assembler_token
diff --git a/pypy/jit/tool/jitoutput.py b/pypy/jit/tool/jitoutput.py
--- a/pypy/jit/tool/jitoutput.py
+++ b/pypy/jit/tool/jitoutput.py
@@ -10,9 +10,6 @@
 REGEXES = [
     (('tracing_no', 'tracing_time'), '^Tracing:\s+([\d.]+)\s+([\d.]+)$'),
     (('backend_no', 'backend_time'), '^Backend:\s+([\d.]+)\s+([\d.]+)$'),
-    (('asm_no',), '^Running asm:\s+([\d.]+)$'),
-    (('blackhole_no',),
-         '^Blackhole:\s+([\d.]+)$'),
     (None, '^TOTAL.*$'),
     (('ops.total',), '^ops:\s+(\d+)$'),
     (('recorded_ops.total',), '^recorded ops:\s+(\d+)$'),
diff --git a/pypy/jit/tool/test/test_jitoutput.py b/pypy/jit/tool/test/test_jitoutput.py
--- a/pypy/jit/tool/test/test_jitoutput.py
+++ b/pypy/jit/tool/test/test_jitoutput.py
@@ -34,8 +34,6 @@
     # assert did not crash
     # asserts below are a bit delicate, possibly they might be deleted
     assert info.tracing_no == 1
-    assert info.asm_no == 1
-    assert info.blackhole_no == 1
     assert info.backend_no == 1
     assert info.ops.total == 2
     assert info.recorded_ops.total == 2
@@ -47,8 +45,6 @@
 
 DATA = '''Tracing:         1       0.006992
 Backend:        1       0.000525
-Running asm:            1
-Blackhole:              1
 TOTAL:                  0.025532
 ops:                    2
 recorded ops:           6
@@ -75,8 +71,6 @@
     info = parse_prof(DATA)
     assert info.tracing_no == 1
     assert info.tracing_time == 0.006992
-    assert info.asm_no == 1
-    assert info.blackhole_no == 1
     assert info.backend_no == 1
     assert info.backend_time == 0.000525
     assert info.ops.total == 2
diff --git a/pypy/module/_ssl/test/test_ssl.py b/pypy/module/_ssl/test/test_ssl.py
--- a/pypy/module/_ssl/test/test_ssl.py
+++ b/pypy/module/_ssl/test/test_ssl.py
@@ -161,11 +161,16 @@
 
     def test_shutdown(self):
         import socket, ssl, sys, gc
-        if sys.platform == 'darwin':
-            skip("get also on CPython: error: [Errno 0]")
         ss = socket.ssl(self.s)
         ss.write("hello\n")
-        assert ss.shutdown() is self.s._sock
+        try:
+            result = ss.shutdown()
+        except socket.error, e:
+            # xxx obscure case; throwing errno 0 is pretty odd...
+            if e.errno == 0:
+                skip("Shutdown raised errno 0. CPython does this too")
+            raise
+        assert result is self.s._sock
         raises(ssl.SSLError, ss.write, "hello\n")
         del ss; gc.collect()
 
diff --git a/pypy/module/micronumpy/__init__.py b/pypy/module/micronumpy/__init__.py
--- a/pypy/module/micronumpy/__init__.py
+++ b/pypy/module/micronumpy/__init__.py
@@ -1,9 +1,19 @@
 from pypy.interpreter.mixedmodule import MixedModule
 
 
+class PyPyModule(MixedModule):
+    interpleveldefs = {
+        'debug_repr': 'interp_extras.debug_repr',
+    }
+    appleveldefs = {}
+
 class Module(MixedModule):
     applevel_name = 'numpypy'
 
+    submodules = {
+        'pypy': PyPyModule
+    }
+
     interpleveldefs = {
         'ndarray': 'interp_numarray.W_NDimArray',
         'dtype': 'interp_dtype.W_Dtype',
@@ -81,6 +91,7 @@
         'mean': 'app_numpy.mean',
         'sum': 'app_numpy.sum',
         'min': 'app_numpy.min',
+        'identity': 'app_numpy.identity',
         'max': 'app_numpy.max',
         'inf': 'app_numpy.inf',
         'e': 'app_numpy.e',
diff --git a/pypy/module/micronumpy/app_numpy.py b/pypy/module/micronumpy/app_numpy.py
--- a/pypy/module/micronumpy/app_numpy.py
+++ b/pypy/module/micronumpy/app_numpy.py
@@ -13,6 +13,11 @@
     # weighting, just the average part!
     return mean(a)
 
+def identity(n, dtype=None):
+    a = numpypy.zeros((n,n), dtype=dtype)
+    for i in range(n):
+        a[i][i] = 1
+    return a
 
 def mean(a):
     if not hasattr(a, "mean"):
diff --git a/pypy/module/micronumpy/interp_boxes.py b/pypy/module/micronumpy/interp_boxes.py
--- a/pypy/module/micronumpy/interp_boxes.py
+++ b/pypy/module/micronumpy/interp_boxes.py
@@ -86,6 +86,7 @@
     descr_ge = _binop_impl("greater_equal")
 
     descr_radd = _binop_right_impl("add")
+    descr_rsub = _binop_right_impl("subtract")
     descr_rmul = _binop_right_impl("multiply")
 
     descr_neg = _unaryop_impl("negative")
@@ -132,7 +133,7 @@
     descr__new__, get_dtype = new_dtype_getter("long")
 
 class W_ULongBox(W_UnsignedIntegerBox, PrimitiveBox):
-    pass
+    descr__new__, get_dtype = new_dtype_getter("ulong")
 
 class W_Int64Box(W_SignedIntegerBox, PrimitiveBox):
     descr__new__, get_dtype = new_dtype_getter("int64")
@@ -170,7 +171,8 @@
     __mul__ = interp2app(W_GenericBox.descr_mul),
     __div__ = interp2app(W_GenericBox.descr_div),
 
-    __radd__ = interp2app(W_GenericBox.descr_add),
+    __radd__ = interp2app(W_GenericBox.descr_radd),
+    __rsub__ = interp2app(W_GenericBox.descr_rsub),
     __rmul__ = interp2app(W_GenericBox.descr_rmul),
 
     __eq__ = interp2app(W_GenericBox.descr_eq),
diff --git a/pypy/module/micronumpy/interp_extras.py b/pypy/module/micronumpy/interp_extras.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/micronumpy/interp_extras.py
@@ -0,0 +1,7 @@
+from pypy.interpreter.gateway import unwrap_spec
+from pypy.module.micronumpy.interp_numarray import BaseArray
+
+
+@unwrap_spec(array=BaseArray)
+def debug_repr(space, array):
+    return space.wrap(array.debug_repr())
diff --git a/pypy/module/micronumpy/interp_numarray.py b/pypy/module/micronumpy/interp_numarray.py
--- a/pypy/module/micronumpy/interp_numarray.py
+++ b/pypy/module/micronumpy/interp_numarray.py
@@ -581,6 +581,7 @@
     def descr_get_dtype(self, space):
         return space.wrap(self.find_dtype())
 
+    @jit.unroll_safe
     def descr_get_shape(self, space):
         return space.newtuple([space.wrap(i) for i in self.shape])
 
@@ -791,7 +792,8 @@
                 raise OperationError(space.w_IndexError, space.wrap(
                         "0-d arrays can't be indexed"))
             item = concrete._index_of_single_item(space, w_idx)
-            concrete.setitem_w(space, item, w_value)
+            dtype = concrete.find_dtype()
+            concrete.setitem(item, dtype.coerce(space, w_value))
             return
         if not isinstance(w_value, BaseArray):
             w_value = convert_to_array(space, w_value)
@@ -924,9 +926,6 @@
     def start_iter(self, res_shape=None):
         raise NotImplementedError
 
-    def descr_debug_repr(self, space):
-        return space.wrap(self.debug_repr())
-
     def descr_array_iface(self, space):
         concrete = self.get_concrete()
         storage = concrete.get_storage(space)
@@ -1178,10 +1177,6 @@
     def eval(self, iter):
         return self.parent.getitem(iter.get_offset())
 
-    @unwrap_spec(item=int)
-    def setitem_w(self, space, item, w_value):
-        return self.parent.setitem_w(space, item, w_value)
-
     def setitem(self, item, value):
         # This is currently not possible to be called from anywhere.
         raise NotImplementedError
@@ -1330,9 +1325,6 @@
         raise OperationError(space.w_TypeError, space.wrap(
             "len() of unsized object"))
 
-    def setitem_w(self, space, item, w_value):
-        return self.setitem(item, self.dtype.coerce(space, w_value))
-
     def setitem(self, item, value):
         self.invalidated()
         self.dtype.setitem(self.storage, item, value)
@@ -1472,7 +1464,6 @@
 
     __repr__ = interp2app(BaseArray.descr_repr),
     __str__ = interp2app(BaseArray.descr_str),
-    __debug_repr__ = interp2app(BaseArray.descr_debug_repr),
     __array_interface__ = GetSetProperty(BaseArray.descr_array_iface),
 
     dtype = GetSetProperty(BaseArray.descr_get_dtype),
diff --git a/pypy/module/micronumpy/interp_support.py b/pypy/module/micronumpy/interp_support.py
--- a/pypy/module/micronumpy/interp_support.py
+++ b/pypy/module/micronumpy/interp_support.py
@@ -1,34 +1,90 @@
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, operationerrfmt
 from pypy.interpreter.gateway import unwrap_spec
-from pypy.module.micronumpy.interp_dtype import get_dtype_cache
-from pypy.rlib.rstruct.runpack import runpack
 from pypy.rpython.lltypesystem import lltype, rffi
+from pypy.module.micronumpy import interp_dtype
+from pypy.objspace.std.strutil import strip_spaces
 
 
 FLOAT_SIZE = rffi.sizeof(lltype.Float)
 
-@unwrap_spec(s=str)
-def fromstring(space, s):
+def _fromstring_text(space, s, count, sep, length, dtype):
     from pypy.module.micronumpy.interp_numarray import W_NDimArray
+
+    sep_stripped = strip_spaces(sep)
+    skip_bad_vals = len(sep_stripped) == 0
+
+    items = []
+    num_items = 0
+    idx = 0
+    
+    while (num_items < count or count == -1) and idx < len(s):
+        nextidx = s.find(sep, idx)
+        if nextidx < 0:
+            nextidx = length
+        piece = strip_spaces(s[idx:nextidx])
+        if len(piece) > 0 or not skip_bad_vals:
+            if len(piece) == 0 and not skip_bad_vals:
+                val = dtype.itemtype.default_fromstring(space)
+            else:
+                try:
+                    val = dtype.coerce(space, space.wrap(piece))
+                except OperationError, e:
+                    if not e.match(space, space.w_ValueError):
+                        raise
+                    gotit = False
+                    while not gotit and len(piece) > 0:
+                        piece = piece[:-1]
+                        try:
+                            val = dtype.coerce(space, space.wrap(piece))
+                            gotit = True
+                        except OperationError, e:
+                            if not e.match(space, space.w_ValueError):
+                                raise
+                    if not gotit:
+                        val = dtype.itemtype.default_fromstring(space)
+                    nextidx = length
+            items.append(val)
+            num_items += 1
+        idx = nextidx + 1
+    
+    if count > num_items:
+        raise OperationError(space.w_ValueError, space.wrap(
+            "string is smaller than requested size"))
+
+    a = W_NDimArray(num_items, [num_items], dtype=dtype)
+    for i, val in enumerate(items):
+        a.dtype.setitem(a.storage, i, val)
+    
+    return space.wrap(a)
+
+def _fromstring_bin(space, s, count, length, dtype):
+    from pypy.module.micronumpy.interp_numarray import W_NDimArray
+    
+    itemsize = dtype.itemtype.get_element_size()
+    if count == -1:
+        count = length / itemsize
+    if length % itemsize != 0:
+        raise operationerrfmt(space.w_ValueError,
+                              "string length %d not divisable by item size %d",
+                              length, itemsize)
+    if count * itemsize > length:
+        raise OperationError(space.w_ValueError, space.wrap(
+            "string is smaller than requested size"))
+        
+    a = W_NDimArray(count, [count], dtype=dtype)
+    for i in range(count):
+        val = dtype.itemtype.runpack_str(s[i*itemsize:i*itemsize + itemsize])
+        a.dtype.setitem(a.storage, i, val)
+        
+    return space.wrap(a)
+
+@unwrap_spec(s=str, count=int, sep=str)
+def fromstring(space, s, w_dtype=None, count=-1, sep=''):
+    dtype = space.interp_w(interp_dtype.W_Dtype,
+        space.call_function(space.gettypefor(interp_dtype.W_Dtype), w_dtype)
+    )
     length = len(s)
-
-    if length % FLOAT_SIZE == 0:
-        number = length/FLOAT_SIZE
+    if sep == '':
+        return _fromstring_bin(space, s, count, length, dtype)
     else:
-        raise OperationError(space.w_ValueError, space.wrap(
-            "string length %d not divisable by %d" % (length, FLOAT_SIZE)))
-
-    dtype = get_dtype_cache(space).w_float64dtype
-    a = W_NDimArray(number, [number], dtype=dtype)
-
-    start = 0
-    end = FLOAT_SIZE
-    i = 0
-    while i < number:
-        part = s[start:end]
-        a.dtype.setitem(a.storage, i, dtype.box(runpack('d', part)))
-        i += 1
-        start += FLOAT_SIZE
-        end += FLOAT_SIZE
-
-    return space.wrap(a)
+        return _fromstring_text(space, s, count, sep, length, dtype)
diff --git a/pypy/module/micronumpy/test/test_dtypes.py b/pypy/module/micronumpy/test/test_dtypes.py
--- a/pypy/module/micronumpy/test/test_dtypes.py
+++ b/pypy/module/micronumpy/test/test_dtypes.py
@@ -193,6 +193,7 @@
 
         assert type(X(True)) is numpy.bool_
         assert X(True) is numpy.True_
+        assert numpy.bool_("False") is numpy.True_
 
     def test_int8(self):
         import numpypy as numpy
@@ -211,6 +212,10 @@
 
         assert type(int(x)) is int
         assert int(x) == -128
+        assert numpy.int8('50') == numpy.int8(50)
+        raises(ValueError, numpy.int8, '50.2')
+        assert numpy.int8('127') == 127
+        assert numpy.int8('128') == -128
 
     def test_uint8(self):
         import numpypy as numpy
@@ -232,6 +237,8 @@
 
         assert numpy.uint8(255) == 255
         assert numpy.uint8(256) == 0
+        assert numpy.uint8('255') == 255
+        assert numpy.uint8('256') == 0
 
     def test_int16(self):
         import numpypy as numpy
@@ -240,26 +247,43 @@
         assert x == 3
         assert numpy.int16(32767) == 32767
         assert numpy.int16(32768) == -32768
+        assert numpy.int16('32767') == 32767
+        assert numpy.int16('32768') == -32768
 
     def test_uint16(self):
         import numpypy as numpy
 
         assert numpy.uint16(65535) == 65535
         assert numpy.uint16(65536) == 0
+        assert numpy.uint16('65535') == 65535
+        assert numpy.uint16('65536') == 0
 
     def test_int32(self):
+        import sys
         import numpypy as numpy
 
         x = numpy.int32(23)
         assert x == 23
         assert numpy.int32(2147483647) == 2147483647
-        assert numpy.int32(2147483648) == -2147483648
+        assert numpy.int32('2147483647') == 2147483647
+        if sys.maxint > 2 ** 31 - 1:
+            assert numpy.int32(2147483648) == -2147483648
+            assert numpy.int32('2147483648') == -2147483648
+        else:
+            raises(OverflowError, numpy.int32, 2147483648)
+            raises(OverflowError, numpy.int32, '2147483648')
 
     def test_uint32(self):
+        import sys
         import numpypy as numpy
 
-        assert numpy.uint32(4294967295) == 4294967295
-        assert numpy.uint32(4294967296) == 0
+        assert numpy.uint32(10) == 10
+
+        if sys.maxint > 2 ** 31 - 1:
+            assert numpy.uint32(4294967295) == 4294967295
+            assert numpy.uint32(4294967296) == 0
+            assert numpy.uint32('4294967295') == 4294967295
+            assert numpy.uint32('4294967296') == 0
 
     def test_int_(self):
         import numpypy as numpy
@@ -279,8 +303,15 @@
         assert numpy.dtype(numpy.int64).type is numpy.int64
         assert numpy.int64(3) == 3
 
-        assert numpy.int64(9223372036854775807) == 9223372036854775807
+        if sys.maxint >= 2 ** 63 - 1:
+            assert numpy.int64(9223372036854775807) == 9223372036854775807
+            assert numpy.int64('9223372036854775807') == 9223372036854775807
+        else:
+            raises(OverflowError, numpy.int64, 9223372036854775807)
+            raises(OverflowError, numpy.int64, '9223372036854775807')
+        
         raises(OverflowError, numpy.int64, 9223372036854775808)
+        raises(OverflowError, numpy.int64, '9223372036854775808')
 
     def test_uint64(self):
         import sys
@@ -304,6 +335,8 @@
         assert numpy.float32.mro() == [numpy.float32, numpy.floating, numpy.inexact, numpy.number, numpy.generic, object]
 
         assert numpy.float32(12) == numpy.float64(12)
+        assert numpy.float32('23.4') == numpy.float32(23.4)
+        raises(ValueError, numpy.float32, '23.2df')
 
     def test_float64(self):
         import numpypy as numpy
@@ -315,6 +348,8 @@
         assert numpy.dtype(float).type is numpy.float64
 
         assert numpy.float64(2.0) == 2.0
+        assert numpy.float64('23.4') == numpy.float64(23.4)
+        raises(ValueError, numpy.float64, '23.2df')
 
     def test_subclass_type(self):
         import numpypy as numpy
diff --git a/pypy/module/micronumpy/test/test_numarray.py b/pypy/module/micronumpy/test/test_numarray.py
--- a/pypy/module/micronumpy/test/test_numarray.py
+++ b/pypy/module/micronumpy/test/test_numarray.py
@@ -491,6 +491,11 @@
         for i in range(5):
             assert b[i] == i - 5
 
+    def test_scalar_subtract(self):
+        from numpypy import int32
+        assert int32(2) - 1 == 1
+        assert 1 - int32(2) == -1
+
     def test_mul(self):
         import numpypy
 
@@ -722,6 +727,26 @@
         a = array([True] * 5, bool)
         assert a.sum() == 5
 
+    def test_identity(self):
+        from numpypy import identity, array
+        from numpypy import int32, float64, dtype
+        a = identity(0)
+        assert len(a) == 0
+        assert a.dtype == dtype('float64')
+        assert a.shape == (0,0)
+        b = identity(1, dtype=int32)
+        assert len(b) == 1
+        assert b[0][0] == 1
+        assert b.shape == (1,1)
+        assert b.dtype == dtype('int32')
+        c = identity(2)
+        assert c.shape == (2,2)
+        assert (c == [[1,0],[0,1]]).all()
+        d = identity(3, dtype='int32')
+        assert d.shape == (3,3)
+        assert d.dtype == dtype('int32')
+        assert (d == [[1,0,0],[0,1,0],[0,0,1]]).all()
+
     def test_prod(self):
         from numpypy import array
         a = array(range(1, 6))
@@ -868,16 +893,17 @@
 
     def test_debug_repr(self):
         from numpypy import zeros, sin
+        from numpypy.pypy import debug_repr
         a = zeros(1)
-        assert a.__debug_repr__() == 'Array'
-        assert (a + a).__debug_repr__() == 'Call2(add, Array, Array)'
-        assert (a[::2]).__debug_repr__() == 'Slice(Array)'
-        assert (a + 2).__debug_repr__() == 'Call2(add, Array, Scalar)'
-        assert (a + a.flat).__debug_repr__() == 'Call2(add, Array, FlatIter(Array))'
-        assert sin(a).__debug_repr__() == 'Call1(sin, Array)'
+        assert debug_repr(a) == 'Array'
+        assert debug_repr(a + a) == 'Call2(add, Array, Array)'
+        assert debug_repr(a[::2]) == 'Slice(Array)'
+        assert debug_repr(a + 2) == 'Call2(add, Array, Scalar)'
+        assert debug_repr(a + a.flat) == 'Call2(add, Array, FlatIter(Array))'
+        assert debug_repr(sin(a)) == 'Call1(sin, Array)'
         b = a + a
         b[0] = 3
-        assert b.__debug_repr__() == 'Call2(add, forced=Array)'
+        assert debug_repr(b) == 'Call2(add, forced=Array)'
 
     def test_tolist_scalar(self):
         from numpypy import int32, bool_
@@ -1168,13 +1194,110 @@
         import struct
         BaseNumpyAppTest.setup_class.im_func(cls)
         cls.w_data = cls.space.wrap(struct.pack('dddd', 1, 2, 3, 4))
+        cls.w_fdata = cls.space.wrap(struct.pack('f', 2.3))
+        cls.w_float32val = cls.space.wrap(struct.pack('f', 5.2))
+        cls.w_float64val = cls.space.wrap(struct.pack('d', 300.4))
+        cls.w_ulongval = cls.space.wrap(struct.pack('L', 12))
 
     def test_fromstring(self):
-        from numpypy import fromstring
+        import sys
+        from numpypy import fromstring, array, uint8, float32, int32
+
         a = fromstring(self.data)
         for i in range(4):
             assert a[i] == i + 1
-        raises(ValueError, fromstring, "abc")
+        b = fromstring('\x01\x02', dtype=uint8)
+        assert a[0] == 1
+        assert a[1] == 2
+        c = fromstring(self.fdata, dtype=float32)
+        assert c[0] == float32(2.3)
+        d = fromstring("1 2", sep=' ', count=2, dtype=uint8)
+        assert len(d) == 2
+        assert d[0] == 1
+        assert d[1] == 2
+        e = fromstring('3, 4,5', dtype=uint8, sep=',')
+        assert len(e) == 3
+        assert e[0] == 3
+        assert e[1] == 4
+        assert e[2] == 5
+        f = fromstring('\x01\x02\x03\x04\x05', dtype=uint8, count=3)
+        assert len(f) == 3
+        assert f[0] == 1
+        assert f[1] == 2
+        assert f[2] == 3
+        g = fromstring("1  2    3 ", dtype=uint8, sep=" ")
+        assert len(g) == 3
+        assert g[0] == 1
+        assert g[1] == 2
+        assert g[2] == 3
+        h = fromstring("1, , 2, 3", dtype=uint8, sep=",")
+        assert (h == [1,0,2,3]).all()
+        i = fromstring("1    2 3", dtype=uint8, sep=" ")
+        assert (i == [1,2,3]).all()
+        j = fromstring("1\t\t\t\t2\t3", dtype=uint8, sep="\t")
+        assert (j == [1,2,3]).all()
+        k = fromstring("1,x,2,3", dtype=uint8, sep=",")
+        assert (k == [1,0]).all()
+        l = fromstring("1,x,2,3", dtype='float32', sep=",")
+        assert (l == [1.0,-1.0]).all()
+        m = fromstring("1,,2,3", sep=",")
+        assert (m == [1.0,-1.0,2.0,3.0]).all()
+        n = fromstring("3.4 2.0 3.8 2.2", dtype=int32, sep=" ")
+        assert (n == [3]).all()
+        o = fromstring("1.0 2f.0f 3.8 2.2", dtype=float32, sep=" ")
+        assert len(o) == 2
+        assert o[0] == 1.0
+        assert o[1] == 2.0
+        p = fromstring("1.0,,2.0,3.0", sep=",")
+        assert (p == [1.0, -1.0, 2.0, 3.0]).all()
+        q = fromstring("1.0,,2.0,3.0", sep=" ")
+        assert (q == [1.0]).all()
+        r = fromstring("\x01\x00\x02", dtype='bool')
+        assert (r == [True, False, True]).all()
+        s = fromstring("1,2,3,,5", dtype=bool, sep=",")
+        assert (s == [True, True, True, False, True]).all()
+        t = fromstring("", bool)
+        assert (t == []).all()
+        u = fromstring("\x01\x00\x00\x00\x00\x00\x00\x00", dtype=int)
+        if sys.maxint > 2 ** 31 - 1:
+            assert (u == [1]).all()
+        else:
+            assert (u == [1, 0]).all()
+
+    def test_fromstring_types(self):
+        from numpypy import (fromstring, int8, int16, int32, int64, uint8,
+            uint16, uint32, float32, float64)
+
+        a = fromstring('\xFF', dtype=int8)
+        assert a[0] == -1
+        b = fromstring('\xFF', dtype=uint8)
+        assert b[0] == 255
+        c = fromstring('\xFF\xFF', dtype=int16)
+        assert c[0] == -1
+        d = fromstring('\xFF\xFF', dtype=uint16)
+        assert d[0] == 65535
+        e = fromstring('\xFF\xFF\xFF\xFF', dtype=int32)
+        assert e[0] == -1
+        f = fromstring('\xFF\xFF\xFF\xFF', dtype=uint32)
+        assert repr(f[0]) == '4294967295'
+        g = fromstring('\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF', dtype=int64)
+        assert g[0] == -1
+        h = fromstring(self.float32val, dtype=float32)
+        assert h[0] == float32(5.2)
+        i = fromstring(self.float64val, dtype=float64)
+        assert i[0] == float64(300.4)
+        j = fromstring(self.ulongval, dtype='L')
+        assert j[0] == 12
+
+
+    def test_fromstring_invalid(self):
+        from numpypy import fromstring, uint16, uint8, int32
+        #default dtype is 64-bit float, so 3 bytes should fail
+        raises(ValueError, fromstring, "\x01\x02\x03")
+        #3 bytes is not modulo 2 bytes (int16)
+        raises(ValueError, fromstring, "\x01\x03\x03", dtype=uint16)
+        #5 bytes is larger than 3 bytes
+        raises(ValueError, fromstring, "\x01\x02\x03", count=5, dtype=uint8)
 
 
 class AppTestRepr(BaseNumpyAppTest):
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -250,22 +250,6 @@
                                 'int_ge': 1, 'guard_false': 1,
                                 'jump': 1})
 
-    def define_slice2():
-        return """
-        a = |30|
-        s1 = a -> :20:2
-        s2 = a -> :30:3
-        b = s1 + s2
-        b -> 3
-        """
-
-    def test_slice2(self):
-        result = self.run("slice2")
-        assert result == 15
-        self.check_simple_loop({'getinteriorfield_raw': 2, 'float_add': 1,
-                                'setinteriorfield_raw': 1, 'int_add': 3,
-                                'int_ge': 1, 'guard_false': 1, 'jump': 1})
-
     def define_multidim():
         return """
         a = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
diff --git a/pypy/module/micronumpy/types.py b/pypy/module/micronumpy/types.py
--- a/pypy/module/micronumpy/types.py
+++ b/pypy/module/micronumpy/types.py
@@ -8,6 +8,7 @@
 from pypy.rlib.objectmodel import specialize
 from pypy.rlib.rarithmetic import LONG_BIT, widen
 from pypy.rpython.lltypesystem import lltype, rffi
+from pypy.rlib.rstruct.runpack import runpack
 
 
 def simple_unary_op(func):
@@ -55,6 +56,7 @@
 
 class Primitive(object):
     _mixin_ = True
+
     def get_element_size(self):
         return rffi.sizeof(self.T)
 
@@ -84,6 +86,9 @@
     def _coerce(self, space, w_item):
         raise NotImplementedError
 
+    def default_fromstring(self, space):
+        raise NotImplementedError
+
     def read(self, storage, width, i, offset):
         return self.box(libffi.array_getitem(clibffi.cast_type_to_ffitype(self.T),
             width, storage, i, offset
@@ -102,6 +107,9 @@
                 width, storage, i, offset, value
             )
 
+    def runpack_str(self, s):
+        return self.box(runpack(self.format_code, s))
+
     @simple_binary_op
     def add(self, v1, v2):
         return v1 + v2
@@ -164,6 +172,7 @@
 class Bool(BaseType, Primitive):
     T = lltype.Bool
     BoxType = interp_boxes.W_BoolBox
+    format_code = "?"
 
     True = BoxType(True)
     False = BoxType(False)
@@ -193,11 +202,14 @@
     def for_computation(self, v):
         return int(v)
 
+    def default_fromstring(self, space):
+        return self.box(False)
+
 class Integer(Primitive):
     _mixin_ = True
 
     def _coerce(self, space, w_item):
-        return self.box(space.int_w(space.int(w_item)))
+        return self.box(space.int_w(space.call_function(space.w_int, w_item)))
 
     def str_format(self, box):
         value = self.unbox(box)
@@ -206,6 +218,9 @@
     def for_computation(self, v):
         return widen(v)
 
+    def default_fromstring(self, space):
+        return self.box(0)
+
     @simple_binary_op
     def div(self, v1, v2):
         if v2 == 0:
@@ -241,42 +256,52 @@
 class Int8(BaseType, Integer):
     T = rffi.SIGNEDCHAR
     BoxType = interp_boxes.W_Int8Box
+    format_code = "b"
 
 class UInt8(BaseType, Integer):
     T = rffi.UCHAR
     BoxType = interp_boxes.W_UInt8Box
+    format_code = "B"
 
 class Int16(BaseType, Integer):
     T = rffi.SHORT
     BoxType = interp_boxes.W_Int16Box
+    format_code = "h"
 
 class UInt16(BaseType, Integer):
     T = rffi.USHORT
     BoxType = interp_boxes.W_UInt16Box
+    format_code = "H"
 
 class Int32(BaseType, Integer):
     T = rffi.INT
     BoxType = interp_boxes.W_Int32Box
+    format_code = "i"
 
 class UInt32(BaseType, Integer):
     T = rffi.UINT
     BoxType = interp_boxes.W_UInt32Box
+    format_code = "I"
 
 class Long(BaseType, Integer):
     T = rffi.LONG
     BoxType = interp_boxes.W_LongBox
+    format_code = "l"
 
 class ULong(BaseType, Integer):
     T = rffi.ULONG
     BoxType = interp_boxes.W_ULongBox
+    format_code = "L"
 
 class Int64(BaseType, Integer):
     T = rffi.LONGLONG
     BoxType = interp_boxes.W_Int64Box
+    format_code = "q"
 
 class UInt64(BaseType, Integer):
     T = rffi.ULONGLONG
     BoxType = interp_boxes.W_UInt64Box
+    format_code = "Q"
 
     def _coerce(self, space, w_item):
         try:
@@ -295,7 +320,7 @@
     _mixin_ = True
 
     def _coerce(self, space, w_item):
-        return self.box(space.float_w(space.float(w_item)))
+        return self.box(space.float_w(space.call_function(space.w_float, w_item)))
 
     def str_format(self, box):
         value = self.unbox(box)
@@ -304,6 +329,9 @@
     def for_computation(self, v):
         return float(v)
 
+    def default_fromstring(self, space):
+        return self.box(-1.0)
+
     @simple_binary_op
     def div(self, v1, v2):
         try:
@@ -403,7 +431,9 @@
 class Float32(BaseType, Float):
     T = rffi.FLOAT
     BoxType = interp_boxes.W_Float32Box
+    format_code = "f"
 
 class Float64(BaseType, Float):
     T = rffi.DOUBLE
-    BoxType = interp_boxes.W_Float64Box
\ No newline at end of file
+    BoxType = interp_boxes.W_Float64Box
+    format_code = "d"
\ No newline at end of file
diff --git a/pypy/module/posix/test/test_posix2.py b/pypy/module/posix/test/test_posix2.py
--- a/pypy/module/posix/test/test_posix2.py
+++ b/pypy/module/posix/test/test_posix2.py
@@ -656,7 +656,11 @@
                 os.fsync(f)     # <- should also work with a file, or anything
             finally:            #    with a fileno() method
                 f.close()
-            raises(OSError, os.fsync, fd)
+            try:
+                # May not raise anything with a buggy libc (or eatmydata)
+                os.fsync(fd)
+            except OSError:
+                pass
             raises(ValueError, os.fsync, -1)
 
     if hasattr(os, 'fdatasync'):
@@ -668,7 +672,11 @@
                 os.fdatasync(fd)
             finally:
                 f.close()
-            raises(OSError, os.fdatasync, fd)
+            try:
+                # May not raise anything with a buggy libc (or eatmydata)
+                os.fdatasync(fd)
+            except OSError:
+                pass
             raises(ValueError, os.fdatasync, -1)
 
     if hasattr(os, 'fchdir'):
diff --git a/pypy/module/pypyjit/test_pypy_c/model.py b/pypy/module/pypyjit/test_pypy_c/model.py
--- a/pypy/module/pypyjit/test_pypy_c/model.py
+++ b/pypy/module/pypyjit/test_pypy_c/model.py
@@ -210,9 +210,9 @@
     def entry_bridge_ops(self, *args, **kwds):
         ops = list(self._allops(*args, **kwds))
         labels = [op for op in ops if op.name == 'label']
-        assert ops.index(labels[0]) == 0
-        i = ops.index(labels[1])
-        return ops[1:i]
+        i0 = ops.index(labels[0])
+        i1 = ops.index(labels[1])
+        return ops[i0+1:i1]
 
     @property
     def chunks(self):
@@ -409,7 +409,7 @@
         """
         iter_exp_ops = iter(expected_ops)
         iter_ops = RevertableIterator(self.ops)
-        for opindex, exp_op in enumerate(iter_exp_ops):
+        for exp_op in iter_exp_ops:
             try:
                 if exp_op == '...':
                     # loop until we find an operation which matches
@@ -430,7 +430,7 @@
                 if exp_op[4] is False:    # optional operation
                     iter_ops.revert_one()
                     continue       # try to match with the next exp_op
-                e.opindex = opindex
+                e.opindex = iter_ops.index - 1
                 raise
         #
         # make sure we exhausted iter_ops
diff --git a/pypy/module/pypyjit/test_pypy_c/test_00_model.py b/pypy/module/pypyjit/test_pypy_c/test_00_model.py
--- a/pypy/module/pypyjit/test_pypy_c/test_00_model.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_00_model.py
@@ -45,8 +45,10 @@
         cmdline = [sys.executable]
         if not import_site:
             cmdline.append('-S')
-        for key, value in jitopts.iteritems():
-            cmdline += ['--jit', '%s=%s' % (key, value)]
+        if jitopts:
+            jitcmdline = ['%s=%s' % (key, value)
+                          for key, value in jitopts.items()]
+            cmdline += ['--jit', ','.join(jitcmdline)]
         cmdline.append(str(self.filepath))
         #
         print cmdline, logfile
diff --git a/pypy/module/pypyjit/test_pypy_c/test_generators.py b/pypy/module/pypyjit/test_pypy_c/test_generators.py
--- a/pypy/module/pypyjit/test_pypy_c/test_generators.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_generators.py
@@ -6,6 +6,8 @@
         def main(n):
             def f():
                 for i in range(10000):
+                    i -= 1
+                    i -= 42    # ID: subtract
                     yield i
 
             def g():
@@ -15,10 +17,15 @@
             g()
 
         log = self.run(main, [500])
-        loop, = log.loops_by_filename(self.filepath, is_entry_bridge='*')
+        # XXX XXX this test fails so far because of a detail that
+        # changed with jit-simplify-backendintf.  We should try to
+        # think of a way to be more resistant against such details.
+        # The issue is that we now get one Tracing, then go back
+        # to the interpreter hoping to immediately run the JITted
+        # code; but instead, we Trace again, just because another
+        # counter was also about to reach its limit...
+        loop, = log.loops_by_filename(self.filepath)
         assert loop.match_by_id("generator", """
-            ...
-            label(..., descr=...)
             i16 = force_token()
             p45 = new_with_vtable(ConstClass(W_IntObject))
             setfield_gc(p45, i29, descr=<SignedFieldDescr .*>)
@@ -26,3 +33,8 @@
             i47 = arraylen_gc(p8, descr=<GcPtrArrayDescr>) # Should be removed by backend
             jump(..., descr=...)
             """)
+        assert loop.match_by_id("subtract", """
+            setfield_gc(p7, 35, descr=<.*last_instr .*>)      # XXX bad, kill me
+            i2 = int_sub_ovf(i1, 42)
+            guard_no_overflow(descr=...)
+            """)
diff --git a/pypy/objspace/std/complexobject.py b/pypy/objspace/std/complexobject.py
--- a/pypy/objspace/std/complexobject.py
+++ b/pypy/objspace/std/complexobject.py
@@ -31,9 +31,9 @@
         imag2 = float2longlong(imag2)
         return real1 == real2 and imag1 == imag2
 
-    def unique_id(self, space):
+    def immutable_unique_id(self, space):
         if self.user_overridden_class:
-            return W_Object.unique_id(self, space)
+            return None
         from pypy.rlib.longlong2float import float2longlong
         from pypy.objspace.std.model import IDTAG_COMPLEX as tag
         real = space.float_w(space.getattr(self, space.wrap("real")))
diff --git a/pypy/objspace/std/floatobject.py b/pypy/objspace/std/floatobject.py
--- a/pypy/objspace/std/floatobject.py
+++ b/pypy/objspace/std/floatobject.py
@@ -34,9 +34,9 @@
         two = float2longlong(space.float_w(w_other))
         return one == two
 
-    def unique_id(self, space):
+    def immutable_unique_id(self, space):
         if self.user_overridden_class:
-            return W_Object.unique_id(self, space)
+            return None
         from pypy.rlib.longlong2float import float2longlong
         from pypy.objspace.std.model import IDTAG_FLOAT as tag
         val = float2longlong(space.float_w(self))
diff --git a/pypy/objspace/std/intobject.py b/pypy/objspace/std/intobject.py
--- a/pypy/objspace/std/intobject.py
+++ b/pypy/objspace/std/intobject.py
@@ -26,9 +26,9 @@
             return self is w_other
         return space.int_w(self) == space.int_w(w_other)
 
-    def unique_id(self, space):
+    def immutable_unique_id(self, space):
         if self.user_overridden_class:
-            return W_Object.unique_id(self, space)
+            return None
         from pypy.objspace.std.model import IDTAG_INT as tag
         b = space.bigint_w(self)
         b = b.lshift(3).or_(rbigint.fromint(tag))
diff --git a/pypy/objspace/std/longobject.py b/pypy/objspace/std/longobject.py
--- a/pypy/objspace/std/longobject.py
+++ b/pypy/objspace/std/longobject.py
@@ -18,9 +18,9 @@
             return self is w_other
         return space.bigint_w(self).eq(space.bigint_w(w_other))
 
-    def unique_id(self, space):
+    def immutable_unique_id(self, space):
         if self.user_overridden_class:
-            return W_Object.unique_id(self, space)
+            return None
         from pypy.objspace.std.model import IDTAG_LONG as tag
         b = space.bigint_w(self)
         b = b.lshift(3).or_(rbigint.fromint(tag))
diff --git a/pypy/objspace/std/specialisedtupleobject.py b/pypy/objspace/std/specialisedtupleobject.py
--- a/pypy/objspace/std/specialisedtupleobject.py
+++ b/pypy/objspace/std/specialisedtupleobject.py
@@ -177,52 +177,55 @@
 
 _specialisations = []
 Cls_ii = make_specialised_class((int, int))
-Cls_is = make_specialised_class((int, str))
-Cls_io = make_specialised_class((int, object))
-Cls_si = make_specialised_class((str, int))
-Cls_ss = make_specialised_class((str, str))
-Cls_so = make_specialised_class((str, object))
-Cls_oi = make_specialised_class((object, int))
-Cls_os = make_specialised_class((object, str))
+#Cls_is = make_specialised_class((int, str))
+#Cls_io = make_specialised_class((int, object))
+#Cls_si = make_specialised_class((str, int))
+#Cls_ss = make_specialised_class((str, str))
+#Cls_so = make_specialised_class((str, object))
+#Cls_oi = make_specialised_class((object, int))
+#Cls_os = make_specialised_class((object, str))
 Cls_oo = make_specialised_class((object, object))
 Cls_ff = make_specialised_class((float, float))
-Cls_ooo = make_specialised_class((object, object, object))
+#Cls_ooo = make_specialised_class((object, object, object))
 
 def makespecialisedtuple(space, list_w):
     if len(list_w) == 2:
         w_arg1, w_arg2 = list_w
         w_type1 = space.type(w_arg1)
-        w_type2 = space.type(w_arg2)
+        #w_type2 = space.type(w_arg2)
         #
         if w_type1 is space.w_int:
+            w_type2 = space.type(w_arg2)
             if w_type2 is space.w_int:
                 return Cls_ii(space, w_arg1, w_arg2)
-            elif w_type2 is space.w_str:
-                return Cls_is(space, w_arg1, w_arg2)
-            else:
-                return Cls_io(space, w_arg1, w_arg2)
+            #elif w_type2 is space.w_str:
+            #    return Cls_is(space, w_arg1, w_arg2)
+            #else:
+            #    return Cls_io(space, w_arg1, w_arg2)
         #
-        elif w_type1 is space.w_str:
-            if w_type2 is space.w_int:
-                return Cls_si(space, w_arg1, w_arg2)
-            elif w_type2 is space.w_str:
-                return Cls_ss(space, w_arg1, w_arg2)
-            else:
-                return Cls_so(space, w_arg1, w_arg2)
+        #elif w_type1 is space.w_str:
+        #    if w_type2 is space.w_int:
+        #        return Cls_si(space, w_arg1, w_arg2)
+        #    elif w_type2 is space.w_str:
+        #        return Cls_ss(space, w_arg1, w_arg2)
+        #    else:
+        #        return Cls_so(space, w_arg1, w_arg2)
         #
-        elif w_type1 is space.w_float and w_type2 is space.w_float:
-            return Cls_ff(space, w_arg1, w_arg2)
+        elif w_type1 is space.w_float:
+            w_type2 = space.type(w_arg2)
+            if w_type2 is space.w_float:
+                return Cls_ff(space, w_arg1, w_arg2)
         #
-        else:
-            if w_type2 is space.w_int:
-                return Cls_oi(space, w_arg1, w_arg2)
-            elif w_type2 is space.w_str:
-                return Cls_os(space, w_arg1, w_arg2)
-            else:
-                return Cls_oo(space, w_arg1, w_arg2)
+        #else:
+        #    if w_type2 is space.w_int:
+        #        return Cls_oi(space, w_arg1, w_arg2)
+        #    elif w_type2 is space.w_str:
+        #        return Cls_os(space, w_arg1, w_arg2)
+        #    else:
+        return Cls_oo(space, w_arg1, w_arg2)
         #
-    elif len(list_w) == 3:
-        return Cls_ooo(space, list_w[0], list_w[1], list_w[2])
+    #elif len(list_w) == 3:
+    #    return Cls_ooo(space, list_w[0], list_w[1], list_w[2])
     else:
         raise NotSpecialised
 
diff --git a/pypy/objspace/std/stringobject.py b/pypy/objspace/std/stringobject.py
--- a/pypy/objspace/std/stringobject.py
+++ b/pypy/objspace/std/stringobject.py
@@ -32,9 +32,9 @@
             return False
         return space.str_w(self) is space.str_w(w_other)
 
-    def unique_id(self, space):
+    def immutable_unique_id(self, space):
         if self.user_overridden_class:
-            return W_Object.unique_id(self, space)
+            return None
         return space.wrap(compute_unique_id(space.str_w(self)))
 
 
@@ -514,44 +514,41 @@
     if maxsplit == 0:
         return space.wrap(input)
 
-    #print "from replace, input: %s, sub: %s, by: %s" % (input, sub, by)
+    # An ok guess at the default size
+    builder = StringBuilder(len(input))
+    first = True
 
     if not sub:
         upper = len(input)
         if maxsplit > 0 and maxsplit < upper + 2:
             upper = maxsplit - 1
             assert upper >= 0
-        substrings_w = [""]
+        first = False
         for i in range(upper):
-            c = input[i]
-            substrings_w.append(c)
-        substrings_w.append(input[upper:])
+            builder.append(by)
+            builder.append(input[i])
+        builder.append(by)
+        builder.append_slice(input, upper, len(input))
     else:
         start = 0
         sublen = len(sub)
-        substrings_w = []
 
         while maxsplit != 0:
             next = input.find(sub, start)
             if next < 0:
                 break
-            substrings_w.append(input[start:next])
+            if not first:
+                builder.append(by)
+            first = False
+            builder.append_slice(input, start, next)
             start = next + sublen
             maxsplit -= 1   # NB. if it's already < 0, it stays < 0
 
-        substrings_w.append(input[start:])
+        if not first:
+            builder.append(by)
+        builder.append_slice(input, start, len(input))
 
-    try:
-        # XXX conservative estimate. If your strings are that close
-        # to overflowing, bad luck.
-        one = ovfcheck(len(substrings_w) * len(by))
-        ovfcheck(one + len(input))
-    except OverflowError:
-        raise OperationError(
-            space.w_OverflowError,
-            space.wrap("replace string is too long"))
-
-    return space.wrap(by.join(substrings_w))
+    return space.wrap(builder.build())
 
 
 def str_replace__String_ANY_ANY_ANY(space, w_self, w_sub, w_by, w_maxsplit):
diff --git a/pypy/objspace/std/test/test_obj.py b/pypy/objspace/std/test/test_obj.py
--- a/pypy/objspace/std/test/test_obj.py
+++ b/pypy/objspace/std/test/test_obj.py
@@ -253,6 +253,12 @@
         y = 2j
         assert id(x) != id(y)
 
+    def test_object_hash_immutable(self):
+        x = 42
+        y = 40
+        y += 2
+        assert object.__hash__(x) == object.__hash__(y)
+
 
 def test_isinstance_shortcut():
     from pypy.objspace.std import objspace
diff --git a/pypy/objspace/std/test/test_specialisedtupleobject.py b/pypy/objspace/std/test/test_specialisedtupleobject.py
--- a/pypy/objspace/std/test/test_specialisedtupleobject.py
+++ b/pypy/objspace/std/test/test_specialisedtupleobject.py
@@ -33,15 +33,15 @@
         N_space = gettestobjspace(**{"objspace.std.withspecialisedtuple": False})
         S_space = gettestobjspace(**{"objspace.std.withspecialisedtuple": True})
         
-        def hash_test(values):
+        def hash_test(values, must_be_specialized=True):
             N_values_w = [N_space.wrap(value) for value in values]
             S_values_w = [S_space.wrap(value) for value in values]
             N_w_tuple = N_space.newtuple(N_values_w)
             S_w_tuple = S_space.newtuple(S_values_w)
-    
-            assert isinstance(S_w_tuple, W_SpecialisedTupleObject)
+
+            if must_be_specialized:
+                assert isinstance(S_w_tuple, W_SpecialisedTupleObject)
             assert isinstance(N_w_tuple, W_TupleObject)
-            assert not N_space.is_true(N_space.eq(N_w_tuple, S_w_tuple))
             assert S_space.is_true(S_space.eq(N_w_tuple, S_w_tuple))
             assert S_space.is_true(S_space.eq(N_space.hash(N_w_tuple), S_space.hash(S_w_tuple)))
 
@@ -53,7 +53,7 @@
         hash_test([1,(1,2)])
         hash_test([1,('a',2)])
         hash_test([1,()])
-        hash_test([1,2,3])
+        hash_test([1,2,3], must_be_specialized=False)
 
 
 class AppTestW_SpecialisedTupleObject:
@@ -83,6 +83,8 @@
         return ("SpecialisedTupleObject" + expected) in r
 
     def test_createspecialisedtuple(self):
+        have = ['ii', 'ff', 'oo']
+        #
         spec = {int: 'i',
                 float: 'f',
                 str: 's',
@@ -92,14 +94,14 @@
             for y in [43, 4.3, "bar", []]:
                 expected1 = spec[type(x)]
                 expected2 = spec[type(y)]
-                if (expected1 == 'f') ^ (expected2 == 'f'):
-                    if expected1 == 'f': expected1 = 'o'
-                    if expected2 == 'f': expected2 = 'o'
+                if expected1 + expected2 not in have:
+                    expected1 = expected2 = 'o'
                 obj = (x, y)
                 assert self.isspecialised(obj, '_' + expected1 + expected2)
         #
-        obj = (1, 2, 3)
-        assert self.isspecialised(obj, '_ooo')
+        if 'ooo' in have:
+            obj = (1, 2, 3)
+            assert self.isspecialised(obj, '_ooo')
 
     def test_delegation(self):
         t = self.forbid_delegation((42, 43))
@@ -214,6 +216,8 @@
         raises(IndexError, "t[-3]")
 
     def test_three_tuples(self):
+        if not self.isspecialised((1, 2, 3)):
+            skip("don't have specialization for 3-tuples")
         b = self.forbid_delegation((1, 2, 3))
         c = (1,)
         d = c + (2, 3)
@@ -221,6 +225,16 @@
         assert b == d
 
     def test_mongrel(self):
+        a = self.forbid_delegation((2.2, '333'))
+        assert self.isspecialised(a)
+        assert len(a) == 2
+        assert a[0] == 2.2 and a[1] == '333'
+        b = ('333',)
+        assert a == (2.2,) + b
+        assert not a != (2.2,) + b
+        #
+        if not self.isspecialised((1, 2, 3)):
+            skip("don't have specialization for 3-tuples")
         a = self.forbid_delegation((1, 2.2, '333'))
         assert self.isspecialised(a)
         assert len(a) == 3
diff --git a/pypy/objspace/std/typetype.py b/pypy/objspace/std/typetype.py
--- a/pypy/objspace/std/typetype.py
+++ b/pypy/objspace/std/typetype.py
@@ -10,7 +10,6 @@
     w_dict=gateway.NoneNotWrapped):
 
     "This is used to create user-defined classes only."
-    from pypy.objspace.std.typeobject import W_TypeObject
     # XXX check types
 
     w_typetype = _precheck_for_new(space, w_typetype)
@@ -19,10 +18,18 @@
     if (space.is_w(space.type(w_typetype), space.w_type) and w_bases is None and
         w_dict is None):
         return space.type(w_name)
-    elif w_bases is None or w_dict is None:
+    else:
+        return _create_new_type(space, w_typetype, w_name, w_bases, w_dict)
+
+
+def _create_new_type(space, w_typetype, w_name, w_bases, w_dict):
+    # this is in its own function because we want the special case 'type(x)'
+    # above to be seen by the jit.
+    from pypy.objspace.std.typeobject import W_TypeObject
+
+    if w_bases is None or w_dict is None:
         raise OperationError(space.w_TypeError, space.wrap("type() takes 1 or 3 arguments"))
 
-
     bases_w = space.fixedview(w_bases)
 
     w_winner = w_typetype
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -32,9 +32,9 @@
             return False
         return space.unicode_w(self) is space.unicode_w(w_other)
 
-    def unique_id(self, space):
+    def immutable_unique_id(self, space):
         if self.user_overridden_class:
-            return W_Object.unique_id(self, space)
+            return None
         return space.wrap(compute_unique_id(space.unicode_w(self)))
 
 
diff --git a/pypy/rlib/jit.py b/pypy/rlib/jit.py
--- a/pypy/rlib/jit.py
+++ b/pypy/rlib/jit.py
@@ -395,6 +395,7 @@
               'retrace_limit': 5,
               'max_retrace_guards': 15,
               'enable_opts': 'all',
+              'decay_halflife': 40,
               }
 unroll_parameters = unrolling_iterable(PARAMETERS.items())
 DEFAULT = object()
diff --git a/pypy/rlib/longlong2float.py b/pypy/rlib/longlong2float.py
--- a/pypy/rlib/longlong2float.py
+++ b/pypy/rlib/longlong2float.py
@@ -79,19 +79,19 @@
 longlong2float = rffi.llexternal(
     "pypy__longlong2float", [rffi.LONGLONG], rffi.DOUBLE,
     _callable=longlong2float_emulator, compilation_info=eci,
-    _nowrapper=True, elidable_function=True)
+    _nowrapper=True, elidable_function=True, sandboxsafe=True)
 
 float2longlong = rffi.llexternal(
     "pypy__float2longlong", [rffi.DOUBLE], rffi.LONGLONG,
     _callable=float2longlong_emulator, compilation_info=eci,
-    _nowrapper=True, elidable_function=True)
+    _nowrapper=True, elidable_function=True, sandboxsafe=True)
 
 uint2singlefloat = rffi.llexternal(
     "pypy__uint2singlefloat", [rffi.UINT], rffi.FLOAT,
     _callable=uint2singlefloat_emulator, compilation_info=eci,
-    _nowrapper=True, elidable_function=True)
+    _nowrapper=True, elidable_function=True, sandboxsafe=True)
 
 singlefloat2uint = rffi.llexternal(
     "pypy__singlefloat2uint", [rffi.FLOAT], rffi.UINT,
     _callable=singlefloat2uint_emulator, compilation_info=eci,
-    _nowrapper=True, elidable_function=True)
+    _nowrapper=True, elidable_function=True, sandboxsafe=True)
diff --git a/pypy/rpython/lltypesystem/rffi.py b/pypy/rpython/lltypesystem/rffi.py
--- a/pypy/rpython/lltypesystem/rffi.py
+++ b/pypy/rpython/lltypesystem/rffi.py
@@ -16,6 +16,7 @@
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rlib.objectmodel import we_are_translated
 from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
+from pypy.rlib import jit
 from pypy.rpython.lltypesystem import llmemory
 import os, sys
 
@@ -249,8 +250,7 @@
     wrapper = func_with_new_name(wrapper, name)
 
     if calling_conv != "c":
-        from pypy.rlib.jit import dont_look_inside
-        wrapper = dont_look_inside(wrapper)
+        wrapper = jit.dont_look_inside(wrapper)
 
     return wrapper
 
@@ -697,6 +697,8 @@
         return b.build()
 
     # str -> char*
+    # Can't inline this because of the raw address manipulation.
+    @jit.dont_look_inside
     def get_nonmovingbuffer(data):
         """
         Either returns a non-moving copy or performs neccessary pointer
@@ -717,6 +719,8 @@
     get_nonmovingbuffer._annenforceargs_ = [strtype]
 
     # (str, char*) -> None
+    # Can't inline this because of the raw address manipulation.
+    @jit.dont_look_inside
     def free_nonmovingbuffer(data, buf):
         """
         Either free a non-moving buffer or keep the original storage alive.
diff --git a/pypy/rpython/rint.py b/pypy/rpython/rint.py
--- a/pypy/rpython/rint.py
+++ b/pypy/rpython/rint.py
@@ -126,10 +126,7 @@
     rtype_inplace_rshift = rtype_rshift
 
     def rtype_pow(_, hop):
-        raise MissingRTypeOperation("pow(int, int)"
-                                    " (use float**float instead; it is too"
-                                    " easy to overlook the overflow"
-                                    " issues of int**int)")
+        raise MissingRTypeOperation("'**' not supported in RPython")
 
     rtype_pow_ovf = rtype_pow
     rtype_inplace_pow = rtype_pow
diff --git a/pypy/tool/clean_old_branches.py b/pypy/tool/clean_old_branches.py
--- a/pypy/tool/clean_old_branches.py
+++ b/pypy/tool/clean_old_branches.py
@@ -11,14 +11,17 @@
     sys.exit(1)
 
 def heads(args):
-    g = os.popen(r"hg heads --topo %s --template '{branches} {node|short}\n'"
+    g = os.popen(r"hg heads --topo %s --template '{node|short}:{branches}\n'"
                  % args, 'r')
     result = g.read()
     g.close()
     result = result.splitlines(False)
-    result = [s for s in result
-                if not s.startswith(' ')
-                   and not s.startswith('closed-branches ')]
+    for line in result:
+        if len(line.split(':', 1)) != 2:
+            raise ValueError("'result' contains: %r" % line)
+    result = [s.split(':', 1) for s in result]
+    result = [(head, branch) for (head, branch) in result
+                if branch not in ['', 'closed-branches']]
     return result
 
 all_heads = heads("--closed")
@@ -34,8 +37,7 @@
 
 closed_heads.reverse()
 
-for branch_head in closed_heads:
-    branch, head = branch_head.split()
+for head, branch in closed_heads:
     print '\t', branch
 print
 print 'The branches listed above will be merged to "closed-branches".'
@@ -54,8 +56,7 @@
         print '*** error %r' % (err,)
         sys.exit(1)
 
-for branch_head in closed_heads:
-    branch, head = branch_head.split()
+for head, branch in closed_heads:
     print
     print '***** %s ***** %s *****' % (branch, head)
     do("hg up --clean closed-branches")
diff --git a/pypy/tool/gcc_cache.py b/pypy/tool/gcc_cache.py
--- a/pypy/tool/gcc_cache.py
+++ b/pypy/tool/gcc_cache.py
@@ -11,6 +11,9 @@
     # Import 'platform' every time, the compiler may have been changed
     from pypy.translator.platform import platform
     cache_dir = cache_dir_root.join(cachename).ensure(dir=1)
+    c_files.extend([py.path.local(f) for f in eci.separate_module_files])
+    eci = ExternalCompilationInfo(**eci._copy_attributes())
+    eci.separate_module_files = ()
     filecontents = [c_file.read() for c_file in c_files]
     key = repr((filecontents, eci, platform.key()))
     hash = md5(key).hexdigest()