[pypy-commit] pypy win64-stage1: Merge with default

ctismer noreply at buildbot.pypy.org
Mon Mar 12 23:50:02 CET 2012


Author: Christian Tismer <tismer at stackless.com>
Branch: win64-stage1
Changeset: r53372:958c1647a847
Date: 2012-03-12 15:48 -0700
http://bitbucket.org/pypy/pypy/changeset/958c1647a847/

Log:	Merge with default

diff too long, truncating to 10000 out of 15558 lines

diff --git a/lib-python/modified-2.7/ctypes/__init__.py b/lib-python/modified-2.7/ctypes/__init__.py
--- a/lib-python/modified-2.7/ctypes/__init__.py
+++ b/lib-python/modified-2.7/ctypes/__init__.py
@@ -351,7 +351,7 @@
         self._FuncPtr = _FuncPtr
 
         if handle is None:
-            self._handle = _ffi.CDLL(name)
+            self._handle = _ffi.CDLL(name, mode)
         else:
             self._handle = handle
 
diff --git a/lib-python/modified-2.7/ctypes/test/test_callbacks.py b/lib-python/modified-2.7/ctypes/test/test_callbacks.py
--- a/lib-python/modified-2.7/ctypes/test/test_callbacks.py
+++ b/lib-python/modified-2.7/ctypes/test/test_callbacks.py
@@ -1,5 +1,6 @@
 import unittest
 from ctypes import *
+from ctypes.test import xfail
 import _ctypes_test
 
 class Callbacks(unittest.TestCase):
@@ -98,6 +99,7 @@
 ##        self.check_type(c_char_p, "abc")
 ##        self.check_type(c_char_p, "def")
 
+    @xfail
     def test_pyobject(self):
         o = ()
         from sys import getrefcount as grc
diff --git a/lib-python/modified-2.7/ctypes/test/test_libc.py b/lib-python/modified-2.7/ctypes/test/test_libc.py
--- a/lib-python/modified-2.7/ctypes/test/test_libc.py
+++ b/lib-python/modified-2.7/ctypes/test/test_libc.py
@@ -25,7 +25,10 @@
         lib.my_qsort(chars, len(chars)-1, sizeof(c_char), comparefunc(sort))
         self.assertEqual(chars.raw, "   ,,aaaadmmmnpppsss\x00")
 
-    def test_no_more_xfail(self):
+    def SKIPPED_test_no_more_xfail(self):
+        # We decided to not explicitly support the whole ctypes-2.7
+        # and instead go for a case-by-case, demand-driven approach.
+        # So this test is skipped instead of failing.
         import socket
         import ctypes.test
         self.assertTrue(not hasattr(ctypes.test, 'xfail'),
diff --git a/lib_pypy/distributed/socklayer.py b/lib_pypy/distributed/socklayer.py
--- a/lib_pypy/distributed/socklayer.py
+++ b/lib_pypy/distributed/socklayer.py
@@ -2,7 +2,7 @@
 import py
 from socket import socket
 
-XXX needs import adaptation as 'green' is removed from py lib for years 
+raise ImportError("XXX needs import adaptation as 'green' is removed from py lib for years")
 from py.impl.green.msgstruct import decodemessage, message
 from socket import socket, AF_INET, SOCK_STREAM
 import marshal
diff --git a/py/_code/code.py b/py/_code/code.py
--- a/py/_code/code.py
+++ b/py/_code/code.py
@@ -164,6 +164,7 @@
         #   if something:  # assume this causes a NameError
         #      # _this_ lines and the one
                #        below we don't want from entry.getsource()
+        end = min(end, len(source))
         for i in range(self.lineno, end):
             if source[i].rstrip().endswith(':'):
                 end = i + 1
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -252,6 +252,10 @@
                    "use small tuples",
                    default=False),
 
+        BoolOption("withspecialisedtuple",
+                   "use specialised tuples",
+                   default=False),
+
         BoolOption("withrope", "use ropes as the string implementation",
                    default=False,
                    requires=[("objspace.std.withstrslice", False),
@@ -365,6 +369,7 @@
         config.objspace.std.suggest(optimized_list_getitem=True)
         config.objspace.std.suggest(getattributeshortcut=True)
         config.objspace.std.suggest(newshortcut=True)
+        config.objspace.std.suggest(withspecialisedtuple=True)
         #if not IS_64_BITS:
         #    config.objspace.std.suggest(withsmalllong=True)
 
diff --git a/pypy/conftest.py b/pypy/conftest.py
--- a/pypy/conftest.py
+++ b/pypy/conftest.py
@@ -496,6 +496,17 @@
     def setup(self):
         super(AppClassCollector, self).setup()
         cls = self.obj
+        #
+        # <hack>
+        for name in dir(cls):
+            if name.startswith('test_'):
+                func = getattr(cls, name, None)
+                code = getattr(func, 'func_code', None)
+                if code and code.co_flags & 32:
+                    raise AssertionError("unsupported: %r is a generator "
+                                         "app-level test method" % (name,))
+        # </hack>
+        #
         space = cls.space
         clsname = cls.__name__
         if self.config.option.runappdirect:
diff --git a/pypy/doc/conf.py b/pypy/doc/conf.py
--- a/pypy/doc/conf.py
+++ b/pypy/doc/conf.py
@@ -45,9 +45,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '1.6'
+version = '1.7'
 # The full version, including alpha/beta/rc tags.
-release = '1.6'
+release = '1.7'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/pypy/doc/config/objspace.std.withspecialisedtuple.txt b/pypy/doc/config/objspace.std.withspecialisedtuple.txt
new file mode 100644
--- /dev/null
+++ b/pypy/doc/config/objspace.std.withspecialisedtuple.txt
@@ -0,0 +1,3 @@
+Use "specialized tuples", a custom implementation for some common kinds
+of tuples.  Currently limited to tuples of length 2, in three variants:
+(int, int), (float, float), and a generic (object, object).
diff --git a/pypy/doc/faq.rst b/pypy/doc/faq.rst
--- a/pypy/doc/faq.rst
+++ b/pypy/doc/faq.rst
@@ -112,10 +112,32 @@
 You might be interested in our `benchmarking site`_ and our 
 `jit documentation`_.
 
+Note that the JIT has a very high warm-up cost, meaning that the
+programs are slow at the beginning.  If you want to compare the timings
+with CPython, even relatively simple programs need to run *at least* one
+second, preferably at least a few seconds.  Large, complicated programs
+need even more time to warm up the JIT.
+
 .. _`benchmarking site`: http://speed.pypy.org
 
 .. _`jit documentation`: jit/index.html
 
+---------------------------------------------------------------
+Couldn't the JIT dump and reload already-compiled machine code?
+---------------------------------------------------------------
+
+No, we found no way of doing that.  The JIT generates machine code
+containing a large number of constant addresses --- constant at the time
+the machine code is written.  The vast majority of them are probably not
+constants that you find in the executable, with a nice link name.  E.g.
+the addresses of Python classes are used all the time, but Python
+classes don't come statically from the executable; they are created anew
+every time you restart your program.  This makes saving and reloading
+machine code completely impossible without some very advanced way of
+mapping addresses in the old (now-dead) process to addresses in the new
+process, including checking that all the previous assumptions about the
+(now-dead) object are still true about the new object.
+
 
 .. _`prolog and javascript`:
 
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -190,8 +190,8 @@
     def is_w(self, space, w_other):
         return self is w_other
 
-    def unique_id(self, space):
-        return space.wrap(compute_unique_id(self))
+    def immutable_unique_id(self, space):
+        return None
 
     def str_w(self, space):
         w_msg = typed_unwrap_error_msg(space, "string", self)
@@ -487,6 +487,16 @@
         'parser', 'fcntl', '_codecs', 'binascii'
     ]
 
+    # These modules are treated like CPython treats built-in modules,
+    # i.e. they always shadow any xx.py.  The other modules are treated
+    # like CPython treats extension modules, and are loaded in sys.path
+    # order by the fake entry '.../lib_pypy/__extensions__'.
+    MODULES_THAT_ALWAYS_SHADOW = dict.fromkeys([
+        '__builtin__', '__pypy__', '_ast', '_codecs', '_sre', '_warnings',
+        '_weakref', 'errno', 'exceptions', 'gc', 'imp', 'marshal',
+        'posix', 'nt', 'pwd', 'signal', 'sys', 'thread', 'zipimport',
+    ], None)
+
     def make_builtins(self):
         "NOT_RPYTHON: only for initializing the space."
 
@@ -696,7 +706,10 @@
         return w_two.is_w(self, w_one)
 
     def id(self, w_obj):
-        return w_obj.unique_id(self)
+        w_result = w_obj.immutable_unique_id(self)
+        if w_result is None:
+            w_result = self.wrap(compute_unique_id(w_obj))
+        return w_result
 
     def hash_w(self, w_obj):
         """shortcut for space.int_w(space.hash(w_obj))"""
diff --git a/pypy/interpreter/typedef.py b/pypy/interpreter/typedef.py
--- a/pypy/interpreter/typedef.py
+++ b/pypy/interpreter/typedef.py
@@ -54,7 +54,11 @@
 #  Hash support
 
 def default_identity_hash(space, w_obj):
-    return space.wrap(compute_identity_hash(w_obj))
+    w_unique_id = w_obj.immutable_unique_id(space)
+    if w_unique_id is None:     # common case
+        return space.wrap(compute_identity_hash(w_obj))
+    else:
+        return space.hash(w_unique_id)
 
 # ____________________________________________________________
 #
diff --git a/pypy/jit/backend/llgraph/llimpl.py b/pypy/jit/backend/llgraph/llimpl.py
--- a/pypy/jit/backend/llgraph/llimpl.py
+++ b/pypy/jit/backend/llgraph/llimpl.py
@@ -8,6 +8,7 @@
 from pypy.objspace.flow.model import Variable, Constant
 from pypy.annotation import model as annmodel
 from pypy.jit.metainterp.history import REF, INT, FLOAT
+from pypy.jit.metainterp import history
 from pypy.jit.codewriter import heaptracker
 from pypy.rpython.lltypesystem import lltype, llmemory, rclass, rstr, rffi
 from pypy.rpython.ootypesystem import ootype
@@ -48,6 +49,11 @@
         value._the_opaque_pointer = op
         return op
 
+def _normalize(value):
+    if isinstance(value, lltype._ptr):
+        value = lltype.top_container(value._obj)
+    return value
+
 def from_opaque_string(s):
     if isinstance(s, str):
         return s
@@ -322,6 +328,14 @@
     _variables.append(v)
     return r
 
+def compile_started_vars(clt):
+    if not hasattr(clt, '_debug_argtypes'):    # only when compiling the loop
+        argtypes = [v.concretetype for v in _variables]
+        try:
+            clt._debug_argtypes = argtypes
+        except AttributeError:    # when 'clt' is actually a translated
+            pass                  # GcStruct
+
 def compile_add(loop, opnum):
     loop = _from_opaque(loop)
     loop.operations.append(Operation(opnum))
@@ -347,6 +361,16 @@
     op = loop.operations[-1]
     op.descr = weakref.ref(descr)
 
+TARGET_TOKENS = weakref.WeakKeyDictionary()
+
+def compile_add_target_token(loop, descr, clt):
+    # here, 'clt' is the compiled_loop_token of the original loop that
+    # we are compiling
+    loop = _from_opaque(loop)
+    op = loop.operations[-1]
+    descrobj = _normalize(descr)
+    TARGET_TOKENS[descrobj] = loop, len(loop.operations), op.args, clt
+
 def compile_add_var(loop, intvar):
     loop = _from_opaque(loop)
     op = loop.operations[-1]
@@ -381,13 +405,25 @@
     _variables.append(v)
     return r
 
-def compile_add_jump_target(loop, loop_target):
+def compile_add_jump_target(loop, targettoken, source_clt):
     loop = _from_opaque(loop)
-    loop_target = _from_opaque(loop_target)
+    descrobj = _normalize(targettoken)
+    (loop_target, target_opindex, target_inputargs, target_clt
+        ) = TARGET_TOKENS[descrobj]
+    #
+    try:
+        assert source_clt._debug_argtypes == target_clt._debug_argtypes
+    except AttributeError:   # when translated
+        pass
+    #
     op = loop.operations[-1]
     op.jump_target = loop_target
+    op.jump_target_opindex = target_opindex
+    op.jump_target_inputargs = target_inputargs
     assert op.opnum == rop.JUMP
-    assert len(op.args) == len(loop_target.inputargs)
+    assert [v.concretetype for v in op.args] == (
+           [v.concretetype for v in target_inputargs])
+    #
     if loop_target == loop:
         log.info("compiling new loop")
     else:
@@ -521,10 +557,11 @@
                 self.opindex += 1
                 continue
             if op.opnum == rop.JUMP:
-                assert len(op.jump_target.inputargs) == len(args)
-                self.env = dict(zip(op.jump_target.inputargs, args))
+                inputargs = op.jump_target_inputargs
+                assert len(inputargs) == len(args)
+                self.env = dict(zip(inputargs, args))
                 self.loop = op.jump_target
-                self.opindex = 0
+                self.opindex = op.jump_target_opindex
                 _stats.exec_jumps += 1
             elif op.opnum == rop.FINISH:
                 if self.verbose:
@@ -617,6 +654,15 @@
         #
         return _op_default_implementation
 
+    def op_label(self, _, *args):
+        op = self.loop.operations[self.opindex]
+        assert op.opnum == rop.LABEL
+        assert len(op.args) == len(args)
+        newenv = {}
+        for v, value in zip(op.args, args):
+            newenv[v] = value
+        self.env = newenv
+
     def op_debug_merge_point(self, _, *args):
         from pypy.jit.metainterp.warmspot import get_stats
         try:
@@ -959,6 +1005,7 @@
         self._may_force = self.opindex
         try:
             inpargs = _from_opaque(ctl.compiled_version).inputargs
+            assert len(inpargs) == len(args)
             for i, inparg in enumerate(inpargs):
                 TYPE = inparg.concretetype
                 if TYPE is lltype.Signed:
@@ -1788,9 +1835,11 @@
 setannotation(compile_start_int_var, annmodel.SomeInteger())
 setannotation(compile_start_ref_var, annmodel.SomeInteger())
 setannotation(compile_start_float_var, annmodel.SomeInteger())
+setannotation(compile_started_vars, annmodel.s_None)
 setannotation(compile_add, annmodel.s_None)
 setannotation(compile_add_descr, annmodel.s_None)
 setannotation(compile_add_descr_arg, annmodel.s_None)
+setannotation(compile_add_target_token, annmodel.s_None)
 setannotation(compile_add_var, annmodel.s_None)
 setannotation(compile_add_int_const, annmodel.s_None)
 setannotation(compile_add_ref_const, annmodel.s_None)
diff --git a/pypy/jit/backend/llgraph/runner.py b/pypy/jit/backend/llgraph/runner.py
--- a/pypy/jit/backend/llgraph/runner.py
+++ b/pypy/jit/backend/llgraph/runner.py
@@ -138,29 +138,30 @@
         clt = original_loop_token.compiled_loop_token
         clt.loop_and_bridges.append(c)
         clt.compiling_a_bridge()
-        self._compile_loop_or_bridge(c, inputargs, operations)
+        self._compile_loop_or_bridge(c, inputargs, operations, clt)
         old, oldindex = faildescr._compiled_fail
         llimpl.compile_redirect_fail(old, oldindex, c)
 
-    def compile_loop(self, inputargs, operations, looptoken, log=True, name=''):
+    def compile_loop(self, inputargs, operations, jitcell_token,
+                     log=True, name=''):
         """In a real assembler backend, this should assemble the given
         list of operations.  Here we just generate a similar CompiledLoop
         instance.  The code here is RPython, whereas the code in llimpl
         is not.
         """
         c = llimpl.compile_start()
-        clt = model.CompiledLoopToken(self, looptoken.number)
+        clt = model.CompiledLoopToken(self, jitcell_token.number)
         clt.loop_and_bridges = [c]
         clt.compiled_version = c
-        looptoken.compiled_loop_token = clt
-        self._compile_loop_or_bridge(c, inputargs, operations)
+        jitcell_token.compiled_loop_token = clt
+        self._compile_loop_or_bridge(c, inputargs, operations, clt)
 
     def free_loop_and_bridges(self, compiled_loop_token):
         for c in compiled_loop_token.loop_and_bridges:
             llimpl.mark_as_free(c)
         model.AbstractCPU.free_loop_and_bridges(self, compiled_loop_token)
 
-    def _compile_loop_or_bridge(self, c, inputargs, operations):
+    def _compile_loop_or_bridge(self, c, inputargs, operations, clt):
         var2index = {}
         for box in inputargs:
             if isinstance(box, history.BoxInt):
@@ -172,10 +173,11 @@
                 var2index[box] = llimpl.compile_start_float_var(c)
             else:
                 raise Exception("box is: %r" % (box,))
-        self._compile_operations(c, operations, var2index)
+        llimpl.compile_started_vars(clt)
+        self._compile_operations(c, operations, var2index, clt)
         return c
 
-    def _compile_operations(self, c, operations, var2index):
+    def _compile_operations(self, c, operations, var2index, clt):
         for op in operations:
             llimpl.compile_add(c, op.getopnum())
             descr = op.getdescr()
@@ -183,9 +185,11 @@
                 llimpl.compile_add_descr(c, descr.ofs, descr.typeinfo,
                                          descr.arg_types, descr.extrainfo,
                                          descr.width)
-            if (isinstance(descr, history.LoopToken) and
-                op.getopnum() != rop.JUMP):
+            if isinstance(descr, history.JitCellToken):
+                assert op.getopnum() != rop.JUMP
                 llimpl.compile_add_loop_token(c, descr)
+            if isinstance(descr, history.TargetToken) and op.getopnum() == rop.LABEL:
+                llimpl.compile_add_target_token(c, descr, clt)
             if self.is_oo and isinstance(descr, (OODescr, MethDescr)):
                 # hack hack, not rpython
                 c._obj.externalobj.operations[-1].setdescr(descr)
@@ -239,9 +243,7 @@
         assert op.is_final()
         if op.getopnum() == rop.JUMP:
             targettoken = op.getdescr()
-            assert isinstance(targettoken, history.LoopToken)
-            compiled_version = targettoken.compiled_loop_token.compiled_version
-            llimpl.compile_add_jump_target(c, compiled_version)
+            llimpl.compile_add_jump_target(c, targettoken, clt)
         elif op.getopnum() == rop.FINISH:
             faildescr = op.getdescr()
             index = self.get_fail_descr_number(faildescr)
@@ -260,21 +262,28 @@
         self.latest_frame = frame
         return fail_index
 
-    def execute_token(self, loop_token):
-        """Calls the assembler generated for the given loop.
-        Returns the ResOperation that failed, of type rop.FAIL.
-        """
-        fail_index = self._execute_token(loop_token)
-        return self.get_fail_descr_from_number(fail_index)
-
-    def set_future_value_int(self, index, intvalue):
-        llimpl.set_future_value_int(index, intvalue)
-
-    def set_future_value_ref(self, index, objvalue):
-        llimpl.set_future_value_ref(index, objvalue)
-
-    def set_future_value_float(self, index, floatvalue):
-        llimpl.set_future_value_float(index, floatvalue)
+    def make_execute_token(self, *argtypes):
+        nb_args = len(argtypes)
+        unroll_argtypes = unrolling_iterable(list(enumerate(argtypes)))
+        #
+        def execute_token(loop_token, *args):
+            assert len(args) == nb_args
+            for index, TYPE in unroll_argtypes:
+                x = args[index]
+                assert TYPE == lltype.typeOf(x)
+                if TYPE == lltype.Signed:
+                    llimpl.set_future_value_int(index, x)
+                elif TYPE == llmemory.GCREF:
+                    llimpl.set_future_value_ref(index, x)
+                elif TYPE == longlong.FLOATSTORAGE:
+                    llimpl.set_future_value_float(index, x)
+                else:
+                    assert 0
+            #
+            fail_index = self._execute_token(loop_token)
+            return self.get_fail_descr_from_number(fail_index)
+        #
+        return execute_token
 
     def get_latest_value_int(self, index):
         return llimpl.frame_int_getvalue(self.latest_frame, index)
diff --git a/pypy/jit/backend/llsupport/regalloc.py b/pypy/jit/backend/llsupport/regalloc.py
--- a/pypy/jit/backend/llsupport/regalloc.py
+++ b/pypy/jit/backend/llsupport/regalloc.py
@@ -16,32 +16,106 @@
     """ Manage frame positions
     """
     def __init__(self):
-        self.frame_bindings = {}
-        self.frame_depth    = 0
+        self.bindings = {}
+        self.used = []      # list of bools
+        self.hint_frame_locations = {}
+
+    frame_depth = property(lambda:xxx, lambda:xxx)   # XXX kill me
+
+    def get_frame_depth(self):
+        return len(self.used)
 
     def get(self, box):
-        return self.frame_bindings.get(box, None)
+        return self.bindings.get(box, None)
 
     def loc(self, box):
-        res = self.get(box)
-        if res is not None:
-            return res
+        """Return or create the frame location associated with 'box'."""
+        # first check if it's already in the frame_manager
+        try:
+            return self.bindings[box]
+        except KeyError:
+            pass
+        # check if we have a hint for this box
+        if box in self.hint_frame_locations:
+            # if we do, try to reuse the location for this box
+            loc = self.hint_frame_locations[box]
+            if self.try_to_reuse_location(box, loc):
+                return loc
+        # no valid hint.  make up a new free location
+        return self.get_new_loc(box)
+
+    def get_new_loc(self, box):
         size = self.frame_size(box.type)
-        self.frame_depth += ((-self.frame_depth) & (size-1))
-        # ^^^ frame_depth is rounded up to a multiple of 'size', assuming
+        # frame_depth is rounded up to a multiple of 'size', assuming
         # that 'size' is a power of two.  The reason for doing so is to
         # avoid obscure issues in jump.py with stack locations that try
         # to move from position (6,7) to position (7,8).
-        newloc = self.frame_pos(self.frame_depth, box.type)
-        self.frame_bindings[box] = newloc
-        self.frame_depth += size
+        while self.get_frame_depth() & (size - 1):
+            self.used.append(False)
+        #
+        index = self.get_frame_depth()
+        newloc = self.frame_pos(index, box.type)
+        for i in range(size):
+            self.used.append(True)
+        #
+        if not we_are_translated():    # extra testing
+            testindex = self.get_loc_index(newloc)
+            assert testindex == index
+        #
+        self.bindings[box] = newloc
         return newloc
 
+    def set_binding(self, box, loc):
+        self.bindings[box] = loc
+        #
+        index = self.get_loc_index(loc)
+        if index < 0:
+            return
+        endindex = index + self.frame_size(box.type)
+        while len(self.used) < endindex:
+            self.used.append(False)
+        while index < endindex:
+            self.used[index] = True
+            index += 1
+
     def reserve_location_in_frame(self, size):
-        frame_depth = self.frame_depth
-        self.frame_depth += size
+        frame_depth = self.get_frame_depth()
+        for i in range(size):
+            self.used.append(True)
         return frame_depth
 
+    def mark_as_free(self, box):
+        try:
+            loc = self.bindings[box]
+        except KeyError:
+            return    # already gone
+        del self.bindings[box]
+        #
+        size = self.frame_size(box.type)
+        baseindex = self.get_loc_index(loc)
+        if baseindex < 0:
+            return
+        for i in range(size):
+            index = baseindex + i
+            assert 0 <= index < len(self.used)
+            self.used[index] = False
+
+    def try_to_reuse_location(self, box, loc):
+        index = self.get_loc_index(loc)
+        if index < 0:
+            return False
+        size = self.frame_size(box.type)
+        for i in range(size):
+            while (index + i) >= len(self.used):
+                self.used.append(False)
+            if self.used[index + i]:
+                return False    # already in use
+        # good, we can reuse the location
+        for i in range(size):
+            self.used[index + i] = True
+        self.bindings[box] = loc
+        return True
+
     # abstract methods that need to be overwritten for specific assemblers
     @staticmethod
     def frame_pos(loc, type):
@@ -49,6 +123,10 @@
     @staticmethod
     def frame_size(type):
         return 1
+    @staticmethod
+    def get_loc_index(loc):
+        raise NotImplementedError("Purely abstract")
+
 
 class RegisterManager(object):
     """ Class that keeps track of register allocations
@@ -68,7 +146,14 @@
         self.frame_manager = frame_manager
         self.assembler = assembler
 
+    def is_still_alive(self, v):
+        # Check if 'v' is alive at the current position.
+        # Return False if the last usage is strictly before.
+        return self.longevity[v][1] >= self.position
+
     def stays_alive(self, v):
+        # Check if 'v' stays alive after the current position.
+        # Return False if the last usage is before or at position.
         return self.longevity[v][1] > self.position
 
     def next_instruction(self, incr=1):
@@ -84,11 +169,14 @@
             point for all variables that might be in registers.
         """
         self._check_type(v)
-        if isinstance(v, Const) or v not in self.reg_bindings:
+        if isinstance(v, Const):
             return
         if v not in self.longevity or self.longevity[v][1] <= self.position:
-            self.free_regs.append(self.reg_bindings[v])
-            del self.reg_bindings[v]
+            if v in self.reg_bindings:
+                self.free_regs.append(self.reg_bindings[v])
+                del self.reg_bindings[v]
+            if self.frame_manager is not None:
+                self.frame_manager.mark_as_free(v)
 
     def possibly_free_vars(self, vars):
         """ Same as 'possibly_free_var', but for all v in vars.
diff --git a/pypy/jit/backend/llsupport/test/test_regalloc.py b/pypy/jit/backend/llsupport/test/test_regalloc.py
--- a/pypy/jit/backend/llsupport/test/test_regalloc.py
+++ b/pypy/jit/backend/llsupport/test/test_regalloc.py
@@ -42,8 +42,13 @@
     def frame_size(self, box_type):
         if box_type == FLOAT:
             return 2
+        elif box_type == INT:
+            return 1
         else:
-            return 1
+            raise ValueError(box_type)
+    def get_loc_index(self, loc):
+        assert isinstance(loc, FakeFramePos)
+        return loc.pos
 
 class MockAsm(object):
     def __init__(self):
@@ -282,7 +287,7 @@
             rm.force_allocate_reg(b)
         rm.before_call()
         assert len(rm.reg_bindings) == 2
-        assert fm.frame_depth == 2
+        assert fm.get_frame_depth() == 2
         assert len(asm.moves) == 2
         rm._check_invariants()
         rm.after_call(boxes[-1])
@@ -305,7 +310,7 @@
             rm.force_allocate_reg(b)
         rm.before_call(save_all_regs=True)
         assert len(rm.reg_bindings) == 0
-        assert fm.frame_depth == 4
+        assert fm.get_frame_depth() == 4
         assert len(asm.moves) == 4
         rm._check_invariants()
         rm.after_call(boxes[-1])
@@ -327,7 +332,7 @@
         xrm = XRegisterManager(longevity, frame_manager=fm, assembler=asm)
         xrm.loc(f0)
         rm.loc(b0)
-        assert fm.frame_depth == 3
+        assert fm.get_frame_depth() == 3
         
         
 
@@ -348,3 +353,123 @@
         spilled2 = rm.force_allocate_reg(b5)
         assert spilled2 is loc
         rm._check_invariants()
+
+
+    def test_hint_frame_locations_1(self):
+        b0, = newboxes(0)
+        fm = TFrameManager()
+        loc123 = FakeFramePos(123, INT)
+        fm.hint_frame_locations[b0] = loc123
+        assert fm.get_frame_depth() == 0
+        loc = fm.loc(b0)
+        assert loc == loc123
+        assert fm.get_frame_depth() == 124
+
+    def test_hint_frame_locations_2(self):
+        b0, b1, b2 = newboxes(0, 1, 2)
+        longevity = {b0: (0, 1), b1: (0, 2), b2: (0, 2)}
+        fm = TFrameManager()
+        asm = MockAsm()
+        rm = RegisterManager(longevity, frame_manager=fm, assembler=asm)
+        rm.force_allocate_reg(b0)
+        rm.force_allocate_reg(b1)
+        rm.force_allocate_reg(b2)
+        rm.force_spill_var(b0)
+        loc = rm.loc(b0)
+        assert isinstance(loc, FakeFramePos)
+        assert fm.get_loc_index(loc) == 0
+        rm.position = 1
+        assert fm.used == [True]
+        rm.possibly_free_var(b0)
+        assert fm.used == [False]
+        #
+        fm.hint_frame_locations[b1] = loc
+        rm.force_spill_var(b1)
+        loc1 = rm.loc(b1)
+        assert loc1 == loc
+        assert fm.used == [True]
+        #
+        fm.hint_frame_locations[b2] = loc
+        rm.force_spill_var(b2)
+        loc2 = rm.loc(b2)
+        assert loc2 != loc1     # because it was not free
+        assert fm.used == [True, True]
+        #
+        rm._check_invariants()
+
+    def test_frame_manager_basic(self):
+        b0, b1 = newboxes(0, 1)
+        fm = TFrameManager()
+        loc0 = fm.loc(b0)
+        assert fm.get_loc_index(loc0) == 0
+        #
+        assert fm.get(b1) is None
+        loc1 = fm.loc(b1)
+        assert fm.get_loc_index(loc1) == 1
+        assert fm.get(b1) == loc1
+        #
+        loc0b = fm.loc(b0)
+        assert loc0b == loc0
+        #
+        fm.loc(BoxInt())
+        assert fm.get_frame_depth() == 3
+        #
+        f0 = BoxFloat()
+        locf0 = fm.loc(f0)
+        assert fm.get_loc_index(locf0) == 4
+        assert fm.get_frame_depth() == 6
+        #
+        f1 = BoxFloat()
+        locf1 = fm.loc(f1)
+        assert fm.get_loc_index(locf1) == 6
+        assert fm.get_frame_depth() == 8
+        assert fm.used == [True, True, True, False, True, True, True, True]
+        #
+        fm.mark_as_free(b0)
+        assert fm.used == [False, True, True, False, True, True, True, True]
+        fm.mark_as_free(b0)
+        assert fm.used == [False, True, True, False, True, True, True, True]
+        fm.mark_as_free(f1)
+        assert fm.used == [False, True, True, False, True, True, False, False]
+        #
+        fm.reserve_location_in_frame(1)
+        assert fm.get_frame_depth() == 9
+        assert fm.used == [False, True, True, False, True, True, False, False, True]
+        #
+        assert b0 not in fm.bindings
+        fm.set_binding(b0, loc0)
+        assert b0 in fm.bindings
+        assert fm.used == [True, True, True, False, True, True, False, False, True]
+        #
+        b3 = BoxInt()
+        assert not fm.try_to_reuse_location(b3, loc0)
+        assert fm.used == [True, True, True, False, True, True, False, False, True]
+        #
+        fm.mark_as_free(b0)
+        assert fm.used == [False, True, True, False, True, True, False, False, True]
+        assert fm.try_to_reuse_location(b3, loc0)
+        assert fm.used == [True, True, True, False, True, True, False, False, True]
+        #
+        fm.mark_as_free(b0)   # already free
+        assert fm.used == [True, True, True, False, True, True, False, False, True]
+        #
+        fm.mark_as_free(b3)
+        assert fm.used == [False, True, True, False, True, True, False, False, True]
+        f3 = BoxFloat()
+        assert not fm.try_to_reuse_location(f3, fm.frame_pos(0, FLOAT))
+        assert not fm.try_to_reuse_location(f3, fm.frame_pos(1, FLOAT))
+        assert not fm.try_to_reuse_location(f3, fm.frame_pos(2, FLOAT))
+        assert not fm.try_to_reuse_location(f3, fm.frame_pos(3, FLOAT))
+        assert not fm.try_to_reuse_location(f3, fm.frame_pos(4, FLOAT))
+        assert not fm.try_to_reuse_location(f3, fm.frame_pos(5, FLOAT))
+        assert fm.used == [False, True, True, False, True, True, False, False, True]
+        assert fm.try_to_reuse_location(f3, fm.frame_pos(6, FLOAT))
+        assert fm.used == [False, True, True, False, True, True, True, True, True]
+        #
+        fm.used = [False]
+        assert fm.try_to_reuse_location(BoxFloat(), fm.frame_pos(0, FLOAT))
+        assert fm.used == [True, True]
+        #
+        fm.used = [True]
+        assert not fm.try_to_reuse_location(BoxFloat(), fm.frame_pos(0, FLOAT))
+        assert fm.used == [True]
diff --git a/pypy/jit/backend/model.py b/pypy/jit/backend/model.py
--- a/pypy/jit/backend/model.py
+++ b/pypy/jit/backend/model.py
@@ -1,5 +1,6 @@
 from pypy.rlib.debug import debug_start, debug_print, debug_stop
 from pypy.jit.metainterp import history
+from pypy.rpython.lltypesystem import lltype
 
 
 class AbstractCPU(object):
@@ -84,24 +85,21 @@
         """Print a disassembled version of looptoken to stdout"""
         raise NotImplementedError
 
-    def execute_token(self, looptoken):
-        """Execute the generated code referenced by the looptoken.
+    def execute_token(self, looptoken, *args):
+        """NOT_RPYTHON (for tests only)
+        Execute the generated code referenced by the looptoken.
         Returns the descr of the last executed operation: either the one
         attached to the failing guard, or the one attached to the FINISH.
-        Use set_future_value_xxx() before, and get_latest_value_xxx() after.
+        Use get_latest_value_xxx() afterwards to read the result(s).
         """
-        raise NotImplementedError
+        argtypes = [lltype.typeOf(x) for x in args]
+        execute = self.make_execute_token(*argtypes)
+        return execute(looptoken, *args)
 
-    def set_future_value_int(self, index, intvalue):
-        """Set the value for the index'th argument for the loop to run."""
-        raise NotImplementedError
-
-    def set_future_value_float(self, index, floatvalue):
-        """Set the value for the index'th argument for the loop to run."""
-        raise NotImplementedError
-
-    def set_future_value_ref(self, index, objvalue):
-        """Set the value for the index'th argument for the loop to run."""
+    def make_execute_token(self, *argtypes):
+        """Must make and return an execute_token() function that will be
+        called with the given argtypes.
+        """
         raise NotImplementedError
 
     def get_latest_value_int(self, index):
diff --git a/pypy/jit/backend/test/calling_convention_test.py b/pypy/jit/backend/test/calling_convention_test.py
--- a/pypy/jit/backend/test/calling_convention_test.py
+++ b/pypy/jit/backend/test/calling_convention_test.py
@@ -2,7 +2,7 @@
                                          AbstractDescr,
                                          BasicFailDescr,
                                          BoxInt, Box, BoxPtr,
-                                         LoopToken,
+                                         JitCellToken,
                                          ConstInt, ConstPtr,
                                          BoxObj, Const,
                                          ConstObj, BoxFloat, ConstFloat)
@@ -40,17 +40,18 @@
         local_floats = list(floats)
         local_ints = list(ints)
         expected_result = 0.0
+        arguments = []
         for i in range(len(args)):
             x = args[i]
             if x[0] == 'f':
                 x = local_floats.pop()
                 t = longlong.getfloatstorage(x)
-                self.cpu.set_future_value_float(i, t)
+                arguments.append(t)
             else:
                 x = local_ints.pop()
-                self.cpu.set_future_value_int(i, x)
+                arguments.append(x)
             expected_result += x
-        return expected_result
+        return arguments, expected_result
 
     @classmethod
     def get_funcbox(cls, cpu, func_ptr):
@@ -107,12 +108,12 @@
             ops += 'finish(f99, %s)\n' % arguments
 
             loop = parse(ops, namespace=locals())
-            looptoken = LoopToken()
+            looptoken = JitCellToken()
             done_number = self.cpu.get_fail_descr_number(loop.operations[-1].getdescr())
             self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-            expected_result = self._prepare_args(args, floats, ints)
+            argvals, expected_result = self._prepare_args(args, floats, ints)
 
-            res = self.cpu.execute_token(looptoken)
+            res = self.cpu.execute_token(looptoken, *argvals)
             x = longlong.getrealfloat(cpu.get_latest_value_float(0))
             assert abs(x - expected_result) < 0.0001
 
@@ -253,13 +254,13 @@
             called_ops += 'finish(f%d, descr=fdescr3)\n' % total_index
             # compile called loop
             called_loop = parse(called_ops, namespace=locals())
-            called_looptoken = LoopToken()
+            called_looptoken = JitCellToken()
             called_looptoken.outermost_jitdriver_sd = FakeJitDriverSD()
             done_number = self.cpu.get_fail_descr_number(called_loop.operations[-1].getdescr())
             self.cpu.compile_loop(called_loop.inputargs, called_loop.operations, called_looptoken)
 
-            expected_result = self._prepare_args(args, floats, ints)
-            res = cpu.execute_token(called_looptoken)
+            argvals, expected_result = self._prepare_args(args, floats, ints)
+            res = cpu.execute_token(called_looptoken, *argvals)
             assert res.identifier == 3
             t = longlong.getrealfloat(cpu.get_latest_value_float(0))
             assert abs(t - expected_result) < 0.0001
@@ -284,12 +285,12 @@
             # we want to take the fast path
             self.cpu.done_with_this_frame_float_v = done_number
             try:
-                othertoken = LoopToken()
+                othertoken = JitCellToken()
                 self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
 
                 # prepare call to called_loop
-                self._prepare_args(args, floats, ints)
-                res = cpu.execute_token(othertoken)
+                argvals, _ = self._prepare_args(args, floats, ints)
+                res = cpu.execute_token(othertoken, *argvals)
                 x = longlong.getrealfloat(cpu.get_latest_value_float(0))
                 assert res.identifier == 4
                 assert abs(x - expected_result) < 0.0001
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -3,7 +3,7 @@
                                          AbstractDescr,
                                          BasicFailDescr,
                                          BoxInt, Box, BoxPtr,
-                                         LoopToken,
+                                         JitCellToken, TargetToken,
                                          ConstInt, ConstPtr,
                                          BoxObj,
                                          ConstObj, BoxFloat, ConstFloat)
@@ -32,22 +32,19 @@
                                                                 result_type,
                                                                 valueboxes,
                                                                 descr)
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        j = 0
+        args = []
         for box in inputargs:
             if isinstance(box, BoxInt):
-                self.cpu.set_future_value_int(j, box.getint())
-                j += 1
+                args.append(box.getint())
             elif isinstance(box, (BoxPtr, BoxObj)):
-                self.cpu.set_future_value_ref(j, box.getref_base())
-                j += 1
+                args.append(box.getref_base())
             elif isinstance(box, BoxFloat):
-                self.cpu.set_future_value_float(j, box.getfloatstorage())
-                j += 1
+                args.append(box.getfloatstorage())
             else:
                 raise NotImplementedError(box)
-        res = self.cpu.execute_token(looptoken)
+        res = self.cpu.execute_token(looptoken, *args)
         if res is operations[-1].getdescr():
             self.guard_failed = False
         else:
@@ -106,10 +103,9 @@
             ResOperation(rop.FINISH, [i1], None, descr=BasicFailDescr(1))
             ]
         inputargs = [i0]
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         res = self.cpu.get_latest_value_int(0)
         assert res == 3
         assert fail.identifier == 1
@@ -118,19 +114,20 @@
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
+        targettoken = TargetToken()
         operations = [
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken),
             ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
             ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
             ResOperation(rop.GUARD_TRUE, [i2], None, descr=BasicFailDescr(2)),
-            ResOperation(rop.JUMP, [i1], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken),
             ]
         inputargs = [i0]
-        operations[2].setfailargs([i1])
+        operations[3].setfailargs([i1])
 
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 10
@@ -139,19 +136,22 @@
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
-        looptoken = LoopToken()
+        i3 = BoxInt()
+        looptoken = JitCellToken()
+        targettoken = TargetToken()
         operations = [
+            ResOperation(rop.INT_SUB, [i3, ConstInt(42)], i0),
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken),
             ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
             ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
             ResOperation(rop.GUARD_TRUE, [i2], None, descr=BasicFailDescr(2)),
-            ResOperation(rop.JUMP, [i1], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken),
             ]
-        inputargs = [i0]
-        operations[2].setfailargs([None, None, i1, None])
+        inputargs = [i3]
+        operations[4].setfailargs([None, None, i1, None])
 
         self.cpu.compile_loop(inputargs, operations, looptoken)
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 44)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(2)
         assert res == 10
@@ -162,15 +162,17 @@
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
+        targettoken = TargetToken()
         operations = [
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken),
             ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
             ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
             ResOperation(rop.GUARD_TRUE, [i2], None, descr=BasicFailDescr()),
-            ResOperation(rop.JUMP, [i1], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken),
             ]
         inputargs = [i0]
-        operations[2].setfailargs([i1])
+        operations[3].setfailargs([i1])
         wr_i1 = weakref.ref(i1)
         wr_guard = weakref.ref(operations[2])
         self.cpu.compile_loop(inputargs, operations, looptoken)
@@ -190,15 +192,17 @@
         i2 = BoxInt()
         faildescr1 = BasicFailDescr(1)
         faildescr2 = BasicFailDescr(2)
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
+        targettoken = TargetToken()
         operations = [
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken),
             ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
             ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
             ResOperation(rop.GUARD_TRUE, [i2], None, descr=faildescr1),
-            ResOperation(rop.JUMP, [i1], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken),
             ]
         inputargs = [i0]
-        operations[2].setfailargs([i1])
+        operations[3].setfailargs([i1])
         self.cpu.compile_loop(inputargs, operations, looptoken)
 
         i1b = BoxInt()
@@ -206,14 +210,13 @@
         bridge = [
             ResOperation(rop.INT_LE, [i1b, ConstInt(19)], i3),
             ResOperation(rop.GUARD_TRUE, [i3], None, descr=faildescr2),
-            ResOperation(rop.JUMP, [i1b], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1b], None, descr=targettoken),
         ]
         bridge[1].setfailargs([i1b])
 
         self.cpu.compile_bridge(faildescr1, [i1b], bridge, looptoken)
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 20
@@ -226,17 +229,21 @@
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
+        i3 = BoxInt()
         faildescr1 = BasicFailDescr(1)
         faildescr2 = BasicFailDescr(2)
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
+        targettoken = TargetToken()
         operations = [
+            ResOperation(rop.INT_SUB, [i3, ConstInt(42)], i0),
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken),
             ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
             ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
             ResOperation(rop.GUARD_TRUE, [i2], None, descr=faildescr1),
-            ResOperation(rop.JUMP, [i1], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken),
             ]
-        inputargs = [i0]
-        operations[2].setfailargs([None, i1, None])
+        inputargs = [i3]
+        operations[4].setfailargs([None, i1, None])
         self.cpu.compile_loop(inputargs, operations, looptoken)
 
         i1b = BoxInt()
@@ -244,14 +251,13 @@
         bridge = [
             ResOperation(rop.INT_LE, [i1b, ConstInt(19)], i3),
             ResOperation(rop.GUARD_TRUE, [i3], None, descr=faildescr2),
-            ResOperation(rop.JUMP, [i1b], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1b], None, descr=targettoken),
         ]
         bridge[1].setfailargs([i1b])
 
         self.cpu.compile_bridge(faildescr1, [i1b], bridge, looptoken)
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 20
@@ -261,19 +267,20 @@
         i1 = BoxInt()
         i2 = BoxInt()
         faildescr1 = BasicFailDescr(1)
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
+        targettoken = TargetToken()
         operations = [
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken),
             ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
             ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
             ResOperation(rop.GUARD_TRUE, [i2], None, descr=faildescr1),
-            ResOperation(rop.JUMP, [i1], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken),
             ]
         inputargs = [i0]
-        operations[2].setfailargs([None, i1, None])
+        operations[3].setfailargs([None, i1, None])
         self.cpu.compile_loop(inputargs, operations, looptoken)
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail is faildescr1
 
         count = self.cpu.get_latest_value_count()
@@ -290,18 +297,17 @@
                     return AbstractFailDescr.__setattr__(self, name, value)
                 py.test.fail("finish descrs should not be touched")
         faildescr = UntouchableFailDescr() # to check that is not touched
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         operations = [
             ResOperation(rop.FINISH, [i0], None, descr=faildescr)
             ]
         self.cpu.compile_loop([i0], operations, looptoken)
-        self.cpu.set_future_value_int(0, 99)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 99)
         assert fail is faildescr
         res = self.cpu.get_latest_value_int(0)
         assert res == 99
 
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         operations = [
             ResOperation(rop.FINISH, [ConstInt(42)], None, descr=faildescr)
             ]
@@ -311,7 +317,7 @@
         res = self.cpu.get_latest_value_int(0)
         assert res == 42
 
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         operations = [
             ResOperation(rop.FINISH, [], None, descr=faildescr)
             ]
@@ -320,20 +326,19 @@
         assert fail is faildescr
 
         if self.cpu.supports_floats:
-            looptoken = LoopToken()
+            looptoken = JitCellToken()
             f0 = BoxFloat()
             operations = [
                 ResOperation(rop.FINISH, [f0], None, descr=faildescr)
                 ]
             self.cpu.compile_loop([f0], operations, looptoken)
             value = longlong.getfloatstorage(-61.25)
-            self.cpu.set_future_value_float(0, value)
-            fail = self.cpu.execute_token(looptoken)
+            fail = self.cpu.execute_token(looptoken, value)
             assert fail is faildescr
             res = self.cpu.get_latest_value_float(0)
             assert longlong.getrealfloat(res) == -61.25
 
-            looptoken = LoopToken()
+            looptoken = JitCellToken()
             operations = [
                 ResOperation(rop.FINISH, [constfloat(42.5)], None, descr=faildescr)
                 ]
@@ -350,20 +355,20 @@
         z = BoxInt(579)
         t = BoxInt(455)
         u = BoxInt(0)    # False
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
+        targettoken = TargetToken()
         operations = [
+            ResOperation(rop.LABEL, [y, x], None, descr=targettoken),
             ResOperation(rop.INT_ADD, [x, y], z),
             ResOperation(rop.INT_SUB, [y, ConstInt(1)], t),
             ResOperation(rop.INT_EQ, [t, ConstInt(0)], u),
             ResOperation(rop.GUARD_FALSE, [u], None,
                          descr=BasicFailDescr()),
-            ResOperation(rop.JUMP, [z, t], None, descr=looptoken),
+            ResOperation(rop.JUMP, [t, z], None, descr=targettoken),
             ]
         operations[-2].setfailargs([t, z])
         cpu.compile_loop([x, y], operations, looptoken)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 10)
-        res = self.cpu.execute_token(looptoken)
+        res = self.cpu.execute_token(looptoken, 0, 10)
         assert self.cpu.get_latest_value_int(0) == 0
         assert self.cpu.get_latest_value_int(1) == 55
 
@@ -419,14 +424,12 @@
                     ]
                 ops[1].setfailargs([v_res])
             #
-            looptoken = LoopToken()
+            looptoken = JitCellToken()
             self.cpu.compile_loop([v1, v2], ops, looptoken)
             for x, y, z in testcases:
                 excvalue = self.cpu.grab_exc_value()
                 assert not excvalue
-                self.cpu.set_future_value_int(0, x)
-                self.cpu.set_future_value_int(1, y)
-                fail = self.cpu.execute_token(looptoken)
+                fail = self.cpu.execute_token(looptoken, x, y)
                 if (z == boom) ^ reversed:
                     assert fail.identifier == 1
                 else:
@@ -1082,16 +1085,18 @@
             inputargs.insert(index_counter, i0)
             jumpargs.insert(index_counter, i1)
             #
-            looptoken = LoopToken()
+            looptoken = JitCellToken()
+            targettoken = TargetToken()
             faildescr = BasicFailDescr(15)
             operations = [
+                ResOperation(rop.LABEL, inputargs, None, descr=targettoken),
                 ResOperation(rop.INT_SUB, [i0, ConstInt(1)], i1),
                 ResOperation(rop.INT_GE, [i1, ConstInt(0)], i2),
                 ResOperation(rop.GUARD_TRUE, [i2], None),
-                ResOperation(rop.JUMP, jumpargs, None, descr=looptoken),
+                ResOperation(rop.JUMP, jumpargs, None, descr=targettoken),
                 ]
-            operations[2].setfailargs(inputargs[:])
-            operations[2].setdescr(faildescr)
+            operations[3].setfailargs(inputargs[:])
+            operations[3].setdescr(faildescr)
             #
             self.cpu.compile_loop(inputargs, operations, looptoken)
             #
@@ -1109,17 +1114,7 @@
                     assert 0
             values[index_counter] = 11
             #
-            for i, (box, val) in enumerate(zip(inputargs, values)):
-                if isinstance(box, BoxInt):
-                    self.cpu.set_future_value_int(i, val)
-                elif isinstance(box, BoxPtr):
-                    self.cpu.set_future_value_ref(i, val)
-                elif isinstance(box, BoxFloat):
-                    self.cpu.set_future_value_float(i, val)
-                else:
-                    assert 0
-            #
-            fail = self.cpu.execute_token(looptoken)
+            fail = self.cpu.execute_token(looptoken, *values)
             assert fail.identifier == 15
             #
             dstvalues = values[:]
@@ -1149,30 +1144,33 @@
             py.test.skip("requires floats")
         fboxes = [BoxFloat() for i in range(12)]
         i2 = BoxInt()
+        targettoken = TargetToken()
         faildescr1 = BasicFailDescr(1)
         faildescr2 = BasicFailDescr(2)
         operations = [
+            ResOperation(rop.LABEL, fboxes, None, descr=targettoken),
             ResOperation(rop.FLOAT_LE, [fboxes[0], constfloat(9.2)], i2),
             ResOperation(rop.GUARD_TRUE, [i2], None, descr=faildescr1),
             ResOperation(rop.FINISH, fboxes, None, descr=faildescr2),
             ]
         operations[-2].setfailargs(fboxes)
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop(fboxes, operations, looptoken)
 
         fboxes2 = [BoxFloat() for i in range(12)]
         f3 = BoxFloat()
         bridge = [
             ResOperation(rop.FLOAT_SUB, [fboxes2[0], constfloat(1.0)], f3),
-            ResOperation(rop.JUMP, [f3] + fboxes2[1:], None, descr=looptoken),
+            ResOperation(rop.JUMP, [f3]+fboxes2[1:], None, descr=targettoken),
         ]
 
         self.cpu.compile_bridge(faildescr1, fboxes2, bridge, looptoken)
 
+        args = []
         for i in range(len(fboxes)):
             x = 13.5 + 6.73 * i
-            self.cpu.set_future_value_float(i, longlong.getfloatstorage(x))
-        fail = self.cpu.execute_token(looptoken)
+            args.append(longlong.getfloatstorage(x))
+        fail = self.cpu.execute_token(looptoken, *args)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(res) == 8.5
@@ -1214,7 +1212,7 @@
                         ResOperation(rop.FINISH, [], None, descr=faildescr2),
                         ]
                     operations[-2].setfailargs([])
-                    looptoken = LoopToken()
+                    looptoken = JitCellToken()
                     self.cpu.compile_loop(inputargs, operations, looptoken)
                     #
                     cpu = self.cpu
@@ -1222,14 +1220,12 @@
                         if test1 == -42 or combinaison[0] == 'b':
                             for test2 in [-65, -42, -11]:
                                 if test2 == -42 or combinaison[1] == 'b':
-                                    n = 0
+                                    args = []
                                     if combinaison[0] == 'b':
-                                        cpu.set_future_value_int(n, test1)
-                                        n += 1
+                                        args.append(test1)
                                     if combinaison[1] == 'b':
-                                        cpu.set_future_value_int(n, test2)
-                                        n += 1
-                                    fail = cpu.execute_token(looptoken)
+                                        args.append(test2)
+                                    fail = cpu.execute_token(looptoken, *args)
                                     #
                                     expected = compare(test1, test2)
                                     expected ^= guard_case
@@ -1271,7 +1267,7 @@
                         ResOperation(rop.FINISH, [], None, descr=faildescr2),
                         ]
                     operations[-2].setfailargs([])
-                    looptoken = LoopToken()
+                    looptoken = JitCellToken()
                     self.cpu.compile_loop(inputargs, operations, looptoken)
                     #
                     cpu = self.cpu
@@ -1281,16 +1277,14 @@
                         if test1 == -4.5 or combinaison[0] == 'b':
                             for test2 in [-6.5, -4.5, -2.5, nan]:
                                 if test2 == -4.5 or combinaison[1] == 'b':
-                                    n = 0
+                                    args = []
                                     if combinaison[0] == 'b':
-                                        cpu.set_future_value_float(
-                                            n, longlong.getfloatstorage(test1))
-                                        n += 1
+                                        args.append(
+                                            longlong.getfloatstorage(test1))
                                     if combinaison[1] == 'b':
-                                        cpu.set_future_value_float(
-                                            n, longlong.getfloatstorage(test2))
-                                        n += 1
-                                    fail = cpu.execute_token(looptoken)
+                                        args.append(
+                                            longlong.getfloatstorage(test2))
+                                    fail = cpu.execute_token(looptoken, *args)
                                     #
                                     expected = compare(test1, test2)
                                     expected ^= guard_case
@@ -1330,19 +1324,20 @@
         faildescr = BasicFailDescr(1)
         operations.append(ResOperation(rop.FINISH, [], None,
                                        descr=faildescr))
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         #
         self.cpu.compile_loop(inputargs, operations, looptoken)
         #
-        for i, box in enumerate(inputargs):
+        args = []
+        for box in inputargs:
             if isinstance(box, BoxInt):
-                self.cpu.set_future_value_int(i, box.getint())
+                args.append(box.getint())
             elif isinstance(box, BoxFloat):
-                self.cpu.set_future_value_float(i, box.getfloatstorage())
+                args.append(box.getfloatstorage())
             else:
                 assert 0
         #
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, *args)
         assert fail.identifier == 1
 
     def test_nan_and_infinity(self):
@@ -1400,15 +1395,14 @@
                             ResOperation(rop.FINISH, [], None,
                                          descr=BasicFailDescr(5))]
                         operations[1].setfailargs([])
-                        looptoken = LoopToken()
+                        looptoken = JitCellToken()
                         # Use "set" to unique-ify inputargs
                         unique_testcase_list = list(set(testcase))
                         self.cpu.compile_loop(unique_testcase_list, operations,
                                               looptoken)
-                        for i, box in enumerate(unique_testcase_list):
-                            self.cpu.set_future_value_float(
-                                i, box.getfloatstorage())
-                        fail = self.cpu.execute_token(looptoken)
+                        args = [box.getfloatstorage()
+                                for box in unique_testcase_list]
+                        fail = self.cpu.execute_token(looptoken, *args)
                         if fail.identifier != 5 - (expected_id^expected):
                             if fail.identifier == 4:
                                 msg = "was taken"
@@ -1675,15 +1669,14 @@
         exc_tp = xtp
         exc_ptr = xptr
         loop = parse(ops, self.cpu, namespace=locals())
-        self.cpu.compile_loop(loop.inputargs, loop.operations, loop.token)
-        self.cpu.set_future_value_int(0, 1)
-        self.cpu.execute_token(loop.token)
+        looptoken = JitCellToken()
+        self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
+        self.cpu.execute_token(looptoken, 1)
         assert self.cpu.get_latest_value_int(0) == 0
         assert self.cpu.get_latest_value_ref(1) == xptr
         excvalue = self.cpu.grab_exc_value()
         assert not excvalue
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.execute_token(loop.token)
+        self.cpu.execute_token(looptoken, 0)
         assert self.cpu.get_latest_value_int(0) == 1
         excvalue = self.cpu.grab_exc_value()
         assert not excvalue
@@ -1700,9 +1693,9 @@
         exc_tp = ytp
         exc_ptr = yptr
         loop = parse(ops, self.cpu, namespace=locals())
-        self.cpu.compile_loop(loop.inputargs, loop.operations, loop.token)
-        self.cpu.set_future_value_int(0, 1)
-        self.cpu.execute_token(loop.token)
+        looptoken = JitCellToken()
+        self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
+        self.cpu.execute_token(looptoken, 1)
         assert self.cpu.get_latest_value_int(0) == 1
         excvalue = self.cpu.grab_exc_value()
         assert excvalue == yptr
@@ -1718,14 +1711,13 @@
         finish(0)
         '''
         loop = parse(ops, self.cpu, namespace=locals())
-        self.cpu.compile_loop(loop.inputargs, loop.operations, loop.token)
-        self.cpu.set_future_value_int(0, 1)
-        self.cpu.execute_token(loop.token)
+        looptoken = JitCellToken()
+        self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
+        self.cpu.execute_token(looptoken, 1)
         assert self.cpu.get_latest_value_int(0) == 1
         excvalue = self.cpu.grab_exc_value()
         assert excvalue == xptr
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.execute_token(loop.token)
+        self.cpu.execute_token(looptoken, 0)
         assert self.cpu.get_latest_value_int(0) == 0
         excvalue = self.cpu.grab_exc_value()
         assert not excvalue
@@ -1895,18 +1887,14 @@
         ResOperation(rop.FINISH, [i0], None, descr=BasicFailDescr(0))
         ]
         ops[2].setfailargs([i1, i0])
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1], ops, looptoken)
-        self.cpu.set_future_value_int(0, 20)
-        self.cpu.set_future_value_int(1, 0)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 20, 0)
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == 20
         assert values == []
 
-        self.cpu.set_future_value_int(0, 10)
-        self.cpu.set_future_value_int(1, 1)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 10, 1)
         assert fail.identifier == 1
         assert self.cpu.get_latest_value_int(0) == 1
         assert self.cpu.get_latest_value_int(1) == 10
@@ -1940,18 +1928,14 @@
         ResOperation(rop.FINISH, [i2], None, descr=BasicFailDescr(0))
         ]
         ops[2].setfailargs([i1, i2, i0])
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1], ops, looptoken)
-        self.cpu.set_future_value_int(0, 20)
-        self.cpu.set_future_value_int(1, 0)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 20, 0)
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == 42
         assert values == []
 
-        self.cpu.set_future_value_int(0, 10)
-        self.cpu.set_future_value_int(1, 1)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 10, 1)
         assert fail.identifier == 1
         assert self.cpu.get_latest_value_int(0) == 1
         assert self.cpu.get_latest_value_int(1) == 42
@@ -1986,19 +1970,15 @@
         ResOperation(rop.FINISH, [f2], None, descr=BasicFailDescr(0))
         ]
         ops[2].setfailargs([i1, f2, i0])
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1], ops, looptoken)
-        self.cpu.set_future_value_int(0, 20)
-        self.cpu.set_future_value_int(1, 0)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 20, 0)
         assert fail.identifier == 0
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 42.5
         assert values == []
 
-        self.cpu.set_future_value_int(0, 10)
-        self.cpu.set_future_value_int(1, 1)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 10, 1)
         assert fail.identifier == 1
         assert self.cpu.get_latest_value_int(0) == 1
         x = self.cpu.get_latest_value_float(1)
@@ -2031,10 +2011,9 @@
         ResOperation(rop.FINISH, [i2], None, descr=BasicFailDescr(0))
         ]
         ops[1].setfailargs([i1, i2])
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop([i1], ops, looptoken)
-        self.cpu.set_future_value_int(0, ord('G'))
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, ord('G'))
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == ord('g')
 
@@ -2091,14 +2070,14 @@
         ResOperation(rop.FINISH, [], None, descr=BasicFailDescr(0))
         ]
         ops[1].setfailargs([])
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1, i2, i3], ops, looptoken)
-        self.cpu.set_future_value_int(0, rffi.cast(lltype.Signed, raw))
-        self.cpu.set_future_value_int(1, 2)
-        self.cpu.set_future_value_int(2, 4)
-        self.cpu.set_future_value_int(3, rffi.cast(lltype.Signed, fn))
+        args = [rffi.cast(lltype.Signed, raw),
+                2,
+                4,
+                rffi.cast(lltype.Signed, fn)]
         assert glob.lst == []
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, *args)
         assert fail.identifier == 0
         assert len(glob.lst) > 0
         lltype.free(raw, flavor='raw')
@@ -2147,13 +2126,12 @@
         ops += [
             ResOperation(rop.FINISH, [i3], None, descr=BasicFailDescr(0))
         ]
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop([i1, i2], ops, looptoken)
 
         buffer = lltype.malloc(rffi.CCHARP.TO, buflen, flavor='raw')
-        self.cpu.set_future_value_int(0, buflen)
-        self.cpu.set_future_value_int(1, rffi.cast(lltype.Signed, buffer))
-        fail = self.cpu.execute_token(looptoken)
+        args = [buflen, rffi.cast(lltype.Signed, buffer)]
+        fail = self.cpu.execute_token(looptoken, *args)
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == len(cwd)
         assert rffi.charp2strn(buffer, buflen) == cwd
@@ -2169,12 +2147,10 @@
             ResOperation(rop.FINISH, [i0], None, descr=BasicFailDescr(0))
         ]
         ops[0].setfailargs([i1])
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop([i0, i1], ops, looptoken)
 
-        self.cpu.set_future_value_int(0, -42)
-        self.cpu.set_future_value_int(1, 9)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, -42, 9)
         assert fail.identifier == 0
         assert self.cpu.get_latest_value_int(0) == -42
         print 'step 1 ok'
@@ -2183,9 +2159,7 @@
         # mark as failing
         self.cpu.invalidate_loop(looptoken)
 
-        self.cpu.set_future_value_int(0, -42)
-        self.cpu.set_future_value_int(1, 9)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, -42, 9)
         assert fail is faildescr
         assert self.cpu.get_latest_value_int(0) == 9
         print 'step 2 ok'
@@ -2201,9 +2175,7 @@
         ops[0].setfailargs([])
         self.cpu.compile_bridge(faildescr, [i2], ops, looptoken)
 
-        self.cpu.set_future_value_int(0, -42)
-        self.cpu.set_future_value_int(1, 9)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, -42, 9)
         assert fail.identifier == 3
         assert self.cpu.get_latest_value_int(0) == 9
         print 'step 3 ok'
@@ -2212,9 +2184,7 @@
         # mark as failing again
         self.cpu.invalidate_loop(looptoken)
 
-        self.cpu.set_future_value_int(0, -42)
-        self.cpu.set_future_value_int(1, 9)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, -42, 9)
         assert fail is faildescr2
         print 'step 4 ok'
         print '-'*79
@@ -2415,7 +2385,7 @@
         i18 = int_add(i17, i9)
         finish(i18)'''
         loop = parse(ops)
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         looptoken.outermost_jitdriver_sd = FakeJitDriverSD()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
         ARGS = [lltype.Signed] * 10
@@ -2423,9 +2393,8 @@
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
             lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
             EffectInfo.MOST_GENERAL)
-        for i in range(10):
-            self.cpu.set_future_value_int(i, i+1)
-        res = self.cpu.execute_token(looptoken)
+        args = [i+1 for i in range(10)]
+        res = self.cpu.execute_token(looptoken, *args)
         assert self.cpu.get_latest_value_int(0) == 55
         ops = '''
         [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]
@@ -2435,11 +2404,10 @@
         finish(i11)
         '''
         loop = parse(ops, namespace=locals())
-        othertoken = LoopToken()
+        othertoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
-        for i in range(10):
-            self.cpu.set_future_value_int(i, i+1)
-        res = self.cpu.execute_token(othertoken)
+        args = [i+1 for i in range(10)]
+        res = self.cpu.execute_token(othertoken, *args)
         assert self.cpu.get_latest_value_int(0) == 13
         assert called
 
@@ -2471,12 +2439,12 @@
         finish(f2)'''
         loop = parse(ops)
         done_number = self.cpu.get_fail_descr_number(loop.operations[-1].getdescr())
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         looptoken.outermost_jitdriver_sd = FakeJitDriverSD()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.2))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(2.3))
-        res = self.cpu.execute_token(looptoken)
+        args = [longlong.getfloatstorage(1.2),
+                longlong.getfloatstorage(2.3)]
+        res = self.cpu.execute_token(looptoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 1.2 + 2.3
         ops = '''
@@ -2486,11 +2454,11 @@
         finish(f3)
         '''
         loop = parse(ops, namespace=locals())
-        othertoken = LoopToken()
+        othertoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.2))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(3.2))
-        res = self.cpu.execute_token(othertoken)
+        args = [longlong.getfloatstorage(1.2),
+                longlong.getfloatstorage(3.2)]
+        res = self.cpu.execute_token(othertoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 13.5
         assert called
@@ -2499,11 +2467,11 @@
         del called[:]
         self.cpu.done_with_this_frame_float_v = done_number
         try:
-            othertoken = LoopToken()
+            othertoken = JitCellToken()
             self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
-            self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.2))
-            self.cpu.set_future_value_float(1, longlong.getfloatstorage(3.2))
-            res = self.cpu.execute_token(othertoken)
+            args = [longlong.getfloatstorage(1.2),
+                    longlong.getfloatstorage(3.2)]
+            res = self.cpu.execute_token(othertoken, *args)
             x = self.cpu.get_latest_value_float(0)
             assert longlong.getrealfloat(x) == 1.2 + 3.2
             assert not called
@@ -2561,12 +2529,12 @@
         f2 = float_add(f0, f1)
         finish(f2)'''
         loop = parse(ops)
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         looptoken.outermost_jitdriver_sd = FakeJitDriverSD()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.25))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(2.35))
-        res = self.cpu.execute_token(looptoken)
+        args = [longlong.getfloatstorage(1.25),
+                longlong.getfloatstorage(2.35)]
+        res = self.cpu.execute_token(looptoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 1.25 + 2.35
         assert not called
@@ -2578,13 +2546,13 @@
         finish(f3)
         '''
         loop = parse(ops, namespace=locals())
-        othertoken = LoopToken()
+        othertoken = JitCellToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
 
         # normal call_assembler: goes to looptoken
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(1.25))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(3.25))
-        res = self.cpu.execute_token(othertoken)
+        args = [longlong.getfloatstorage(1.25),
+                longlong.getfloatstorage(3.25)]
+        res = self.cpu.execute_token(othertoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 13.5
         assert called
@@ -2596,7 +2564,7 @@
         f2 = float_sub(f0, f1)
         finish(f2)'''
         loop = parse(ops)
-        looptoken2 = LoopToken()
+        looptoken2 = JitCellToken()
         looptoken2.outermost_jitdriver_sd = FakeJitDriverSD()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken2)
 
@@ -2604,10 +2572,9 @@
         self.cpu.redirect_call_assembler(looptoken, looptoken2)
 
         # now, our call_assembler should go to looptoken2
-        self.cpu.set_future_value_float(0, longlong.getfloatstorage(6.0))
-        self.cpu.set_future_value_float(1, longlong.getfloatstorage(1.5))
-                                                       # 6.0-1.5 == 1.25+3.25
-        res = self.cpu.execute_token(othertoken)
+        args = [longlong.getfloatstorage(6.0),
+                longlong.getfloatstorage(1.5)]         # 6.0-1.5 == 1.25+3.25
+        res = self.cpu.execute_token(othertoken, *args)
         x = self.cpu.get_latest_value_float(0)
         assert longlong.getrealfloat(x) == 13.5
         assert called
@@ -2958,13 +2925,137 @@
             ResOperation(rop.FINISH, [p0], None, descr=BasicFailDescr(1))
             ]
         inputargs = [i0]
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         self.cpu.compile_loop(inputargs, operations, looptoken)
         # overflowing value:
-        self.cpu.set_future_value_int(0, sys.maxint // 4 + 1)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, sys.maxint // 4 + 1)
         assert fail.identifier == excdescr.identifier
 
+    def test_compile_loop_with_target(self):   # loop with two LABELs; a bridge later jumps to the 2nd
+        i0 = BoxInt()
+        i1 = BoxInt()
+        i2 = BoxInt()
+        i3 = BoxInt()
+        looptoken = JitCellToken()
+        targettoken1 = TargetToken()
+        targettoken2 = TargetToken()
+        faildescr = BasicFailDescr(2)
+        operations = [
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken1),   # loop head, target of the JUMP below
+            ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
+            ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
+            ResOperation(rop.GUARD_TRUE, [i2], None, descr=faildescr),   # fails once i1 > 9
+            ResOperation(rop.LABEL, [i1], None, descr=targettoken2),   # second label, target of the bridge
+            ResOperation(rop.INT_GE, [i1, ConstInt(0)], i3),
+            ResOperation(rop.GUARD_TRUE, [i3], None, descr=BasicFailDescr(3)),   # fails when i1 < 0
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken1),
+            ]
+        inputargs = [i0]
+        operations[3].setfailargs([i1])   # first GUARD_TRUE
+        operations[6].setfailargs([i1])   # second GUARD_TRUE
+
+        self.cpu.compile_loop(inputargs, operations, looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)   # counts up from 2 until i1 == 10
+        assert fail.identifier == 2
+        res = self.cpu.get_latest_value_int(0)
+        assert res == 10
+
+        inputargs = [i0]
+        operations = [
+            ResOperation(rop.INT_SUB, [i0, ConstInt(20)], i2),   # 10 - 20 == -10
+            ResOperation(rop.JUMP, [i2], None, descr=targettoken2),   # re-enter the loop at the second label
+            ]
+        self.cpu.compile_bridge(faildescr, inputargs, operations, looptoken)
+        
+        fail = self.cpu.execute_token(looptoken, 2)   # same input, but guard 2 now continues into the bridge
+        assert fail.identifier == 3
+        res = self.cpu.get_latest_value_int(0)
+        assert res == -10
+
+    def test_compile_bridge_with_target(self):
+        # This test creates a loopy piece of code in a bridge, and builds another
+        # unrelated loop that ends in a jump directly to this loopy bit of code.
+        # It catches a case in which we underestimate the needed frame_depth across
+        # the cross-loop JUMP, because we estimate it based on the frame_depth stored
+        # in the original loop.
+        i0 = BoxInt()
+        i1 = BoxInt()
+        looptoken1 = JitCellToken()
+        targettoken1 = TargetToken()
+        faildescr1 = BasicFailDescr(2)
+        inputargs = [i0]
+        operations = [   # small original loop: one guard, then FINISH
+            ResOperation(rop.INT_LE, [i0, ConstInt(1)], i1),
+            ResOperation(rop.GUARD_TRUE, [i1], None, descr=faildescr1),
+            ResOperation(rop.FINISH, [i0], None, descr=BasicFailDescr(1234)),
+            ]
+        operations[1].setfailargs([i0])
+        self.cpu.compile_loop(inputargs, operations, looptoken1)
+
+        def func(a, b, c, d, e, f, g, h, i):   # checks that each argument is the previous one plus 2
+            assert a + 2 == b
+            assert a + 4 == c
+            assert a + 6 == d
+            assert a + 8 == e
+            assert a + 10 == f
+            assert a + 12 == g
+            assert a + 14 == h
+            assert a + 16 == i
+        FPTR = self.Ptr(self.FuncType([lltype.Signed]*9, lltype.Void))
+        func_ptr = llhelper(FPTR, func)
+        cpu = self.cpu
+        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Signed,)*9, lltype.Void,
+                                    EffectInfo.MOST_GENERAL)
+        funcbox = self.get_funcbox(cpu, func_ptr)
+
+        i0 = BoxInt(); i1 = BoxInt(); i2 = BoxInt(); i3 = BoxInt(); i4 = BoxInt()
+        i5 = BoxInt(); i6 = BoxInt(); i7 = BoxInt(); i8 = BoxInt(); i9 = BoxInt()
+        i10 = BoxInt(); i11 = BoxInt(); i12 = BoxInt(); i13 = BoxInt(); i14 = BoxInt()
+        i15 = BoxInt(); i16 = BoxInt(); i17 = BoxInt(); i18 = BoxInt(); i19 = BoxInt()
+        i20 = BoxInt()
+        inputargs = [i0]
+        operations = [   # the bridge: a loop of its own, starting at targettoken1
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken1),
+            ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
+            ResOperation(rop.INT_ADD, [i1, ConstInt(1)], i2),
+            ResOperation(rop.INT_ADD, [i2, ConstInt(1)], i3),
+            ResOperation(rop.INT_ADD, [i3, ConstInt(1)], i4),
+            ResOperation(rop.INT_ADD, [i4, ConstInt(1)], i5),
+            ResOperation(rop.INT_ADD, [i5, ConstInt(1)], i6),
+            ResOperation(rop.INT_ADD, [i6, ConstInt(1)], i7),
+            ResOperation(rop.INT_ADD, [i7, ConstInt(1)], i8),
+            ResOperation(rop.INT_ADD, [i8, ConstInt(1)], i9),
+            ResOperation(rop.INT_ADD, [i9, ConstInt(1)], i10),
+            ResOperation(rop.INT_ADD, [i10, ConstInt(1)], i11),
+            ResOperation(rop.INT_ADD, [i11, ConstInt(1)], i12),
+            ResOperation(rop.INT_ADD, [i12, ConstInt(1)], i13),
+            ResOperation(rop.INT_ADD, [i13, ConstInt(1)], i14),
+            ResOperation(rop.INT_ADD, [i14, ConstInt(1)], i15),
+            ResOperation(rop.INT_ADD, [i15, ConstInt(1)], i16),
+            ResOperation(rop.INT_ADD, [i16, ConstInt(1)], i17),
+            ResOperation(rop.INT_ADD, [i17, ConstInt(1)], i18),
+            ResOperation(rop.INT_ADD, [i18, ConstInt(1)], i19),
+            ResOperation(rop.CALL, [funcbox, i2, i4, i6, i8, i10, i12, i14, i16, i18],   # every second box
+                         None, descr=calldescr),
+            ResOperation(rop.CALL, [funcbox, i2, i4, i6, i8, i10, i12, i14, i16, i18],   # same call again
+                         None, descr=calldescr),
+            ResOperation(rop.INT_LT, [i19, ConstInt(100)], i20),
+            ResOperation(rop.GUARD_TRUE, [i20], None, descr=BasicFailDescr(42)),   # exit once i19 >= 100
+            ResOperation(rop.JUMP, [i19], None, descr=targettoken1),
+            ]
+        operations[-2].setfailargs([])   # the GUARD_TRUE just above
+        self.cpu.compile_bridge(faildescr1, inputargs, operations, looptoken1)
+
+        looptoken2 = JitCellToken()
+        inputargs = [BoxInt()]
+        operations = [   # unrelated loop: only a JUMP into the bridge, passing the constant 0
+            ResOperation(rop.JUMP, [ConstInt(0)], None, descr=targettoken1),
+            ]
+        self.cpu.compile_loop(inputargs, operations, looptoken2)
+
+        fail = self.cpu.execute_token(looptoken2, -9)   # the input box is not used by the JUMP above
+        assert fail.identifier == 42
+
 
 class OOtypeBackendTest(BaseBackendTest):
 
diff --git a/pypy/jit/backend/test/test_random.py b/pypy/jit/backend/test/test_random.py
--- a/pypy/jit/backend/test/test_random.py
+++ b/pypy/jit/backend/test/test_random.py
@@ -3,9 +3,10 @@
 from pypy.rlib.rarithmetic import intmask, LONG_BIT
 from pypy.rpython.lltypesystem import llmemory
 from pypy.jit.metainterp.history import BasicFailDescr, TreeLoop
-from pypy.jit.metainterp.history import BoxInt, ConstInt, LoopToken
-from pypy.jit.metainterp.history import BoxPtr, ConstPtr
+from pypy.jit.metainterp.history import BoxInt, ConstInt, JitCellToken
+from pypy.jit.metainterp.history import BoxPtr, ConstPtr, TargetToken
 from pypy.jit.metainterp.history import BoxFloat, ConstFloat, Const
+from pypy.jit.metainterp.history import INT, FLOAT
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.executor import execute_nonspec
 from pypy.jit.metainterp.resoperation import opname
@@ -179,7 +180,7 @@
                 #print >>s, '    operations[%d].suboperations = [' % i
                 #print >>s, '        ResOperation(rop.FAIL, [%s], None)]' % (
                 #    ', '.join([names[v] for v in op.args]))
-        print >>s, '    looptoken = LoopToken()'
+        print >>s, '    looptoken = JitCellToken()'
         print >>s, '    cpu.compile_loop(inputargs, operations, looptoken)'
         if hasattr(self.loop, 'inputargs'):
             for i, v in enumerate(self.loop.inputargs):
@@ -525,29 +526,53 @@
                     startvars.append(BoxFloat(r.random_float_storage()))
                 else:
                     startvars.append(BoxInt(r.random_integer()))
+            allow_delay = True
+        else:
+            allow_delay = False
         assert len(dict.fromkeys(startvars)) == len(startvars)
         self.startvars = startvars
         self.prebuilt_ptr_consts = []
         self.r = r
-        self.build_random_loop(cpu, builder_factory, r, startvars)
+        self.build_random_loop(cpu, builder_factory, r, startvars, allow_delay)
 
-    def build_random_loop(self, cpu, builder_factory, r, startvars):
+    def build_random_loop(self, cpu, builder_factory, r, startvars, allow_delay):
 
         loop = TreeLoop('test_random_function')
         loop.inputargs = startvars[:]
         loop.operations = []
-        loop.token = LoopToken()
-
+        loop._jitcelltoken = JitCellToken()
         builder = builder_factory(cpu, loop, startvars[:])
-        self.generate_ops(builder, r, loop, startvars)
+        if allow_delay:
+            needs_a_label = True
+        else:
+            self.insert_label(loop, 0, r)
+            needs_a_label = False
+        self.generate_ops(builder, r, loop, startvars, needs_a_label=needs_a_label)
         self.builder = builder
         self.loop = loop
-        cpu.compile_loop(loop.inputargs, loop.operations, loop.token)
+        dump(loop)
+        cpu.compile_loop(loop.inputargs, loop.operations, loop._jitcelltoken)
 
-    def generate_ops(self, builder, r, loop, startvars):
+    def insert_label(self, loop, position, r):   # insert the loop's LABEL at 'position', or earlier
+        assert not hasattr(loop, '_targettoken')   # called at most once per loop
+        for i in range(position):   # range is computed up front; rebinding 'position' below is safe
+            op = loop.operations[i]
+            if (not op.has_no_side_effect()
+                    or not isinstance(op.result, (BoxInt, BoxFloat))):
+                position = i
+                break       # cannot move the LABEL later
+            randompos = r.randrange(0, len(self.startvars)+1)
+            self.startvars.insert(randompos, op.result)   # results computed before the LABEL join its args
+        loop._targettoken = TargetToken()
+        loop.operations.insert(position, ResOperation(rop.LABEL, self.startvars, None,   # same list object
+                                                      loop._targettoken))
+
+    def generate_ops(self, builder, r, loop, startvars, needs_a_label=False):
         block_length = pytest.config.option.block_length
+        istart = 0
 
         for i in range(block_length):
+            istart = len(loop.operations)
             try:
                 op = r.choice(builder.OPERATIONS)
                 op.filter(builder)
@@ -556,6 +581,12 @@
                 pass
             if builder.should_fail_by is not None:
                 break
+            if needs_a_label and r.random() < 0.2:
+                self.insert_label(loop, istart, r)
+                needs_a_label = False
+        if needs_a_label:
+            self.insert_label(loop, istart, r)
+
         endvars = []
         used_later = {}
         for op in loop.operations:
@@ -581,6 +612,22 @@
         if pytest.config.option.output:
             builder.print_loop()
 
+    def runjitcelltoken(self):   # return the JitCellToken to execute in order to run self.loop
+        if self.startvars == self.loop.inputargs:
+            return self.loop._jitcelltoken   # startvars match the inputargs: run the loop directly
+        if not hasattr(self, '_initialjumploop_celltoken'):   # built once, then cached on self
+            self._initialjumploop_celltoken = JitCellToken()
+            args = []
+            for box in self.startvars:
+                if box not in self.loop.inputargs:
+                    box = box.constbox()   # not an input of the loop: pass it as a constant
+                args.append(box)
+            self.cpu.compile_loop(self.loop.inputargs,   # helper loop made of a single JUMP
+                                  [ResOperation(rop.JUMP, args, None,
+                                                descr=self.loop._targettoken)],
+                                  self._initialjumploop_celltoken)
+        return self._initialjumploop_celltoken
+
     def get_fail_args(self):
         if self.should_fail_by.is_guard():
             assert self.should_fail_by.getfailargs() is not None
@@ -608,14 +655,8 @@
         exc = cpu.grab_exc_value()
         assert not exc
 
-        for i, box in enumerate(self.startvars):
-            if isinstance(box, BoxInt):
-                cpu.set_future_value_int(i, box.value)
-            elif isinstance(box, BoxFloat):
-                cpu.set_future_value_float(i, box.value)
-            else:
-                raise NotImplementedError(box)
-        fail = cpu.execute_token(self.loop.token)
+        arguments = [box.value for box in self.loop.inputargs]
+        fail = cpu.execute_token(self.runjitcelltoken(), *arguments)
         assert fail is self.should_fail_by.getdescr()
         for i, v in enumerate(self.get_fail_args()):
             if isinstance(v, (BoxFloat, ConstFloat)):
@@ -676,33 +717,55 @@
             # to build_bridge().)
 
             # First make up the other loop...
-            subset = bridge_builder.subset_of_intvars(r)
-            subset = [i for i in subset if i in fail_args]
-            if len(subset) == 0:
-                return False
+            #
+            # New restriction: must have the same argument count and types
+            # as the original loop
+            subset = []
+            for box in self.loop.inputargs:
+                srcbox = r.choice(fail_args)
+                if srcbox.type != box.type:
+                    if box.type == INT:
+                        srcbox = ConstInt(r.random_integer())
+                    elif box.type == FLOAT:
+                        srcbox = ConstFloat(r.random_float_storage())
+                    else:
+                        raise AssertionError(box.type)
+                subset.append(srcbox)
+            #
             args = [x.clonebox() for x in subset]
             rl = RandomLoop(self.builder.cpu, self.builder.fork,
                                      r, args)
+            dump(rl.loop)
             self.cpu.compile_loop(rl.loop.inputargs, rl.loop.operations,
-                                  rl.loop.token)
+                                  rl.loop._jitcelltoken)
             # done
             self.should_fail_by = rl.should_fail_by
             self.expected = rl.expected
             assert len(rl.loop.inputargs) == len(args)
             # The new bridge's execution will end normally at its FINISH.
             # Just replace the FINISH with the JUMP to the new loop.
-            jump_op = ResOperation(rop.JUMP, subset, None, descr=rl.loop.token)
+            jump_op = ResOperation(rop.JUMP, subset, None,
+                                   descr=rl.loop._targettoken)
             subloop.operations[-1] = jump_op
             self.guard_op = rl.guard_op
             self.prebuilt_ptr_consts += rl.prebuilt_ptr_consts
-            self.loop.token.record_jump_to(rl.loop.token)
+            self.loop._jitcelltoken.record_jump_to(rl.loop._jitcelltoken)
             self.dont_generate_more = True
         if r.random() < .05:
             return False
+        dump(subloop)
         self.builder.cpu.compile_bridge(fail_descr, fail_args,
-                                        subloop.operations, self.loop.token)
+                                        subloop.operations,
+                                        self.loop._jitcelltoken)
         return True
 
+def dump(loop):   # debugging aid: print the loop, its inputargs (if any) and its operations to stderr
+    print >> sys.stderr, loop
+    if hasattr(loop, 'inputargs'):   # only printed when the attribute is present
+        print >> sys.stderr, '\t', loop.inputargs
+    for op in loop.operations:
+        print >> sys.stderr, '\t', op   # one tab-indented line per operation
+
 def check_random_function(cpu, BuilderClass, r, num=None, max=None):
     loop = RandomLoop(cpu, BuilderClass, r)
     while True:
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -2,8 +2,8 @@
 from pypy.jit.backend.llsupport import symbolic
 from pypy.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from pypy.jit.metainterp.history import Const, Box, BoxInt, ConstInt
-from pypy.jit.metainterp.history import (AbstractFailDescr, INT, REF, FLOAT,
-                                         LoopToken)
+from pypy.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
+from pypy.jit.metainterp.history import JitCellToken
 from pypy.rpython.lltypesystem import lltype, rffi, rstr, llmemory
 from pypy.rpython.lltypesystem.lloperation import llop
 from pypy.rpython.annlowlevel import llhelper
@@ -38,6 +38,7 @@
 from pypy.jit.backend.x86.jump import remap_frame_layout
 from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.codewriter import longlong
+from pypy.rlib.rarithmetic import intmask
 
 # darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
 # better safe than sorry
@@ -152,14 +153,13 @@
         allblocks = self.get_asmmemmgr_blocks(looptoken)
         self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                         allblocks)
+        self.target_tokens_currently_compiling = {}
 
     def teardown(self):
         self.pending_guard_tokens = None
         if WORD == 8:
             self.pending_memoryerror_trampoline_from = None
         self.mc = None
-        self.looppos = -1
-        self.currently_compiling_loop = None
         self.current_clt = None
 
     def finish_once(self):
@@ -310,12 +310,11 @@
                 mc.MOVSD_sx(8*i, i)     # xmm0 to xmm7
         #
         if IS_X86_32:
-            mc.LEA_rb(eax.value, +8)
             stack_size += 2*WORD
             mc.PUSH_r(eax.value)        # alignment
-            mc.PUSH_r(eax.value)
+            mc.PUSH_r(esp.value)
         elif IS_X86_64:
-            mc.LEA_rb(edi.value, +16)
+            mc.MOV_rr(edi.value, esp.value)
         #
         # esp is now aligned to a multiple of 16 again
         mc.CALL(imm(slowpathaddr))
@@ -326,7 +325,7 @@
         jnz_location = mc.get_relative_pos()
         #
         if IS_X86_32:
-            mc.ADD_ri(esp.value, 2*WORD)
+            mc.ADD_ri(esp.value, 2*WORD)    # cancel the two PUSHes above
         elif IS_X86_64:
             # restore the registers
             for i in range(7, -1, -1):
@@ -422,12 +421,8 @@
 
     def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
         '''adds the following attributes to looptoken:
-               _x86_loop_code       (an integer giving an address)
-               _x86_bootstrap_code  (an integer giving an address)
-               _x86_direct_bootstrap_code  ( "    "     "    "   )
-               _x86_frame_depth
-               _x86_param_depth
-               _x86_arglocs
+               _x86_function_addr   (address of the generated func, as an int)
+               _x86_loop_code       (debug: addr of the start of the ResOps)
                _x86_debug_checksum
         '''
         # XXX this function is too longish and contains some code
@@ -443,37 +438,35 @@
             assert len(set(inputargs)) == len(inputargs)
 
         self.setup(looptoken)
-        self.currently_compiling_loop = looptoken
         if log:
             self._register_counter(False, looptoken.number)
             operations = self._inject_debugging_code(looptoken, operations)
 
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
-        arglocs, operations = regalloc.prepare_loop(inputargs, operations,
-                                                    looptoken, clt.allgcrefs)
-        looptoken._x86_arglocs = arglocs
-
-        bootstrappos = self.mc.get_relative_pos()
-        stackadjustpos = self._assemble_bootstrap_code(inputargs, arglocs)
-        self.looppos = self.mc.get_relative_pos()
-        looptoken._x86_frame_depth = -1     # temporarily
-        looptoken._x86_param_depth = -1     # temporarily
+        #
+        self._call_header_with_stack_check()
+        stackadjustpos = self._patchable_stackadjust()
+        clt._debug_nbargs = len(inputargs)
+        operations = regalloc.prepare_loop(inputargs, operations,
+                                           looptoken, clt.allgcrefs)
+        looppos = self.mc.get_relative_pos()
+        looptoken._x86_loop_code = looppos
+        clt.frame_depth = -1     # temporarily
+        clt.param_depth = -1     # temporarily
         frame_depth, param_depth = self._assemble(regalloc, operations)
-        looptoken._x86_frame_depth = frame_depth
-        looptoken._x86_param_depth = param_depth
-
-        directbootstrappos = self.mc.get_relative_pos()
-        self._assemble_bootstrap_direct_call(arglocs, self.looppos,
-                                             frame_depth+param_depth)
+        clt.frame_depth = frame_depth
+        clt.param_depth = param_depth
+        #
+        size_excluding_failure_stuff = self.mc.get_relative_pos()
         self.write_pending_failure_recoveries()
-        fullsize = self.mc.get_relative_pos()
+        full_size = self.mc.get_relative_pos()
         #
         rawstart = self.materialize_loop(looptoken)
         debug_start("jit-backend-addr")
         debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
             looptoken.number, loopname,
-            rawstart + self.looppos,
-            rawstart + directbootstrappos,
+            rawstart + looppos,
+            rawstart + size_excluding_failure_stuff,
             rawstart))
         debug_stop("jit-backend-addr")
         self._patch_stackadjust(rawstart + stackadjustpos,
@@ -484,18 +477,17 @@
         if not we_are_translated():
             # used only by looptoken.dump() -- useful in tests
             looptoken._x86_rawstart = rawstart
-            looptoken._x86_fullsize = fullsize
+            looptoken._x86_fullsize = full_size
             looptoken._x86_ops_offset = ops_offset
+        looptoken._x86_function_addr = rawstart
 
-        looptoken._x86_bootstrap_code = rawstart + bootstrappos
-        looptoken._x86_loop_code = rawstart + self.looppos
-        looptoken._x86_direct_bootstrap_code = rawstart + directbootstrappos
+        self.fixup_target_tokens(rawstart)
         self.teardown()
         # oprofile support
         if self.cpu.profile_agent is not None:
             name = "Loop # %s: %s" % (looptoken.number, loopname)
             self.cpu.profile_agent.native_code_written(name,
-                                                       rawstart, fullsize)
+                                                       rawstart, full_size)
         return ops_offset
 
     def assemble_bridge(self, faildescr, inputargs, operations,
@@ -548,6 +540,9 @@
         # patch the jump from original guard
         self.patch_jump_for_descr(faildescr, rawstart)
         ops_offset = self.mc.ops_offset
+        self.fixup_target_tokens(rawstart)
+        self.current_clt.frame_depth = max(self.current_clt.frame_depth, frame_depth)
+        self.current_clt.param_depth = max(self.current_clt.param_depth, param_depth)
         self.teardown()
         # oprofile support
         if self.cpu.profile_agent is not None:
@@ -668,6 +663,11 @@
             mc.copy_to_raw_memory(adr_target)
         faildescr._x86_adr_jump_offset = 0    # means "patched"
 
+    def fixup_target_tokens(self, rawstart):
+        for targettoken in self.target_tokens_currently_compiling:
+            targettoken._x86_loop_code += rawstart
+        self.target_tokens_currently_compiling = None
+
     @specialize.argtype(1)
     def _inject_debugging_code(self, looptoken, operations):
         if self._debug:
@@ -685,20 +685,24 @@
                    ResOperation(rop.INT_ADD, [box, ConstInt(1)], box2),
                    ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
                                 None, descr=self.debug_counter_descr)]
-            operations = ops + operations
+            if operations[0].getopnum() == rop.LABEL:
+                operations = [operations[0]] + ops + operations[1:]
+            else:
+                operations =  ops + operations
         return operations
 
     def _assemble(self, regalloc, operations):
         self._regalloc = regalloc
+        regalloc.compute_hint_frame_locations(operations)
         regalloc.walk_operations(operations)
         if we_are_translated() or self.cpu.dont_keepalive_stuff:
             self._regalloc = None   # else keep it around for debugging
-        frame_depth = regalloc.fm.frame_depth
+        frame_depth = regalloc.fm.get_frame_depth()
         param_depth = regalloc.param_depth
         jump_target_descr = regalloc.jump_target_descr
         if jump_target_descr is not None:
-            target_frame_depth = jump_target_descr._x86_frame_depth
-            target_param_depth = jump_target_descr._x86_param_depth
+            target_frame_depth = jump_target_descr._x86_clt.frame_depth
+            target_param_depth = jump_target_descr._x86_clt.param_depth
             frame_depth = max(frame_depth, target_frame_depth)
             param_depth = max(param_depth, target_param_depth)
         return frame_depth, param_depth
@@ -793,152 +797,21 @@
             self.mc.MOV_ri(ebx.value, rst)           # MOV ebx, rootstacktop
             self.mc.SUB_mi8((ebx.value, 0), 2*WORD)  # SUB [ebx], 2*WORD
 
-    def _assemble_bootstrap_direct_call(self, arglocs, jmppos, stackdepth):
-        if IS_X86_64:
-            return self._assemble_bootstrap_direct_call_64(arglocs, jmppos, stackdepth)
-        # XXX pushing ebx esi and edi is a bit pointless, since we store
-        #     all regsiters anyway, for the case of guard_not_forced
-        # XXX this can be improved greatly. Right now it'll behave like
-        #     a normal call
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header_with_stack_check()
-        self.mc.LEA_rb(esp.value, self._get_offset_of_ebp_from_esp(stackdepth))
-        offset = 2 * WORD
-        tmp = eax
-        xmmtmp = xmm0
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is not None:
-                if isinstance(loc, RegLoc):
-                    assert not loc.is_xmm
-                    self.mc.MOV_rb(loc.value, offset)
-                else:
-                    self.mc.MOV_rb(tmp.value, offset)
-                    self.mc.MOV(loc, tmp)
-                offset += WORD
-            loc = floatlocs[i]
-            if loc is not None:
-                if isinstance(loc, RegLoc):
-                    assert loc.is_xmm
-                    self.mc.MOVSD_xb(loc.value, offset)
-                else:
-                    self.mc.MOVSD_xb(xmmtmp.value, offset)
-                    assert isinstance(loc, StackLoc)
-                    self.mc.MOVSD_bx(loc.value, xmmtmp.value)
-                offset += 2 * WORD
-        endpos = self.mc.get_relative_pos() + 5
-        self.mc.JMP_l(jmppos - endpos)
-        assert endpos == self.mc.get_relative_pos()
-
-    def _assemble_bootstrap_direct_call_64(self, arglocs, jmppos, stackdepth):
-        # XXX: Very similar to _emit_call_64
-
-        src_locs = []
-        dst_locs = []
-        xmm_src_locs = []
-        xmm_dst_locs = []
-        get_from_stack = []
-
-        # In reverse order for use with pop()
-        unused_gpr = [r9, r8, ecx, edx, esi, edi]
-        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
-
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header_with_stack_check()
-        self.mc.LEA_rb(esp.value, self._get_offset_of_ebp_from_esp(stackdepth))
-
-        # The lists are padded with Nones
-        assert len(nonfloatlocs) == len(floatlocs)
-
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is not None:
-                if len(unused_gpr) > 0:
-                    src_locs.append(unused_gpr.pop())
-                    dst_locs.append(loc)
-                else:
-                    get_from_stack.append((loc, False))
-
-            floc = floatlocs[i]
-            if floc is not None:
-                if len(unused_xmm) > 0:
-                    xmm_src_locs.append(unused_xmm.pop())
-                    xmm_dst_locs.append(floc)
-                else:
-                    get_from_stack.append((floc, True))
-
-        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs, X86_64_XMM_SCRATCH_REG)
-
-        for i in range(len(get_from_stack)):
-            loc, is_xmm = get_from_stack[i]
-            if is_xmm:
-                self.mc.MOVSD_xb(X86_64_XMM_SCRATCH_REG.value, (2 + i) * WORD)
-                self.mc.MOVSD(loc, X86_64_XMM_SCRATCH_REG)
-            else:
-                self.mc.MOV_rb(X86_64_SCRATCH_REG.value, (2 + i) * WORD)
-                # XXX: We're assuming that "loc" won't require regloc to
-                # clobber the scratch register
-                self.mc.MOV(loc, X86_64_SCRATCH_REG)
-
-        endpos = self.mc.get_relative_pos() + 5
-        self.mc.JMP_l(jmppos - endpos)
-        assert endpos == self.mc.get_relative_pos()
-
     def redirect_call_assembler(self, oldlooptoken, newlooptoken):
         # some minimal sanity checking
-        oldnonfloatlocs, oldfloatlocs = oldlooptoken._x86_arglocs
-        newnonfloatlocs, newfloatlocs = newlooptoken._x86_arglocs
-        assert len(oldnonfloatlocs) == len(newnonfloatlocs)
-        assert len(oldfloatlocs) == len(newfloatlocs)
+        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
+        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
+        assert old_nbargs == new_nbargs
         # we overwrite the instructions at the old _x86_direct_bootstrap_code
         # to start with a JMP to the new _x86_direct_bootstrap_code.
         # Ideally we should rather patch all existing CALLs, but well.
-        oldadr = oldlooptoken._x86_direct_bootstrap_code
-        target = newlooptoken._x86_direct_bootstrap_code
+        oldadr = oldlooptoken._x86_function_addr
+        target = newlooptoken._x86_function_addr
         mc = codebuf.MachineCodeBlockWrapper()
         mc.JMP(imm(target))
+        assert mc.get_relative_pos() <= 13  # keep in sync with prepare_loop()
         mc.copy_to_raw_memory(oldadr)
 
-    def _assemble_bootstrap_code(self, inputargs, arglocs):
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header()
-        stackadjustpos = self._patchable_stackadjust()
-        tmp = eax
-        xmmtmp = xmm0
-        self.mc.begin_reuse_scratch_register()
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is None:
-                continue
-            if isinstance(loc, RegLoc):
-                target = loc
-            else:
-                target = tmp
-            if inputargs[i].type == REF:
-                adr = self.fail_boxes_ptr.get_addr_for_num(i)
-                self.mc.MOV(target, heap(adr))
-                self.mc.MOV(heap(adr), imm0)
-            else:
-                adr = self.fail_boxes_int.get_addr_for_num(i)
-                self.mc.MOV(target, heap(adr))
-            if target is not loc:
-                assert isinstance(loc, StackLoc)
-                self.mc.MOV_br(loc.value, target.value)
-        for i in range(len(floatlocs)):
-            loc = floatlocs[i]
-            if loc is None:
-                continue
-            adr = self.fail_boxes_float.get_addr_for_num(i)
-            if isinstance(loc, RegLoc):
-                self.mc.MOVSD(loc, heap(adr))
-            else:
-                self.mc.MOVSD(xmmtmp, heap(adr))
-                assert isinstance(loc, StackLoc)
-                self.mc.MOVSD_bx(loc.value, xmmtmp.value)
-        self.mc.end_reuse_scratch_register()
-        return stackadjustpos
-
     def dump(self, text):
         if not self.verbose:
             return
@@ -965,7 +838,7 @@
         if isinstance(loc, RegLoc) and loc.is_xmm:
             self.mc.SUB_ri(esp.value, 8)   # = size of doubles
             self.mc.MOVSD_sx(0, loc.value)
-        elif WORD == 4 and isinstance(loc, StackLoc) and loc.width == 8:
+        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
             # XXX evil trick
             self.mc.PUSH_b(get_ebp_ofs(loc.position))
             self.mc.PUSH_b(get_ebp_ofs(loc.position + 1))
@@ -976,13 +849,25 @@
         if isinstance(loc, RegLoc) and loc.is_xmm:
             self.mc.MOVSD_xs(loc.value, 0)
             self.mc.ADD_ri(esp.value, 8)   # = size of doubles
-        elif WORD == 4 and isinstance(loc, StackLoc) and loc.width == 8:
+        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
             # XXX evil trick
             self.mc.POP_b(get_ebp_ofs(loc.position + 1))
             self.mc.POP_b(get_ebp_ofs(loc.position))
         else:
             self.mc.POP(loc)
 
+    def regalloc_immedmem2mem(self, from_loc, to_loc):
+        # move a ConstFloatLoc directly to a StackLoc, as two MOVs
+        # (even on x86-64, because the immediates are encoded as 32 bits)
+        assert isinstance(from_loc, ConstFloatLoc)
+        assert isinstance(to_loc,   StackLoc)
+        low_part  = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[0]
+        high_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[1]
+        low_part  = intmask(low_part)
+        high_part = intmask(high_part)
+        self.mc.MOV_bi(to_loc.value,     low_part)
+        self.mc.MOV_bi(to_loc.value + 4, high_part)
+
     def regalloc_perform(self, op, arglocs, resloc):
         genop_list[op.getopnum()](self, op, arglocs, resloc)
 
@@ -1134,18 +1019,18 @@
                     self.mc.MOVSD_sx(p, loc.value)
                 else:
                     self.mc.MOV_sr(p, loc.value)
-            p += round_up_to_4(loc.width)
+            p += loc.get_width()
         p = 0
         for i in range(start, n):
             loc = arglocs[i]
             if not isinstance(loc, RegLoc):
-                if loc.width == 8:
+                if loc.get_width() == 8:
                     self.mc.MOVSD(xmm0, loc)
                     self.mc.MOVSD_sx(p, xmm0.value)
                 else:
                     self.mc.MOV(tmp, loc)
                     self.mc.MOV_sr(p, tmp.value)
-            p += round_up_to_4(loc.width)
+            p += loc.get_width()
         self._regalloc.reserve_param(p//WORD)
         # x is a location
         self.mc.CALL(x)
@@ -1882,10 +1767,10 @@
     DESCR_INT       = 0x01
     DESCR_FLOAT     = 0x02
     DESCR_SPECIAL   = 0x03
-    # XXX: 4*8 works on i386, should we optimize for that case?
-    CODE_FROMSTACK  = 4*16
+    CODE_FROMSTACK  = 4 * (8 + 8*IS_X86_64)
     CODE_STOP       = 0 | DESCR_SPECIAL
     CODE_HOLE       = 4 | DESCR_SPECIAL
+    CODE_INPUTARG   = 8 | DESCR_SPECIAL
 
     def write_failure_recovery_description(self, mc, failargs, locs):
         for i in range(len(failargs)):
@@ -1901,7 +1786,11 @@
                     raise AssertionError("bogus kind")
                 loc = locs[i]
                 if isinstance(loc, StackLoc):
-                    n = self.CODE_FROMSTACK//4 + loc.position
+                    pos = loc.position
+                    if pos < 0:
+                        mc.writechar(chr(self.CODE_INPUTARG))
+                        pos = ~pos
+                    n = self.CODE_FROMSTACK//4 + pos
                 else:
                     assert isinstance(loc, RegLoc)
                     n = loc.value
@@ -1921,6 +1810,7 @@
         descr_to_box_type = [REF, INT, FLOAT]
         bytecode = rffi.cast(rffi.UCHARP, bytecode)
         arglocs = []
+        code_inputarg = False
         while 1:
             # decode the next instruction from the bytecode
             code = rffi.cast(lltype.Signed, bytecode[0])
@@ -1939,11 +1829,17 @@
                             break
                 kind = code & 3
                 code = (code - self.CODE_FROMSTACK) >> 2
+                if code_inputarg:
+                    code = ~code
+                    code_inputarg = False
                 loc = X86FrameManager.frame_pos(code, descr_to_box_type[kind])
             elif code == self.CODE_STOP:
                 break
             elif code == self.CODE_HOLE:
                 continue
+            elif code == self.CODE_INPUTARG:
+                code_inputarg = True
+                continue
             else:
                 # 'code' identifies a register
                 kind = code & 3
@@ -1959,6 +1855,7 @@
     def grab_frame_values(self, bytecode, frame_addr, allregisters):
         # no malloc allowed here!!
         self.fail_ebp = allregisters[16 + ebp.value]
+        code_inputarg = False
         num = 0
         value_hi = 0
         while 1:
@@ -1979,6 +1876,9 @@
                 # load the value from the stack
                 kind = code & 3
                 code = (code - self.CODE_FROMSTACK) >> 2
+                if code_inputarg:
+                    code = ~code
+                    code_inputarg = False
                 stackloc = frame_addr + get_ebp_ofs(code)
                 value = rffi.cast(rffi.SIGNEDP, stackloc)[0]
                 if kind == self.DESCR_FLOAT and WORD == 4:
@@ -1991,6 +1891,9 @@
                     if code == self.CODE_HOLE:
                         num += 1
                         continue
+                    if code == self.CODE_INPUTARG:
+                        code_inputarg = True
+                        continue
                     assert code == self.CODE_STOP
                     break
                 code >>= 2
@@ -2095,9 +1998,9 @@
         # returns in eax the fail_index
 
         # now we return from the complete frame, which starts from
-        # _assemble_bootstrap_code().  The LEA in _call_footer below throws
-        # away most of the frame, including all the PUSHes that we did just
-        # above.
+        # _call_header_with_stack_check().  The LEA in _call_footer below
+        # throws away most of the frame, including all the PUSHes that we
+        # did just above.
 
         self._call_footer()
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
@@ -2180,7 +2083,7 @@
                         argtypes=op.getdescr().get_arg_types(),
                         callconv=op.getdescr().get_call_conv())
 
-        if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.width == 8:
+        if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.type == FLOAT:
             # a float or a long long return
             if op.getdescr().get_return_type() == 'L':
                 self.mc.MOV_br(resloc.value, eax.value)      # long long
@@ -2344,11 +2247,11 @@
         fail_index = self.cpu.get_fail_descr_number(faildescr)
         self.mc.MOV_bi(FORCE_INDEX_OFS, fail_index)
         descr = op.getdescr()
-        assert isinstance(descr, LoopToken)
-        assert len(arglocs) - 2 == len(descr._x86_arglocs[0])
+        assert isinstance(descr, JitCellToken)
+        assert len(arglocs) - 2 == descr.compiled_loop_token._debug_nbargs
         #
-        # Write a call to the direct_bootstrap_code of the target assembler
-        self._emit_call(fail_index, imm(descr._x86_direct_bootstrap_code),
+        # Write a call to the target assembler
+        self._emit_call(fail_index, imm(descr._x86_function_addr),
                         arglocs, 2, tmp=eax)
         if op.result is None:
             assert result_loc is None
@@ -2578,15 +2481,21 @@
                     gcrootmap.put(self.gcrootmap_retaddr_forced, mark)
                     self.gcrootmap_retaddr_forced = -1
 
-    def target_arglocs(self, loop_token):
-        return loop_token._x86_arglocs
-
-    def closing_jump(self, loop_token):
-        if loop_token is self.currently_compiling_loop:
+    def closing_jump(self, target_token):
+        # The backend's logic assumes that the target code is in a piece of
+        # assembler that was also called with the same number of arguments,
+        # so that the locations [ebp+8..] of the input arguments are valid
+        # stack locations both before and after the jump.
+        my_nbargs = self.current_clt._debug_nbargs
+        target_nbargs = target_token._x86_clt._debug_nbargs
+        assert my_nbargs == target_nbargs
+        #
+        target = target_token._x86_loop_code
+        if target_token in self.target_tokens_currently_compiling:
             curpos = self.mc.get_relative_pos() + 5
-            self.mc.JMP_l(self.looppos - curpos)
+            self.mc.JMP_l(target - curpos)
         else:
-            self.mc.JMP(imm(loop_token._x86_loop_code))
+            self.mc.JMP(imm(target))
 
     def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, tid):
         size = max(size, self.cpu.gc_ll_descr.minimal_size_in_nursery)
@@ -2659,11 +2568,6 @@
         num = getattr(rop, opname.upper())
         genop_list[num] = value
 
-def round_up_to_4(size):
-    if size < 4:
-        return 4
-    return size
-
 # XXX: ri386 migration shims:
 def addr_add(reg_or_imm1, reg_or_imm2, offset=0, scale=0):
     return AddressLoc(reg_or_imm1, reg_or_imm2, scale, offset)
diff --git a/pypy/jit/backend/x86/jump.py b/pypy/jit/backend/x86/jump.py
--- a/pypy/jit/backend/x86/jump.py
+++ b/pypy/jit/backend/x86/jump.py
@@ -1,6 +1,6 @@
 import sys
 from pypy.tool.pairtype import extendabletype
-from pypy.jit.backend.x86.regloc import ImmedLoc, StackLoc
+from pypy.jit.backend.x86.regloc import ImmediateAssemblerLocation, StackLoc
 
 def remap_frame_layout(assembler, src_locations, dst_locations, tmpreg):
     pending_dests = len(dst_locations)
@@ -12,7 +12,7 @@
         srccount[key] = 0
     for i in range(len(dst_locations)):
         src = src_locations[i]
-        if isinstance(src, ImmedLoc):
+        if isinstance(src, ImmediateAssemblerLocation):
             continue
         key = src._getregkey()
         if key in srccount:
@@ -31,7 +31,7 @@
                 srccount[key] = -1       # means "it's done"
                 pending_dests -= 1
                 src = src_locations[i]
-                if not isinstance(src, ImmedLoc):
+                if not isinstance(src, ImmediateAssemblerLocation):
                     key = src._getregkey()
                     if key in srccount:
                         srccount[key] -= 1
@@ -66,6 +66,13 @@
 
 def _move(assembler, src, dst, tmpreg):
     if dst.is_memory_reference() and src.is_memory_reference():
+        if isinstance(src, ImmediateAssemblerLocation):
+            assembler.regalloc_immedmem2mem(src, dst)
+            return
+        if tmpreg is None:
+            assembler.regalloc_push(src)
+            assembler.regalloc_pop(dst)
+            return
         assembler.regalloc_mov(src, tmpreg)
         src = tmpreg
     assembler.regalloc_mov(src, dst)
@@ -87,7 +94,7 @@
         dstloc = dst_locations2[i]
         if isinstance(loc, StackLoc):
             key = loc._getregkey()
-            if (key in dst_keys or (loc.width > WORD and
+            if (key in dst_keys or (loc.get_width() > WORD and
                                     (key + WORD) in dst_keys)):
                 assembler.regalloc_push(loc)
                 extrapushes.append(dstloc)
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -5,7 +5,8 @@
 import os
 from pypy.jit.metainterp.history import (Box, Const, ConstInt, ConstPtr,
                                          ResOperation, BoxPtr, ConstFloat,
-                                         BoxFloat, LoopToken, INT, REF, FLOAT)
+                                         BoxFloat, INT, REF, FLOAT,
+                                         TargetToken, JitCellToken)
 from pypy.jit.backend.x86.regloc import *
 from pypy.rpython.lltypesystem import lltype, rffi, rstr
 from pypy.rlib.objectmodel import we_are_translated
@@ -27,7 +28,7 @@
 class X86RegisterManager(RegisterManager):
 
     box_types = [INT, REF]
-    all_regs = [eax, ecx, edx, ebx, esi, edi]
+    all_regs = [ecx, eax, edx, ebx, esi, edi]
     no_lower_byte_regs = [esi, edi]
     save_around_call_regs = [eax, edx, ecx]
     frame_reg = ebp
@@ -59,7 +60,7 @@
 
 class X86_64_RegisterManager(X86RegisterManager):
     # r11 omitted because it's used as scratch
-    all_regs = [eax, ecx, edx, ebx, esi, edi, r8, r9, r10, r12, r13, r14, r15]
+    all_regs = [ecx, eax, edx, ebx, esi, edi, r8, r9, r10, r12, r13, r14, r15]
     no_lower_byte_regs = []
     save_around_call_regs = [eax, ecx, edx, esi, edi, r8, r9, r10]
 
@@ -129,15 +130,19 @@
     @staticmethod
     def frame_pos(i, box_type):
         if IS_X86_32 and box_type == FLOAT:
-            return StackLoc(i, get_ebp_ofs(i+1), 2, box_type)
+            return StackLoc(i, get_ebp_ofs(i+1), box_type)
         else:
-            return StackLoc(i, get_ebp_ofs(i), 1, box_type)
+            return StackLoc(i, get_ebp_ofs(i), box_type)
     @staticmethod
     def frame_size(box_type):
         if IS_X86_32 and box_type == FLOAT:
             return 2
         else:
             return 1
+    @staticmethod
+    def get_loc_index(loc):
+        assert isinstance(loc, StackLoc)
+        return loc.position
 
 if WORD == 4:
     gpr_reg_mgr_cls = X86RegisterManager
@@ -159,6 +164,8 @@
         # to be read/used by the assembler too
         self.jump_target_descr = None
         self.close_stack_struct = 0
+        self.final_jump_op = None
+        self.min_bytes_before_label = 0
 
     def _prepare(self, inputargs, operations, allgcrefs):
         self.fm = X86FrameManager()
@@ -167,70 +174,83 @@
         operations = cpu.gc_ll_descr.rewrite_assembler(cpu, operations,
                                                        allgcrefs)
         # compute longevity of variables
-        longevity, useful = self._compute_vars_longevity(inputargs, operations)
-        self.longevity = longevity
-        self.rm = gpr_reg_mgr_cls(longevity,
+        self._compute_vars_longevity(inputargs, operations)
+        self.rm = gpr_reg_mgr_cls(self.longevity,
                                   frame_manager = self.fm,
                                   assembler = self.assembler)
-        self.xrm = xmm_reg_mgr_cls(longevity, frame_manager = self.fm,
+        self.xrm = xmm_reg_mgr_cls(self.longevity, frame_manager = self.fm,
                                    assembler = self.assembler)
-        return operations, useful
+        return operations
 
     def prepare_loop(self, inputargs, operations, looptoken, allgcrefs):
-        operations, useful = self._prepare(inputargs, operations, allgcrefs)
-        return self._process_inputargs(inputargs, useful), operations
+        operations = self._prepare(inputargs, operations, allgcrefs)
+        self._set_initial_bindings(inputargs)
+        # note: we need to make a copy of inputargs because possibly_free_vars
+        # is also used on op args, which is a non-resizable list
+        self.possibly_free_vars(list(inputargs))
+        self.min_bytes_before_label = 13
+        return operations
 
     def prepare_bridge(self, prev_depths, inputargs, arglocs, operations,
                        allgcrefs):
-        operations, _ = self._prepare(inputargs, operations, allgcrefs)
+        operations = self._prepare(inputargs, operations, allgcrefs)
         self._update_bindings(arglocs, inputargs)
-        self.fm.frame_depth = prev_depths[0]
         self.param_depth = prev_depths[1]
         return operations
 
     def reserve_param(self, n):
         self.param_depth = max(self.param_depth, n)
 
-    def _process_inputargs(self, inputargs, useful):
-        # XXX we can sort out here by longevity if we need something
-        # more optimal
-        floatlocs = [None] * len(inputargs)
-        nonfloatlocs = [None] * len(inputargs)
-        # Don't use all_regs[0] for passing arguments around a loop.
-        # Must be kept in sync with consider_jump().
-        # XXX this should probably go to llsupport/regalloc.py
-        xmmtmp = self.xrm.free_regs.pop(0)
-        tmpreg = self.rm.free_regs.pop(0)
-        assert tmpreg == X86RegisterManager.all_regs[0]
-        assert xmmtmp == X86XMMRegisterManager.all_regs[0]
-        for i in range(len(inputargs)):
-            arg = inputargs[i]
-            assert not isinstance(arg, Const)
-            reg = None
-            if self.longevity[arg][1] > -1 and arg in useful:
-                if arg.type == FLOAT:
-                    # xxx is it really a good idea?  at the first CALL they
-                    # will all be flushed anyway
-                    reg = self.xrm.try_allocate_reg(arg)
+    def _set_initial_bindings(self, inputargs):
+        if IS_X86_64:
+            inputargs = self._set_initial_bindings_regs_64(inputargs)
+        #                   ...
+        # stack layout:     arg2
+        #                   arg1
+        #                   arg0
+        #                   return address
+        #                   saved ebp        <-- ebp points here
+        #                   ...
+        cur_frame_pos = - 1 - FRAME_FIXED_SIZE
+        assert get_ebp_ofs(cur_frame_pos-1) == 2*WORD
+        assert get_ebp_ofs(cur_frame_pos-2) == 3*WORD
+        #
+        for box in inputargs:
+            assert isinstance(box, Box)
+            #
+            if IS_X86_32 and box.type == FLOAT:
+                cur_frame_pos -= 2
+            else:
+                cur_frame_pos -= 1
+            loc = self.fm.frame_pos(cur_frame_pos, box.type)
+            self.fm.set_binding(box, loc)
+
+    def _set_initial_bindings_regs_64(self, inputargs):
+        # In reverse order for use with pop()
+        unused_gpr = [r9, r8, ecx, edx, esi, edi]
+        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
+        #
+        pass_on_stack = []
+        #
+        for box in inputargs:
+            assert isinstance(box, Box)
+            #
+            if box.type == FLOAT:
+                if len(unused_xmm) > 0:
+                    ask = unused_xmm.pop()
+                    got = self.xrm.try_allocate_reg(box, selected_reg=ask)
+                    assert ask == got
                 else:
-                    reg = self.rm.try_allocate_reg(arg)
-            if reg:
-                loc = reg
+                    pass_on_stack.append(box)
             else:
-                loc = self.fm.loc(arg)
-            if arg.type == FLOAT:
-                floatlocs[i] = loc
-            else:
-                nonfloatlocs[i] = loc
-            # otherwise we have it saved on stack, so no worry
-        self.rm.free_regs.insert(0, tmpreg)
-        self.xrm.free_regs.insert(0, xmmtmp)
-        assert tmpreg not in nonfloatlocs
-        assert xmmtmp not in floatlocs
-        # note: we need to make a copy of inputargs because possibly_free_vars
-        # is also used on op args, which is a non-resizable list
-        self.possibly_free_vars(list(inputargs))
-        return nonfloatlocs, floatlocs
+                if len(unused_gpr) > 0:
+                    ask = unused_gpr.pop()
+                    got = self.rm.try_allocate_reg(box, selected_reg=ask)
+                    assert ask == got
+                else:
+                    pass_on_stack.append(box)
+        #
+        return pass_on_stack
 
     def possibly_free_var(self, var):
         if var.type == FLOAT:
@@ -307,7 +327,7 @@
                     self.xrm.reg_bindings[arg] = loc
                     used[loc] = None
                 else:
-                    self.fm.frame_bindings[arg] = loc
+                    self.fm.set_binding(arg, loc)
             else:
                 if isinstance(loc, RegLoc):
                     if loc is ebp:
@@ -316,7 +336,7 @@
                         self.rm.reg_bindings[arg] = loc
                         used[loc] = None
                 else:
-                    self.fm.frame_bindings[arg] = loc
+                    self.fm.set_binding(arg, loc)
         self.rm.free_regs = []
         for reg in self.rm.all_regs:
             if reg not in used:
@@ -352,7 +372,7 @@
     def get_current_depth(self):
         # return (self.fm.frame_depth, self.param_depth), but trying to share
         # the resulting tuple among several calls
-        arg0 = self.fm.frame_depth
+        arg0 = self.fm.get_frame_depth()
         arg1 = self.param_depth
         result = self.assembler._current_depths_cache
         if result[0] != arg0 or result[1] != arg1:
@@ -441,8 +461,15 @@
             i += 1
         assert not self.rm.reg_bindings
         assert not self.xrm.reg_bindings
+        self.flush_loop()
         self.assembler.mc.mark_op(None) # end of the loop
 
+    def flush_loop(self):
+        # rare case: if the loop is too short, pad with NOPs
+        mc = self.assembler.mc
+        while mc.get_relative_pos() < self.min_bytes_before_label:
+            mc.NOP()
+
     def _compute_vars_longevity(self, inputargs, operations):
         # compute a dictionary that maps variables to index in
         # operations that is a "last-time-seen"
@@ -453,7 +480,7 @@
         # only to guard operations or to jump or to finish
         produced = {}
         last_used = {}
-        useful = {}
+        last_real_usage = {}
         for i in range(len(operations)-1, -1, -1):
             op = operations[i]
             if op.result:
@@ -464,10 +491,13 @@
             opnum = op.getopnum()
             for j in range(op.numargs()):
                 arg = op.getarg(j)
-                if opnum != rop.JUMP and opnum != rop.FINISH:
-                    useful[arg] = None
-                if isinstance(arg, Box) and arg not in last_used:
+                if not isinstance(arg, Box):
+                    continue
+                if arg not in last_used:
                     last_used[arg] = i
+                if opnum != rop.JUMP and opnum != rop.LABEL:
+                    if arg not in last_real_usage:
+                        last_real_usage[arg] = i
             if op.is_guard():
                 for arg in op.getfailargs():
                     if arg is None: # hole
@@ -475,7 +505,8 @@
                     assert isinstance(arg, Box)
                     if arg not in last_used:
                         last_used[arg] = i
-
+        self.last_real_usage = last_real_usage
+        #
         longevity = {}
         for arg in produced:
             if arg in last_used:
@@ -491,7 +522,7 @@
                 longevity[arg] = (0, last_used[arg])
                 del last_used[arg]
         assert len(last_used) == 0
-        return longevity, useful
+        self.longevity = longevity
 
     def loc(self, v):
         if v is None: # xxx kludgy
@@ -888,7 +919,7 @@
 
     def consider_call_assembler(self, op, guard_op):
         descr = op.getdescr()
-        assert isinstance(descr, LoopToken)
+        assert isinstance(descr, JitCellToken)
         jd = descr.outermost_jitdriver_sd
         assert jd is not None
         size = jd.portal_calldescr.get_result_size(self.translate_support_code)
@@ -1318,35 +1349,72 @@
             self.rm.possibly_free_var(tmpbox_low)
         self.rm.possibly_free_var(tmpbox_high)
 
+    def compute_hint_frame_locations(self, operations):
+        # optimization only: fill in the 'hint_frame_locations' dictionary
+        # of 'fm' based on the JUMP at the end of the loop, by looking
+        # at where we would like the boxes to be after the jump.
+        op = operations[-1]
+        if op.getopnum() != rop.JUMP:
+            return
+        self.final_jump_op = op
+        descr = op.getdescr()
+        assert isinstance(descr, TargetToken)
+        if descr._x86_loop_code != 0:
+            # if the target LABEL was already compiled, i.e. if it belongs
+            # to some already-compiled piece of code
+            self._compute_hint_frame_locations_from_descr(descr)
+        #else:
+        #   The loop ends in a JUMP going back to a LABEL in the same loop.
+        #   We cannot fill 'hint_frame_locations' immediately, but we can
+        #   wait until the corresponding consider_label() to know where
+        #   we would like the boxes to be after the jump.
+
+    def _compute_hint_frame_locations_from_descr(self, descr):
+        arglocs = descr._x86_arglocs
+        jump_op = self.final_jump_op
+        assert len(arglocs) == jump_op.numargs()
+        for i in range(jump_op.numargs()):
+            box = jump_op.getarg(i)
+            if isinstance(box, Box):
+                loc = arglocs[i]
+                if isinstance(loc, StackLoc):
+                    self.fm.hint_frame_locations[box] = loc
+
     def consider_jump(self, op):
         assembler = self.assembler
         assert self.jump_target_descr is None
         descr = op.getdescr()
-        assert isinstance(descr, LoopToken)
+        assert isinstance(descr, TargetToken)
+        arglocs = descr._x86_arglocs
         self.jump_target_descr = descr
-        nonfloatlocs, floatlocs = assembler.target_arglocs(self.jump_target_descr)
-        # compute 'tmploc' to be all_regs[0] by spilling what is there
-        box = TempBox()
-        box1 = TempBox()
-        tmpreg = X86RegisterManager.all_regs[0]
-        tmploc = self.rm.force_allocate_reg(box, selected_reg=tmpreg)
-        xmmtmp = X86XMMRegisterManager.all_regs[0]
-        self.xrm.force_allocate_reg(box1, selected_reg=xmmtmp)
         # Part about non-floats
-        # XXX we don't need a copy, we only just the original list
-        src_locations1 = [self.loc(op.getarg(i)) for i in range(op.numargs())
-                         if op.getarg(i).type != FLOAT]
-        assert tmploc not in nonfloatlocs
-        dst_locations1 = [loc for loc in nonfloatlocs if loc is not None]
+        src_locations1 = []
+        dst_locations1 = []
         # Part about floats
-        src_locations2 = [self.loc(op.getarg(i)) for i in range(op.numargs())
-                         if op.getarg(i).type == FLOAT]
-        dst_locations2 = [loc for loc in floatlocs if loc is not None]
+        src_locations2 = []
+        dst_locations2 = []
+        # Build the four lists
+        for i in range(op.numargs()):
+            box = op.getarg(i)
+            src_loc = self.loc(box)
+            dst_loc = arglocs[i]
+            if box.type != FLOAT:
+                src_locations1.append(src_loc)
+                dst_locations1.append(dst_loc)
+            else:
+                src_locations2.append(src_loc)
+                dst_locations2.append(dst_loc)
+        # Do we have a temp var?
+        if IS_X86_64:
+            tmpreg = X86_64_SCRATCH_REG
+            xmmtmp = X86_64_XMM_SCRATCH_REG
+        else:
+            tmpreg = None
+            xmmtmp = None
+        # Do the remapping
         remap_frame_layout_mixed(assembler,
-                                 src_locations1, dst_locations1, tmploc,
+                                 src_locations1, dst_locations1, tmpreg,
                                  src_locations2, dst_locations2, xmmtmp)
-        self.rm.possibly_free_var(box)
-        self.xrm.possibly_free_var(box1)
         self.possibly_free_vars_for_op(op)
         assembler.closing_jump(self.jump_target_descr)
 
@@ -1362,7 +1430,7 @@
 
     def get_mark_gc_roots(self, gcrootmap, use_copy_area=False):
         shape = gcrootmap.get_basic_shape(IS_X86_64)
-        for v, val in self.fm.frame_bindings.items():
+        for v, val in self.fm.bindings.items():
             if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
                 assert isinstance(val, StackLoc)
                 gcrootmap.add_frame_offset(shape, get_ebp_ofs(val.position))
@@ -1397,6 +1465,56 @@
         # the FORCE_TOKEN operation returns directly 'ebp'
         self.rm.force_allocate_frame_reg(op.result)
 
+    def consider_label(self, op):
+        descr = op.getdescr()
+        assert isinstance(descr, TargetToken)
+        inputargs = op.getarglist()
+        arglocs = [None] * len(inputargs)
+        #
+        # we use force_spill() on the boxes that are not really going to be
+        # used any more in the loop, but that are kept alive anyway by
+        # appearing in the arguments of a later LABEL or JUMP, or in the
+        # fail_args of some guard
+        position = self.rm.position
+        for arg in inputargs:
+            assert isinstance(arg, Box)
+            if self.last_real_usage.get(arg, -1) <= position:
+                self.force_spill_var(arg)
+        #
+        # we need to make sure that no variable is stored in ebp
+        for arg in inputargs:
+            if self.loc(arg) is ebp:
+                loc2 = self.fm.loc(arg)
+                self.assembler.mc.MOV(loc2, ebp)
+        self.rm.bindings_to_frame_reg.clear()
+        #
+        for i in range(len(inputargs)):
+            arg = inputargs[i]
+            assert isinstance(arg, Box)
+            loc = self.loc(arg)
+            assert loc is not ebp
+            arglocs[i] = loc
+            if isinstance(loc, RegLoc):
+                self.fm.mark_as_free(arg)
+        #
+        # if we are too close to the start of the loop, the label's target may
+        # get overridden by redirect_call_assembler().  (rare case)
+        self.flush_loop()
+        #
+        descr._x86_arglocs = arglocs
+        descr._x86_loop_code = self.assembler.mc.get_relative_pos()
+        descr._x86_clt = self.assembler.current_clt
+        self.assembler.target_tokens_currently_compiling[descr] = None
+        self.possibly_free_vars_for_op(op)
+        #
+        # if the LABEL's descr is precisely the target of the JUMP at the
+        # end of the same loop, i.e. if what we are compiling is a single
+        # loop that ends up jumping to this LABEL, then we can now provide
+        # the hints about the expected position of the spilled variables.
+        jump_op = self.final_jump_op
+        if jump_op is not None and jump_op.getdescr() is descr:
+            self._compute_hint_frame_locations_from_descr(descr)
+
     def not_implemented_op(self, op):
         not_implemented("not implemented operation: %s" % op.getopname())
 
@@ -1452,3 +1570,7 @@
 def not_implemented(msg):
     os.write(2, '[x86/regalloc] %s\n' % msg)
     raise NotImplementedError(msg)
+
+# xxx hack: set a default value for TargetToken._x86_loop_code.
+# If 0, we know that it is a LABEL that was not compiled yet.
+TargetToken._x86_loop_code = 0
diff --git a/pypy/jit/backend/x86/regloc.py b/pypy/jit/backend/x86/regloc.py
--- a/pypy/jit/backend/x86/regloc.py
+++ b/pypy/jit/backend/x86/regloc.py
@@ -16,8 +16,7 @@
 #
 
 class AssemblerLocation(object):
-    # XXX: Is adding "width" here correct?
-    _attrs_ = ('value', 'width', '_location_code')
+    _attrs_ = ('value', '_location_code')
     _immutable_ = True
     def _getregkey(self):
         return self.value
@@ -28,6 +27,9 @@
     def location_code(self):
         return self._location_code
 
+    def get_width(self):
+        raise NotImplementedError
+
     def value_r(self): return self.value
     def value_b(self): return self.value
     def value_s(self): return self.value
@@ -43,14 +45,21 @@
     _immutable_ = True
     _location_code = 'b'
 
-    def __init__(self, position, ebp_offset, num_words, type):
-        assert ebp_offset < 0   # so no confusion with RegLoc.value
+    def __init__(self, position, ebp_offset, type):
+        # _getregkey() returns self.value; the value returned must not
+        # conflict with RegLoc._getregkey().  It doesn't, a bit by chance,
+        # so let the following assert fail if that is no longer the case.
+        assert not (0 <= ebp_offset < 8 + 8 * IS_X86_64)
         self.position = position
         self.value = ebp_offset
-        self.width = num_words * WORD
         # One of INT, REF, FLOAT
         self.type = type
 
+    def get_width(self):
+        if self.type == FLOAT:
+            return 8
+        return WORD
+
     def __repr__(self):
         return '%d(%%ebp)' % (self.value,)
 
@@ -64,10 +73,8 @@
         self.value = regnum
         self.is_xmm = is_xmm
         if self.is_xmm:
-            self.width = 8
             self._location_code = 'x'
         else:
-            self.width = WORD
             self._location_code = 'r'
     def __repr__(self):
         if self.is_xmm:
@@ -75,6 +82,11 @@
         else:
             return rx86.R.names[self.value]
 
+    def get_width(self):
+        if self.is_xmm:
+            return 8
+        return WORD
+
     def lowest8bits(self):
         assert not self.is_xmm
         return RegLoc(rx86.low_byte(self.value), False)
@@ -92,9 +104,11 @@
         else:
             return eax
 
-class ImmedLoc(AssemblerLocation):
+class ImmediateAssemblerLocation(AssemblerLocation):
     _immutable_ = True
-    width = WORD
+
+class ImmedLoc(ImmediateAssemblerLocation):
+    _immutable_ = True
     _location_code = 'i'
 
     def __init__(self, value):
@@ -105,6 +119,9 @@
     def getint(self):
         return self.value
 
+    def get_width(self):
+        return WORD
+
     def __repr__(self):
         return "ImmedLoc(%d)" % (self.value)
 
@@ -117,7 +134,6 @@
 class AddressLoc(AssemblerLocation):
     _immutable_ = True
 
-    width = WORD
     # The address is base_loc + (scaled_loc << scale) + static_offset
     def __init__(self, base_loc, scaled_loc, scale=0, static_offset=0):
         assert 0 <= scale < 4
@@ -146,6 +162,9 @@
         info = getattr(self, attr, '?')
         return '<AddressLoc %r: %s>' % (self._location_code, info)
 
+    def get_width(self):
+        return WORD
+
     def value_a(self):
         return self.loc_a
 
@@ -180,32 +199,34 @@
             raise AssertionError(self._location_code)
         return result
 
-class ConstFloatLoc(AssemblerLocation):
-    # XXX: We have to use this class instead of just AddressLoc because
-    # we want a width of 8  (... I think.  Check this!)
+class ConstFloatLoc(ImmediateAssemblerLocation):
     _immutable_ = True
-    width = 8
     _location_code = 'j'
 
     def __init__(self, address):
         self.value = address
 
+    def get_width(self):
+        return 8
+
     def __repr__(self):
         return '<ConstFloatLoc @%s>' % (self.value,)
 
 if IS_X86_32:
-    class FloatImmedLoc(AssemblerLocation):
+    class FloatImmedLoc(ImmediateAssemblerLocation):
         # This stands for an immediate float.  It cannot be directly used in
         # any assembler instruction.  Instead, it is meant to be decomposed
         # in two 32-bit halves.  On 64-bit, FloatImmedLoc() is a function
         # instead; see below.
         _immutable_ = True
-        width = 8
         _location_code = '#'     # don't use me
 
         def __init__(self, floatstorage):
             self.aslonglong = floatstorage
 
+        def get_width(self):
+            return 8
+
         def low_part(self):
             return intmask(self.aslonglong)
 
diff --git a/pypy/jit/backend/x86/runner.py b/pypy/jit/backend/x86/runner.py
--- a/pypy/jit/backend/x86/runner.py
+++ b/pypy/jit/backend/x86/runner.py
@@ -3,6 +3,7 @@
 from pypy.rpython.lltypesystem.lloperation import llop
 from pypy.rpython.llinterp import LLInterpreter
 from pypy.rlib.objectmodel import we_are_translated
+from pypy.jit.codewriter import longlong
 from pypy.jit.metainterp import history, compile
 from pypy.jit.backend.x86.assembler import Assembler386
 from pypy.jit.backend.x86.arch import FORCE_INDEX_OFS
@@ -21,7 +22,6 @@
     supports_floats = True
     supports_singlefloats = True
 
-    BOOTSTRAP_TP = lltype.FuncType([], lltype.Signed)
     dont_keepalive_stuff = False # for tests
     with_threads = False
 
@@ -91,15 +91,6 @@
         return self.assembler.assemble_bridge(faildescr, inputargs, operations,
                                               original_loop_token, log=log)
 
-    def set_future_value_int(self, index, intvalue):
-        self.assembler.fail_boxes_int.setitem(index, intvalue)
-
-    def set_future_value_float(self, index, floatvalue):
-        self.assembler.fail_boxes_float.setitem(index, floatvalue)
-
-    def set_future_value_ref(self, index, ptrvalue):
-        self.assembler.fail_boxes_ptr.setitem(index, ptrvalue)
-
     def get_latest_value_int(self, index):
         return self.assembler.fail_boxes_int.getitem(index)
 
@@ -122,27 +113,28 @@
         # the FORCE_TOKEN operation and this helper both return 'ebp'.
         return self.assembler.fail_ebp
 
-    def execute_token(self, executable_token):
-        addr = executable_token._x86_bootstrap_code
-        #llop.debug_print(lltype.Void, ">>>> Entering", addr)
-        func = rffi.cast(lltype.Ptr(self.BOOTSTRAP_TP), addr)
-        fail_index = self._execute_call(func)
-        #llop.debug_print(lltype.Void, "<<<< Back")
-        return self.get_fail_descr_from_number(fail_index)
-
-    def _execute_call(self, func):
-        # help flow objspace
-        prev_interpreter = None
-        if not self.translate_support_code:
-            prev_interpreter = LLInterpreter.current_interpreter
-            LLInterpreter.current_interpreter = self.debug_ll_interpreter
-        res = 0
-        try:
-            res = func()
-        finally:
+    def make_execute_token(self, *ARGS):
+        FUNCPTR = lltype.Ptr(lltype.FuncType(ARGS, lltype.Signed))
+        #
+        def execute_token(executable_token, *args):
+            clt = executable_token.compiled_loop_token
+            assert len(args) == clt._debug_nbargs
+            #
+            addr = executable_token._x86_function_addr
+            func = rffi.cast(FUNCPTR, addr)
+            #llop.debug_print(lltype.Void, ">>>> Entering", addr)
+            prev_interpreter = None   # help flow space
             if not self.translate_support_code:
-                LLInterpreter.current_interpreter = prev_interpreter
-        return res
+                prev_interpreter = LLInterpreter.current_interpreter
+                LLInterpreter.current_interpreter = self.debug_ll_interpreter
+            try:
+                fail_index = func(*args)
+            finally:
+                if not self.translate_support_code:
+                    LLInterpreter.current_interpreter = prev_interpreter
+            #llop.debug_print(lltype.Void, "<<<< Back")
+            return self.get_fail_descr_from_number(fail_index)
+        return execute_token
 
     def cast_ptr_to_int(x):
         adr = llmemory.cast_ptr_to_adr(x)
@@ -215,14 +207,3 @@
         super(CPU_X86_64, self).__init__(*args, **kwargs)
 
 CPU = CPU386
-
-# silence warnings
-##history.LoopToken._x86_param_depth = 0
-##history.LoopToken._x86_arglocs = (None, None)
-##history.LoopToken._x86_frame_depth = 0
-##history.LoopToken._x86_bootstrap_code = 0
-##history.LoopToken._x86_direct_bootstrap_code = 0
-##history.LoopToken._x86_loop_code = 0
-##history.LoopToken._x86_debug_checksum = 0
-##compile.AbstractFailDescr._x86_current_depths = (0, 0)
-##compile.AbstractFailDescr._x86_adr_jump_offset = 0
diff --git a/pypy/jit/backend/x86/test/test_assembler.py b/pypy/jit/backend/x86/test/test_assembler.py
--- a/pypy/jit/backend/x86/test/test_assembler.py
+++ b/pypy/jit/backend/x86/test/test_assembler.py
@@ -46,12 +46,13 @@
             xmm2]
     assert len(failargs) == len(locs)
     assembler.write_failure_recovery_description(mc, failargs, locs)
-    nums = [Assembler386.DESCR_INT   + 4*(16+0),
-            Assembler386.DESCR_REF   + 4*(16+1),
-            Assembler386.DESCR_FLOAT + 4*(16+10),
-            Assembler386.DESCR_INT   + 4*(16+100),
-            Assembler386.DESCR_REF   + 4*(16+101),
-            Assembler386.DESCR_FLOAT + 4*(16+110),
+    base = 8 + 8*IS_X86_64
+    nums = [Assembler386.DESCR_INT   + 4*(base+0),
+            Assembler386.DESCR_REF   + 4*(base+1),
+            Assembler386.DESCR_FLOAT + 4*(base+10),
+            Assembler386.DESCR_INT   + 4*(base+100),
+            Assembler386.DESCR_REF   + 4*(base+101),
+            Assembler386.DESCR_FLOAT + 4*(base+110),
             Assembler386.CODE_HOLE,
             Assembler386.CODE_HOLE,
             Assembler386.DESCR_INT   + 4*ebx.value,
diff --git a/pypy/jit/backend/x86/test/test_gc_integration.py b/pypy/jit/backend/x86/test/test_gc_integration.py
--- a/pypy/jit/backend/x86/test/test_gc_integration.py
+++ b/pypy/jit/backend/x86/test/test_gc_integration.py
@@ -4,7 +4,7 @@
 
 import py
 from pypy.jit.metainterp.history import BoxInt, ConstInt,\
-     BoxPtr, ConstPtr, TreeLoop
+     BoxPtr, ConstPtr, TreeLoop, TargetToken
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.codewriter import heaptracker
 from pypy.jit.codewriter.effectinfo import EffectInfo
@@ -113,6 +113,8 @@
     descr0 = cpu.fielddescrof(S, 'int')
     ptr0 = struct_ref
 
+    targettoken = TargetToken()
+
     namespace = locals().copy()
 
     def test_basic(self):
@@ -136,6 +138,7 @@
     def test_bug_0(self):
         ops = '''
         [i0, i1, i2, i3, i4, i5, i6, i7, i8]
+        label(i0, i1, i2, i3, i4, i5, i6, i7, i8, descr=targettoken)
         guard_value(i2, 1) [i2, i3, i4, i5, i6, i7, i0, i1, i8]
         guard_class(i4, 138998336) [i4, i5, i6, i7, i0, i1, i8]
         i11 = getfield_gc(i4, descr=descr0)
@@ -163,7 +166,7 @@
         guard_false(i32) [i4, i6, i7, i0, i1, i24]
         i33 = getfield_gc(i0, descr=descr0)
         guard_value(i33, ConstPtr(ptr0)) [i4, i6, i7, i0, i1, i33, i24]
-        jump(i0, i1, 1, 17, i4, ConstPtr(ptr0), i6, i7, i24)
+        jump(i0, i1, 1, 17, i4, ConstPtr(ptr0), i6, i7, i24, descr=targettoken)
         '''
         self.interpret(ops, [0, 0, 0, 0, 0, 0, 0, 0, 0], run=False)
 
diff --git a/pypy/jit/backend/x86/test/test_jump.py b/pypy/jit/backend/x86/test/test_jump.py
--- a/pypy/jit/backend/x86/test/test_jump.py
+++ b/pypy/jit/backend/x86/test/test_jump.py
@@ -71,6 +71,18 @@
                              ('mov', eax, s24),
                              ('mov', s12, edi)]
 
+def test_no_tmp_reg():
+    assembler = MockAssembler()
+    s8 = frame_pos(0, INT)
+    s12 = frame_pos(13, INT)
+    s20 = frame_pos(20, INT)
+    s24 = frame_pos(221, INT)
+    remap_frame_layout(assembler, [s8, eax, s12], [s20, s24, edi], None)
+    assert assembler.ops == [('push', s8),
+                             ('pop', s20),
+                             ('mov', eax, s24),
+                             ('mov', s12, edi)]
+
 def test_reordering():
     assembler = MockAssembler()
     s8 = frame_pos(8, INT)
diff --git a/pypy/jit/backend/x86/test/test_recompilation.py b/pypy/jit/backend/x86/test/test_recompilation.py
--- a/pypy/jit/backend/x86/test/test_recompilation.py
+++ b/pypy/jit/backend/x86/test/test_recompilation.py
@@ -5,10 +5,11 @@
     def test_compile_bridge_not_deeper(self):
         ops = '''
         [i0]
+        label(i0, descr=targettoken)
         i1 = int_add(i0, 1)
         i2 = int_lt(i1, 20)
         guard_true(i2, descr=fdescr1) [i1]
-        jump(i1)
+        jump(i1, descr=targettoken)
         '''
         loop = self.interpret(ops, [0])
         assert self.getint(0) == 20
@@ -18,22 +19,22 @@
         finish(i3, descr=fdescr2)
         '''
         bridge = self.attach_bridge(ops, loop, -2)
-        self.cpu.set_future_value_int(0, 0)
-        fail = self.run(loop)
+        fail = self.run(loop, 0)
         assert fail.identifier == 2
         assert self.getint(0) == 21
     
     def test_compile_bridge_deeper(self):
         ops = '''
         [i0]
+        label(i0, descr=targettoken)
         i1 = int_add(i0, 1)
         i2 = int_lt(i1, 20)
         guard_true(i2, descr=fdescr1) [i1]
-        jump(i1)
+        jump(i1, descr=targettoken)
         '''
         loop = self.interpret(ops, [0])
-        previous = loop.token._x86_frame_depth
-        assert loop.token._x86_param_depth == 0
+        previous = loop._jitcelltoken.compiled_loop_token.frame_depth
+        assert loop._jitcelltoken.compiled_loop_token.param_depth == 0
         assert self.getint(0) == 20
         ops = '''
         [i1]
@@ -42,19 +43,18 @@
         i5 = int_add(i4, 1)
         i6 = int_add(i5, 1)
         i7 = int_add(i5, i4)
+        force_spill(i5)
         i8 = int_add(i7, 1)
         i9 = int_add(i8, 1)
         finish(i3, i4, i5, i6, i7, i8, i9, descr=fdescr2)
         '''
         bridge = self.attach_bridge(ops, loop, -2)
-        descr = loop.operations[2].getdescr()
+        descr = loop.operations[3].getdescr()
         new = descr._x86_bridge_frame_depth
-        assert descr._x86_bridge_param_depth == 0        
-        # XXX: Maybe add enough ops to force stack on 64-bit as well?
-        if IS_X86_32:
-            assert new > previous
-        self.cpu.set_future_value_int(0, 0)
-        fail = self.run(loop)
+        assert descr._x86_bridge_param_depth == 0
+        # the force_spill() forces the stack to grow
+        assert new > previous
+        fail = self.run(loop, 0)
         assert fail.identifier == 2
         assert self.getint(0) == 21
         assert self.getint(1) == 22
@@ -64,28 +64,30 @@
     def test_bridge_jump_to_other_loop(self):
         loop = self.interpret('''
         [i0, i10, i11, i12, i13, i14, i15, i16]
+        label(i0, i10, i11, i12, i13, i14, i15, i16, descr=targettoken)
         i1 = int_add(i0, 1)
         i2 = int_lt(i1, 20)
         guard_true(i2, descr=fdescr1) [i1]
-        jump(i1, i10, i11, i12, i13, i14, i15, i16)
-        ''', [0])
+        jump(i1, i10, i11, i12, i13, i14, i15, i16, descr=targettoken)
+        ''', [0, 0, 0, 0, 0, 0, 0, 0])
         other_loop = self.interpret('''
-        [i3]
+        [i3, i10, i11, i12, i13, i14, i15, i16]
+        label(i3, descr=targettoken2)
         guard_false(i3, descr=fdescr2) [i3]
-        jump(i3)
-        ''', [1])
+        jump(i3, descr=targettoken2)
+        ''', [1, 0, 0, 0, 0, 0, 0, 0])
         ops = '''
         [i3]
-        jump(i3, 1, 2, 3, 4, 5, 6, 7, descr=looptoken)
+        jump(i3, 1, 2, 3, 4, 5, 6, 7, descr=targettoken)
         '''
-        bridge = self.attach_bridge(ops, other_loop, 0, looptoken=loop.token)
-        self.cpu.set_future_value_int(0, 1)
-        fail = self.run(other_loop)
+        bridge = self.attach_bridge(ops, other_loop, 1)
+        fail = self.run(other_loop, 1, 0, 0, 0, 0, 0, 0, 0)
         assert fail.identifier == 1
 
     def test_bridge_jumps_to_self_deeper(self):
         loop = self.interpret('''
         [i0, i1, i2, i31, i32, i33]
+        label(i0, i1, i2, i31, i32, i33, descr=targettoken)
         i98 = same_as(0)
         i99 = same_as(1)
         i30 = int_add(i1, i2)
@@ -94,8 +96,8 @@
         guard_false(i4) [i98, i3]
         i5 = int_lt(i3, 20)
         guard_true(i5) [i99, i3]
-        jump(i3, i30, 1, i30, i30, i30)
-        ''', [0])
+        jump(i3, i30, 1, i30, i30, i30, descr=targettoken)
+        ''', [0, 0, 0, 0, 0, 0])
         assert self.getint(0) == 0
         assert self.getint(1) == 1
         ops = '''
@@ -104,28 +106,28 @@
         i8 = int_add(i3, 1)
         i6 = int_add(i8, i10)
         i7 = int_add(i3, i6)
+        force_spill(i6)
+        force_spill(i7)
+        force_spill(i8)
         i12 = int_add(i7, i8)
         i11 = int_add(i12, i6)
-        jump(i3, i12, i11, i10, i6, i7, descr=looptoken)
+        jump(i3, i12, i11, i10, i6, i7, descr=targettoken)
         '''
-        bridge = self.attach_bridge(ops, loop, 5, looptoken=loop.token)
-        guard_op = loop.operations[5]
-        loop_frame_depth = loop.token._x86_frame_depth
-        assert loop.token._x86_param_depth == 0
-        # XXX: Maybe add enough ops to force stack on 64-bit as well?
-        if IS_X86_32:
-            assert guard_op.getdescr()._x86_bridge_frame_depth > loop_frame_depth
+        loop_frame_depth = loop._jitcelltoken.compiled_loop_token.frame_depth
+        bridge = self.attach_bridge(ops, loop, 6)
+        guard_op = loop.operations[6]
+        assert loop._jitcelltoken.compiled_loop_token.param_depth == 0
+        # the force_spill() forces the stack to grow
+        assert guard_op.getdescr()._x86_bridge_frame_depth > loop_frame_depth
         assert guard_op.getdescr()._x86_bridge_param_depth == 0
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 0)
-        self.cpu.set_future_value_int(2, 0)
-        self.run(loop)
+        self.run(loop, 0, 0, 0, 0, 0, 0)
         assert self.getint(0) == 1
         assert self.getint(1) == 20
 
     def test_bridge_jumps_to_self_shallower(self):
         loop = self.interpret('''
         [i0, i1, i2]
+        label(i0, i1, i2, descr=targettoken)
         i98 = same_as(0)
         i99 = same_as(1)
         i3 = int_add(i0, 1)
@@ -133,19 +135,16 @@
         guard_false(i4) [i98, i3]
         i5 = int_lt(i3, 20)
         guard_true(i5) [i99, i3]
-        jump(i3, i1, i2)
-        ''', [0])
+        jump(i3, i1, i2, descr=targettoken)
+        ''', [0, 0, 0])
         assert self.getint(0) == 0
         assert self.getint(1) == 1
         ops = '''
         [i97, i3]
-        jump(i3, 0, 1, descr=looptoken)
+        jump(i3, 0, 1, descr=targettoken)
         '''
-        bridge = self.attach_bridge(ops, loop, 4, looptoken=loop.token)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 0)
-        self.cpu.set_future_value_int(2, 0)
-        self.run(loop)
+        bridge = self.attach_bridge(ops, loop, 5)
+        self.run(loop, 0, 0, 0)
         assert self.getint(0) == 1
         assert self.getint(1) == 20
         
diff --git a/pypy/jit/backend/x86/test/test_regalloc.py b/pypy/jit/backend/x86/test/test_regalloc.py
--- a/pypy/jit/backend/x86/test/test_regalloc.py
+++ b/pypy/jit/backend/x86/test/test_regalloc.py
@@ -4,7 +4,7 @@
 
 import py
 from pypy.jit.metainterp.history import BoxInt, ConstInt,\
-     BoxPtr, ConstPtr, LoopToken, BasicFailDescr
+     BoxPtr, ConstPtr, BasicFailDescr, JitCellToken, TargetToken
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.backend.llsupport.descr import GcCache
 from pypy.jit.backend.detect_cpu import getcpuclass
@@ -96,10 +96,16 @@
     raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
                                         EffectInfo.MOST_GENERAL)
 
+    targettoken = TargetToken()
+    targettoken2 = TargetToken()
     fdescr1 = BasicFailDescr(1)
     fdescr2 = BasicFailDescr(2)
     fdescr3 = BasicFailDescr(3)
 
+    def setup_method(self, meth):
+        self.targettoken._x86_loop_code = 0
+        self.targettoken2._x86_loop_code = 0
+
     def f1(x):
         return x+1
 
@@ -134,26 +140,29 @@
 
     def interpret(self, ops, args, run=True):
         loop = self.parse(ops)
-        self.cpu.compile_loop(loop.inputargs, loop.operations, loop.token)
-        for i, arg in enumerate(args):
+        looptoken = JitCellToken()
+        self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
+        arguments = []
+        for arg in args:
             if isinstance(arg, int):
-                self.cpu.set_future_value_int(i, arg)
+                arguments.append(arg)
             elif isinstance(arg, float):
                 arg = longlong.getfloatstorage(arg)
-                self.cpu.set_future_value_float(i, arg)
+                arguments.append(arg)
             else:
                 assert isinstance(lltype.typeOf(arg), lltype.Ptr)
                 llgcref = lltype.cast_opaque_ptr(llmemory.GCREF, arg)
-                self.cpu.set_future_value_ref(i, llgcref)
+                arguments.append(llgcref)
+        loop._jitcelltoken = looptoken
         if run:
-            self.cpu.execute_token(loop.token)
+            self.cpu.execute_token(looptoken, *arguments)
         return loop
 
     def prepare_loop(self, ops):
         loop = self.parse(ops)
         regalloc = RegAlloc(self.cpu.assembler, False)
         regalloc.prepare_loop(loop.inputargs, loop.operations,
-                              loop.token, [])
+                              loop.original_jitcell_token, [])
         return regalloc
 
     def getint(self, index):
@@ -174,10 +183,7 @@
         gcref = self.cpu.get_latest_value_ref(index)
         return lltype.cast_opaque_ptr(T, gcref)
 
-    def attach_bridge(self, ops, loop, guard_op_index, looptoken=None, **kwds):
-        if looptoken is not None:
-            self.namespace = self.namespace.copy()
-            self.namespace['looptoken'] = looptoken
+    def attach_bridge(self, ops, loop, guard_op_index, **kwds):
         guard_op = loop.operations[guard_op_index]
         assert guard_op.is_guard()
         bridge = self.parse(ops, **kwds)
@@ -185,20 +191,21 @@
                 [box.type for box in guard_op.getfailargs()])
         faildescr = guard_op.getdescr()
         self.cpu.compile_bridge(faildescr, bridge.inputargs, bridge.operations,
-                                loop.token)
+                                loop._jitcelltoken)
         return bridge
 
-    def run(self, loop):
-        return self.cpu.execute_token(loop.token)
+    def run(self, loop, *arguments):
+        return self.cpu.execute_token(loop._jitcelltoken, *arguments)
 
 class TestRegallocSimple(BaseTestRegalloc):
     def test_simple_loop(self):
         ops = '''
         [i0]
+        label(i0, descr=targettoken)
         i1 = int_add(i0, 1)
         i2 = int_lt(i1, 20)
         guard_true(i2) [i1]
-        jump(i1)
+        jump(i1, descr=targettoken)
         '''
         self.interpret(ops, [0])
         assert self.getint(0) == 20
@@ -206,29 +213,30 @@
     def test_two_loops_and_a_bridge(self):
         ops = '''
         [i0, i1, i2, i3]
+        label(i0, i1, i2, i3, descr=targettoken)
         i4 = int_add(i0, 1)
         i5 = int_lt(i4, 20)
         guard_true(i5) [i4, i1, i2, i3]
-        jump(i4, i1, i2, i3)
+        jump(i4, i1, i2, i3, descr=targettoken)
         '''
         loop = self.interpret(ops, [0, 0, 0, 0])
         ops2 = '''
-        [i5]
+        [i5, i6, i7, i8]
+        label(i5, descr=targettoken2)
         i1 = int_add(i5, 1)
         i3 = int_add(i1, 1)
         i4 = int_add(i3, 1)
         i2 = int_lt(i4, 30)
         guard_true(i2) [i4]
-        jump(i4)
+        jump(i4, descr=targettoken2)
         '''
-        loop2 = self.interpret(ops2, [0])
+        loop2 = self.interpret(ops2, [0, 0, 0, 0])
         bridge_ops = '''
         [i4]
-        jump(i4, i4, i4, i4, descr=looptoken)
+        jump(i4, i4, i4, i4, descr=targettoken)
         '''
-        bridge = self.attach_bridge(bridge_ops, loop2, 4, looptoken=loop.token)
-        self.cpu.set_future_value_int(0, 0)
-        self.run(loop2)
+        bridge = self.attach_bridge(bridge_ops, loop2, 5)
+        self.run(loop2, 0, 0, 0, 0)
         assert self.getint(0) == 31
         assert self.getint(1) == 30
         assert self.getint(2) == 30
@@ -237,10 +245,11 @@
     def test_pointer_arg(self):
         ops = '''
         [i0, p0]
+        label(i0, p0, descr=targettoken)
         i1 = int_add(i0, 1)
         i2 = int_lt(i1, 10)
         guard_true(i2) [p0]
-        jump(i1, p0)
+        jump(i1, p0, descr=targettoken)
         '''
         S = lltype.GcStruct('S')
         ptr = lltype.malloc(S)
@@ -265,8 +274,7 @@
         loop = self.interpret(ops, [0])
         assert self.getint(0) == 1
         bridge = self.attach_bridge(bridge_ops, loop, 2)
-        self.cpu.set_future_value_int(0, 0)
-        self.run(loop)
+        self.run(loop, 0)
         assert self.getint(0) == 1
 
     def test_inputarg_unused(self):
@@ -292,9 +300,7 @@
         assert self.getint(0) == 0
         assert self.getint(1) == 10
         bridge = self.attach_bridge(bridge_ops, loop, 0)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 10)
-        self.run(loop)
+        self.run(loop, 0, 10)
         assert self.getint(0) == 0
         assert self.getint(1) == 10
 
@@ -311,17 +317,16 @@
         finish(1, 2)
         '''
         self.attach_bridge(bridge_ops, loop, 0)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.set_future_value_int(1, 1)
-        self.run(loop)
+        self.run(loop, 0, 1)
 
     def test_spill_for_constant(self):
         ops = '''
         [i0, i1, i2, i3]
+        label(i0, i1, i2, i3, descr=targettoken)
         i4 = int_add(3, i1)
         i5 = int_lt(i4, 30)
         guard_true(i5) [i0, i4, i2, i3]
-        jump(1, i4, 3, 4)
+        jump(1, i4, 3, 4, descr=targettoken)
         '''
         self.interpret(ops, [0, 0, 0, 0])
         assert self.getints(4) == [1, 30, 3, 4]
@@ -329,31 +334,34 @@
     def test_spill_for_constant_lshift(self):
         ops = '''
         [i0, i2, i1, i3]
+        label(i0, i2, i1, i3, descr=targettoken)
         i4 = int_lshift(1, i1)
         i5 = int_add(1, i1)
         i6 = int_lt(i5, 30)
         guard_true(i6) [i4, i5, i2, i3]
-        jump(i4, 3, i5, 4)
+        jump(i4, 3, i5, 4, descr=targettoken)
         '''
         self.interpret(ops, [0, 0, 0, 0])
         assert self.getints(4) == [1<<29, 30, 3, 4]
         ops = '''
         [i0, i1, i2, i3]
+        label(i0, i1, i2, i3, descr=targettoken)
         i4 = int_lshift(1, i1)
         i5 = int_add(1, i1)
         i6 = int_lt(i5, 30)
         guard_true(i6) [i4, i5, i2, i3]
-        jump(i4, i5, 3, 4)
+        jump(i4, i5, 3, 4, descr=targettoken)
         '''
         self.interpret(ops, [0, 0, 0, 0])
         assert self.getints(4) == [1<<29, 30, 3, 4]
         ops = '''
         [i0, i3, i1, i2]
+        label(i0, i3, i1, i2, descr=targettoken)
         i4 = int_lshift(1, i1)
         i5 = int_add(1, i1)
         i6 = int_lt(i5, 30)
         guard_true(i6) [i4, i5, i2, i3]
-        jump(i4, 4, i5, 3)
+        jump(i4, 4, i5, 3, descr=targettoken)
         '''
         self.interpret(ops, [0, 0, 0, 0])
         assert self.getints(4) == [1<<29, 30, 3, 4]
@@ -361,11 +369,12 @@
     def test_result_selected_reg_via_neg(self):
         ops = '''
         [i0, i1, i2, i3]
+        label(i0, i1, i2, i3, descr=targettoken)
         i6 = int_neg(i2)
         i7 = int_add(1, i1)
         i4 = int_lt(i7, 10)
         guard_true(i4) [i0, i6, i7]
-        jump(1, i7, i2, i6)
+        jump(1, i7, i2, i6, descr=targettoken)
         '''
         self.interpret(ops, [0, 0, 3, 0])
         assert self.getints(3) == [1, -3, 10]
@@ -373,11 +382,12 @@
     def test_compare_memory_result_survives(self):
         ops = '''
         [i0, i1, i2, i3]
+        label(i0, i1, i2, i3, descr=targettoken)
         i4 = int_lt(i0, i1)
         i5 = int_add(i3, 1)
         i6 = int_lt(i5, 30)
         guard_true(i6) [i4]
-        jump(i0, i1, i4, i5)
+        jump(i0, i1, i4, i5, descr=targettoken)
         '''
         self.interpret(ops, [0, 10, 0, 0])
         assert self.getint(0) == 1
@@ -385,12 +395,13 @@
     def test_jump_different_args(self):
         ops = '''
         [i0, i15, i16, i18, i1, i2, i3]
+        label(i0, i15, i16, i18, i1, i2, i3, descr=targettoken)
         i4 = int_add(i3, 1)
         i5 = int_lt(i4, 20)
         guard_true(i5) [i2, i1]
-        jump(i0, i18, i15, i16, i2, i1, i4)
+        jump(i0, i18, i15, i16, i2, i1, i4, descr=targettoken)
         '''
-        self.interpret(ops, [0, 1, 2, 3])
+        self.interpret(ops, [0, 1, 2, 3, 0, 0, 0])
 
     def test_op_result_unused(self):
         ops = '''
@@ -424,9 +435,7 @@
         finish(i0, i1, i2, i3, i4, i5, i6, i7, i8)
         '''
         self.attach_bridge(bridge_ops, loop, 1)
-        for i in range(9):
-            self.cpu.set_future_value_int(i, i)
-        self.run(loop)
+        self.run(loop, 0, 1, 2, 3, 4, 5, 6, 7, 8)
         assert self.getints(9) == range(9)
 
     def test_loopargs(self):
@@ -436,27 +445,13 @@
         jump(i4, i1, i2, i3)
         """
         regalloc = self.prepare_loop(ops)
-        assert len(regalloc.rm.reg_bindings) == 2
+        if IS_X86_64:
+            assert len(regalloc.rm.reg_bindings) == 4
+            assert len(regalloc.fm.bindings) == 0
+        else:
+            assert len(regalloc.rm.reg_bindings) == 0
+            assert len(regalloc.fm.bindings) == 4
 
-    def test_loopargs_2(self):
-        ops = """
-        [i0, i1, i2, i3]
-        i4 = int_add(i0, i1)
-        finish(i4, i1, i2, i3)
-        """
-        regalloc = self.prepare_loop(ops)
-        assert len(regalloc.rm.reg_bindings) == 2
-
-    def test_loopargs_3(self):
-        ops = """
-        [i0, i1, i2, i3]
-        i4 = int_add(i0, i1)
-        guard_true(i4) [i0, i1, i2, i3, i4]
-        jump(i4, i1, i2, i3)
-        """
-        regalloc = self.prepare_loop(ops)
-        assert len(regalloc.rm.reg_bindings) == 2
-    
 
 class TestRegallocCompOps(BaseTestRegalloc):
     
@@ -474,6 +469,7 @@
 class TestRegallocMoreRegisters(BaseTestRegalloc):
 
     cpu = BaseTestRegalloc.cpu
+    targettoken = TargetToken()
 
     S = lltype.GcStruct('S', ('field', lltype.Char))
     fielddescr = cpu.fielddescrof(S, 'field')
@@ -546,6 +542,7 @@
     def test_division_optimized(self):
         ops = '''
         [i7, i6]
+        label(i7, i6, descr=targettoken)
         i18 = int_floordiv(i7, i6)
         i19 = int_xor(i7, i6)
         i21 = int_lt(i19, 0)
@@ -553,7 +550,7 @@
         i23 = int_is_true(i22)
         i24 = int_eq(i6, 4)
         guard_false(i24) [i18]
-        jump(i18, i6)
+        jump(i18, i6, descr=targettoken)
         '''
         self.interpret(ops, [10, 4])
         assert self.getint(0) == 2
@@ -622,9 +619,10 @@
         i10 = call(ConstClass(f1ptr), i0, descr=f1_calldescr)
         finish(i10, i1, i2, i3, i4, i5, i6, i7, i8, i9)
         '''
-        loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9, 9])
-        assert self.getints(11) == [5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9]
-        assert loop.token._x86_param_depth == self.expected_param_depth(1)
+        loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9])
+        assert self.getints(10) == [5, 7, 9, 9, 9, 9, 9, 9, 9, 9]
+        clt = loop._jitcelltoken.compiled_loop_token
+        assert clt.param_depth == self.expected_param_depth(1)
 
     def test_two_calls(self):
         ops = '''
@@ -633,9 +631,10 @@
         i11 = call(ConstClass(f2ptr), i10, i1, descr=f2_calldescr)        
         finish(i11, i1,  i2, i3, i4, i5, i6, i7, i8, i9)
         '''
-        loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9, 9])
-        assert self.getints(11) == [5*7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9]
-        assert loop.token._x86_param_depth == self.expected_param_depth(2)
+        loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9])
+        assert self.getints(10) == [5*7, 7, 9, 9, 9, 9, 9, 9, 9, 9]
+        clt = loop._jitcelltoken.compiled_loop_token
+        assert clt.param_depth == self.expected_param_depth(2)
 
     def test_call_many_arguments(self):
         # NB: The first and last arguments in the call are constants. This
@@ -648,7 +647,8 @@
         '''
         loop = self.interpret(ops, [2, 3, 4, 5, 6, 7, 8, 9])
         assert self.getint(0) == 55
-        assert loop.token._x86_param_depth == self.expected_param_depth(10)
+        clt = loop._jitcelltoken.compiled_loop_token
+        assert clt.param_depth == self.expected_param_depth(10)
 
     def test_bridge_calls_1(self):
         ops = '''
@@ -668,9 +668,7 @@
 
         assert loop.operations[-2].getdescr()._x86_bridge_param_depth == self.expected_param_depth(2)
 
-        self.cpu.set_future_value_int(0, 4)
-        self.cpu.set_future_value_int(1, 7)        
-        self.run(loop)
+        self.run(loop, 4, 7)
         assert self.getint(0) == 5*7
 
     def test_bridge_calls_2(self):
@@ -691,8 +689,6 @@
 
         assert loop.operations[-2].getdescr()._x86_bridge_param_depth == self.expected_param_depth(2)
 
-        self.cpu.set_future_value_int(0, 4)
-        self.cpu.set_future_value_int(1, 7)        
-        self.run(loop)
+        self.run(loop, 4, 7)
         assert self.getint(0) == 29
 
diff --git a/pypy/jit/backend/x86/test/test_regalloc2.py b/pypy/jit/backend/x86/test/test_regalloc2.py
--- a/pypy/jit/backend/x86/test/test_regalloc2.py
+++ b/pypy/jit/backend/x86/test/test_regalloc2.py
@@ -1,6 +1,6 @@
 import py
 from pypy.jit.metainterp.history import ResOperation, BoxInt, ConstInt,\
-     BoxPtr, ConstPtr, BasicFailDescr, LoopToken
+     BoxPtr, ConstPtr, BasicFailDescr, JitCellToken
 from pypy.jit.metainterp.resoperation import rop
 from pypy.jit.backend.detect_cpu import getcpuclass
 from pypy.jit.backend.x86.arch import WORD
@@ -20,10 +20,9 @@
         ]
     cpu = CPU(None, None)
     cpu.setup_once()
-    looptoken = LoopToken()
+    looptoken = JitCellToken()
     cpu.compile_loop(inputargs, operations, looptoken)
-    cpu.set_future_value_int(0, 9)
-    cpu.execute_token(looptoken)
+    cpu.execute_token(looptoken, 9)
     assert cpu.get_latest_value_int(0) == (9 >> 3)
     assert cpu.get_latest_value_int(1) == (~18)
 
@@ -43,10 +42,9 @@
             ]
     cpu = CPU(None, None)
     cpu.setup_once()
-    looptoken = LoopToken()
+    looptoken = JitCellToken()
     cpu.compile_loop(inputargs, operations, looptoken)
-    cpu.set_future_value_int(0, -10)
-    cpu.execute_token(looptoken)
+    cpu.execute_token(looptoken, -10)
     assert cpu.get_latest_value_int(0) == 0
     assert cpu.get_latest_value_int(1) == -1000
     assert cpu.get_latest_value_int(2) == 1
@@ -140,19 +138,9 @@
             ]
     cpu = CPU(None, None)
     cpu.setup_once()
-    looptoken = LoopToken()
+    looptoken = JitCellToken()
     cpu.compile_loop(inputargs, operations, looptoken)
-    cpu.set_future_value_int(0, -13)
-    cpu.set_future_value_int(1, 10)
-    cpu.set_future_value_int(2, 10)
-    cpu.set_future_value_int(3, 8)
-    cpu.set_future_value_int(4, -8)
-    cpu.set_future_value_int(5, -16)
-    cpu.set_future_value_int(6, -18)
-    cpu.set_future_value_int(7, 46)
-    cpu.set_future_value_int(8, -12)
-    cpu.set_future_value_int(9, 26)
-    cpu.execute_token(looptoken)
+    cpu.execute_token(looptoken, -13, 10, 10, 8, -8, -16, -18, 46, -12, 26)
     assert cpu.get_latest_value_int(0) == 0
     assert cpu.get_latest_value_int(1) == 0
     assert cpu.get_latest_value_int(2) == 0
@@ -255,19 +243,9 @@
             ]
     cpu = CPU(None, None)
     cpu.setup_once()
-    looptoken = LoopToken()
+    looptoken = JitCellToken()
     cpu.compile_loop(inputargs, operations, looptoken)
-    cpu.set_future_value_int(0, 17)
-    cpu.set_future_value_int(1, -20)
-    cpu.set_future_value_int(2, -6)
-    cpu.set_future_value_int(3, 6)
-    cpu.set_future_value_int(4, 1)
-    cpu.set_future_value_int(5, 13)
-    cpu.set_future_value_int(6, 13)
-    cpu.set_future_value_int(7, 9)
-    cpu.set_future_value_int(8, 49)
-    cpu.set_future_value_int(9, 8)
-    cpu.execute_token(looptoken)
+    cpu.execute_token(looptoken, 17, -20, -6, 6, 1, 13, 13, 9, 49, 8)
     assert cpu.get_latest_value_int(0) == 0
     assert cpu.get_latest_value_int(1) == 8
     assert cpu.get_latest_value_int(2) == 1
diff --git a/pypy/jit/backend/x86/test/test_runner.py b/pypy/jit/backend/x86/test/test_runner.py
--- a/pypy/jit/backend/x86/test/test_runner.py
+++ b/pypy/jit/backend/x86/test/test_runner.py
@@ -1,9 +1,10 @@
 import py
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi, rstr, rclass
 from pypy.rpython.annlowlevel import llhelper
-from pypy.jit.metainterp.history import ResOperation, LoopToken
+from pypy.jit.metainterp.history import ResOperation, TargetToken, JitCellToken
 from pypy.jit.metainterp.history import (BoxInt, BoxPtr, ConstInt, ConstFloat,
-                                         ConstPtr, Box, BoxFloat, BasicFailDescr)
+                                         ConstPtr, Box, BoxFloat,
+                                         BasicFailDescr)
 from pypy.jit.backend.detect_cpu import getcpuclass
 from pypy.jit.backend.x86.arch import WORD
 from pypy.jit.backend.x86.rx86 import fits_in_32bits
@@ -279,13 +280,9 @@
                                      descr=BasicFailDescr()),
                         ]
                     ops[-2].setfailargs([i1])
-                    looptoken = LoopToken()
+                    looptoken = JitCellToken()
                     self.cpu.compile_loop([b], ops, looptoken)
-                    if op == rop.INT_IS_TRUE:
-                        self.cpu.set_future_value_int(0, b.value)
-                    else:
-                        self.cpu.set_future_value_ref(0, b.value)
-                    self.cpu.execute_token(looptoken)
+                    self.cpu.execute_token(looptoken, b.value)
                     result = self.cpu.get_latest_value_int(0)
                     if guard == rop.GUARD_FALSE:
                         assert result == execute(self.cpu, None,
@@ -329,11 +326,10 @@
                         ]
                     ops[-2].setfailargs([i1])
                     inputargs = [i for i in (a, b) if isinstance(i, Box)]
-                    looptoken = LoopToken()
+                    looptoken = JitCellToken()
                     self.cpu.compile_loop(inputargs, ops, looptoken)
-                    for i, box in enumerate(inputargs):
-                        self.cpu.set_future_value_int(i, box.value)
-                    self.cpu.execute_token(looptoken)
+                    inputvalues = [box.value for box in inputargs]
+                    self.cpu.execute_token(looptoken, *inputvalues)
                     result = self.cpu.get_latest_value_int(0)
                     expected = execute(self.cpu, None, op, None, a, b).value
                     if guard == rop.GUARD_FALSE:
@@ -353,9 +349,10 @@
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
+        targettoken = TargetToken()
         faildescr1 = BasicFailDescr(1)
         faildescr2 = BasicFailDescr(2)
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
         looptoken.number = 17
         class FakeString(object):
             def __init__(self, val):
@@ -365,14 +362,15 @@
                 return self.val
 
         operations = [
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken),
             ResOperation(rop.DEBUG_MERGE_POINT, [FakeString("hello"), 0], None),
             ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
             ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
             ResOperation(rop.GUARD_TRUE, [i2], None, descr=faildescr1),
-            ResOperation(rop.JUMP, [i1], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken),
             ]
         inputargs = [i0]
-        operations[3].setfailargs([i1])
+        operations[-2].setfailargs([i1])
         self.cpu.compile_loop(inputargs, operations, looptoken)
         name, loopaddress, loopsize = agent.functions[0]
         assert name == "Loop # 17: hello (loop counter 0)"
@@ -385,7 +383,7 @@
             ResOperation(rop.INT_LE, [i1b, ConstInt(19)], i3),
             ResOperation(rop.GUARD_TRUE, [i3], None, descr=faildescr2),
             ResOperation(rop.DEBUG_MERGE_POINT, [FakeString("bye"), 0], None),
-            ResOperation(rop.JUMP, [i1b], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1b], None, descr=targettoken),
         ]
         bridge[1].setfailargs([i1b])
 
@@ -397,8 +395,7 @@
         assert address >= loopaddress + loopsize
         assert size >= 10 # randomish number
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 20
@@ -408,11 +405,13 @@
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
-        looptoken = LoopToken()
+        looptoken = JitCellToken()
+        targettoken = TargetToken()
         operations = [
+            ResOperation(rop.LABEL, [i0], None, descr=targettoken),
             ResOperation(rop.INT_ADD, [i0, ConstInt(1)], i1),
             ResOperation(rop.INT_LE, [i1, ConstInt(9)], i2),
-            ResOperation(rop.JUMP, [i1], None, descr=looptoken),
+            ResOperation(rop.JUMP, [i1], None, descr=targettoken),
             ]
         inputargs = [i0]
         debug._log = dlog = debug.DebugLog()
@@ -499,12 +498,10 @@
             ops[3].setfailargs([])
             ops[5].setfailargs([])
             ops[7].setfailargs([])
-            looptoken = LoopToken()
+            looptoken = JitCellToken()
             self.cpu.compile_loop([i1, i2], ops, looptoken)
 
-            self.cpu.set_future_value_int(0, 123450)
-            self.cpu.set_future_value_int(1, 123408)
-            fail = self.cpu.execute_token(looptoken)
+            fail = self.cpu.execute_token(looptoken, 123450, 123408)
             assert fail.identifier == 0
             assert self.cpu.get_latest_value_int(0) == 42
             assert self.cpu.get_latest_value_int(1) == 42
@@ -523,19 +520,20 @@
 
         loop = """
         [i0]
+        label(i0, descr=targettoken)
         debug_merge_point('xyz', 0)
         i1 = int_add(i0, 1)
         i2 = int_ge(i1, 10)
         guard_false(i2) []
-        jump(i1)
+        jump(i1, descr=targettoken)
         """
-        ops = parse(loop)
+        ops = parse(loop, namespace={'targettoken': TargetToken()})
         debug._log = dlog = debug.DebugLog()
         try:
             self.cpu.assembler.set_debug(True)
-            self.cpu.compile_loop(ops.inputargs, ops.operations, ops.token)
-            self.cpu.set_future_value_int(0, 0)
-            self.cpu.execute_token(ops.token)
+            looptoken = JitCellToken()
+            self.cpu.compile_loop(ops.inputargs, ops.operations, looptoken)
+            self.cpu.execute_token(looptoken, 0)
             # check debugging info
             struct = self.cpu.assembler.loop_run_counters[0]
             assert struct.i == 10
@@ -547,16 +545,17 @@
     def test_debugger_checksum(self):
         loop = """
         [i0]
+        label(i0, descr=targettoken)
         debug_merge_point('xyz', 0)
         i1 = int_add(i0, 1)
         i2 = int_ge(i1, 10)
         guard_false(i2) []
-        jump(i1)
+        jump(i1, descr=targettoken)
         """
-        ops = parse(loop)
+        ops = parse(loop, namespace={'targettoken': TargetToken()})
         self.cpu.assembler.set_debug(True)
-        self.cpu.compile_loop(ops.inputargs, ops.operations, ops.token)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.execute_token(ops.token)
-        assert ops.token._x86_debug_checksum == sum([op.getopnum()
+        looptoken = JitCellToken()
+        self.cpu.compile_loop(ops.inputargs, ops.operations, looptoken)
+        self.cpu.execute_token(looptoken, 0)
+        assert looptoken._x86_debug_checksum == sum([op.getopnum()
                                                      for op in ops.operations])
diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -498,27 +498,29 @@
         else:
             log.WARNING('ignoring hint %r at %r' % (hints, self.graph))
 
+    def _rewrite_raw_malloc(self, op, name, args):
+        d = op.args[1].value.copy()
+        d.pop('flavor')
+        add_memory_pressure = d.pop('add_memory_pressure', False)
+        zero = d.pop('zero', False)
+        track_allocation = d.pop('track_allocation', True)
+        if d:
+            raise UnsupportedMallocFlags(d)
+        TYPE = op.args[0].value
+        if zero:
+            name += '_zero'
+        if add_memory_pressure:
+            name += '_add_memory_pressure'
+        if not track_allocation:
+            name += '_no_track_allocation'
+        return self._do_builtin_call(op, name, args,
+                                     extra = (TYPE,),
+                                     extrakey = TYPE)
+
     def rewrite_op_malloc_varsize(self, op):
         if op.args[1].value['flavor'] == 'raw':
-            d = op.args[1].value.copy()
-            d.pop('flavor')
-            add_memory_pressure = d.pop('add_memory_pressure', False)
-            zero = d.pop('zero', False)
-            track_allocation = d.pop('track_allocation', True)
-            if d:
-                raise UnsupportedMallocFlags(d)
-            ARRAY = op.args[0].value
-            name = 'raw_malloc'
-            if zero:
-                name += '_zero'
-            if add_memory_pressure:
-                name += '_add_memory_pressure'
-            if not track_allocation:
-                name += '_no_track_allocation'
-            return self._do_builtin_call(op, name,
-                                         [op.args[2]],
-                                         extra = (ARRAY,),
-                                         extrakey = ARRAY)
+            return self._rewrite_raw_malloc(op, 'raw_malloc_varsize',
+                                            [op.args[2]])
         if op.args[0].value == rstr.STR:
             return SpaceOperation('newstr', [op.args[2]], op.result)
         elif op.args[0].value == rstr.UNICODE:
@@ -531,11 +533,18 @@
                                   op.result)
 
     def rewrite_op_free(self, op):
-        flags = op.args[1].value
-        assert flags['flavor'] == 'raw'
-        ARRAY = op.args[0].concretetype.TO
-        return self._do_builtin_call(op, 'raw_free', [op.args[0]],
-                                     extra = (ARRAY,), extrakey = ARRAY)
+        d = op.args[1].value.copy()
+        assert d['flavor'] == 'raw'
+        d.pop('flavor')
+        track_allocation = d.pop('track_allocation', True)
+        if d:
+            raise UnsupportedMallocFlags(d)
+        STRUCT = op.args[0].concretetype.TO
+        name = 'raw_free'
+        if not track_allocation:
+            name += '_no_track_allocation'
+        return self._do_builtin_call(op, name, [op.args[0]],
+                                     extra = (STRUCT,), extrakey = STRUCT)
 
     def rewrite_op_getarrayitem(self, op):
         ARRAY = op.args[0].concretetype.TO
@@ -736,6 +745,9 @@
         return [op0, op1]
 
     def rewrite_op_malloc(self, op):
+        if op.args[1].value['flavor'] == 'raw':
+            return self._rewrite_raw_malloc(op, 'raw_malloc_fixedsize', [])
+        #
         assert op.args[1].value == {'flavor': 'gc'}
         STRUCT = op.args[0].value
         vtable = heaptracker.get_vtable_for_gcstruct(self.cpu, STRUCT)
diff --git a/pypy/jit/codewriter/support.py b/pypy/jit/codewriter/support.py
--- a/pypy/jit/codewriter/support.py
+++ b/pypy/jit/codewriter/support.py
@@ -599,26 +599,75 @@
             return p
         return _ll_0_alloc_with_del
 
-    def build_raw_malloc_builder(zero=False, add_memory_pressure=False, track_allocation=True):
-        def build_ll_1_raw_malloc(ARRAY):
-            def _ll_1_raw_malloc(n):
-                return lltype.malloc(ARRAY, n, flavor='raw', zero=zero, add_memory_pressure=add_memory_pressure)
-            return _ll_1_raw_malloc
-        return build_ll_1_raw_malloc
+    def build_raw_malloc_varsize_builder(zero=False,
+                                         add_memory_pressure=False,
+                                         track_allocation=True):
+        def build_ll_1_raw_malloc_varsize(ARRAY):
+            def _ll_1_raw_malloc_varsize(n):
+                return lltype.malloc(ARRAY, n, flavor='raw', zero=zero,
+                                     add_memory_pressure=add_memory_pressure,
+                                     track_allocation=track_allocation)
+            return _ll_1_raw_malloc_varsize
+        return build_ll_1_raw_malloc_varsize
 
-    build_ll_1_raw_malloc = build_raw_malloc_builder()
-    build_ll_1_raw_malloc_zero = build_raw_malloc_builder(zero=True)
-    build_ll_1_raw_malloc_zero_add_memory_pressure = build_raw_malloc_builder(zero=True, add_memory_pressure=True)
-    build_ll_1_raw_malloc_add_memory_pressure = build_raw_malloc_builder(add_memory_pressure=True)
-    build_ll_1_raw_malloc_no_track_allocation = build_raw_malloc_builder(track_allocation=False)
-    build_ll_1_raw_malloc_zero_no_track_allocation = build_raw_malloc_builder(zero=True, track_allocation=False)
-    build_ll_1_raw_malloc_zero_add_memory_pressure_no_track_allocation = build_raw_malloc_builder(zero=True, add_memory_pressure=True, track_allocation=False)
-    build_ll_1_raw_malloc_add_memory_pressure_no_track_allocation = build_raw_malloc_builder(add_memory_pressure=True, track_allocation=False)
+    build_ll_1_raw_malloc_varsize = (
+        build_raw_malloc_varsize_builder())
+    build_ll_1_raw_malloc_varsize_zero = (
+        build_raw_malloc_varsize_builder(zero=True))
+    build_ll_1_raw_malloc_varsize_zero_add_memory_pressure = (
+        build_raw_malloc_varsize_builder(zero=True, add_memory_pressure=True))
+    build_ll_1_raw_malloc_varsize_add_memory_pressure = (
+        build_raw_malloc_varsize_builder(add_memory_pressure=True))
+    build_ll_1_raw_malloc_varsize_no_track_allocation = (
+        build_raw_malloc_varsize_builder(track_allocation=False))
+    build_ll_1_raw_malloc_varsize_zero_no_track_allocation = (
+        build_raw_malloc_varsize_builder(zero=True, track_allocation=False))
+    build_ll_1_raw_malloc_varsize_zero_add_memory_pressure_no_track_allocation = (
+        build_raw_malloc_varsize_builder(zero=True, add_memory_pressure=True, track_allocation=False))
+    build_ll_1_raw_malloc_varsize_add_memory_pressure_no_track_allocation = (
+        build_raw_malloc_varsize_builder(add_memory_pressure=True, track_allocation=False))
 
-    def build_ll_1_raw_free(ARRAY):
-        def _ll_1_raw_free(p):
-            lltype.free(p, flavor='raw')
-        return _ll_1_raw_free
+    def build_raw_malloc_fixedsize_builder(zero=False,
+                                           add_memory_pressure=False,
+                                           track_allocation=True):
+        def build_ll_0_raw_malloc_fixedsize(STRUCT):
+            def _ll_0_raw_malloc_fixedsize():
+                return lltype.malloc(STRUCT, flavor='raw', zero=zero,
+                                     add_memory_pressure=add_memory_pressure,
+                                     track_allocation=track_allocation)
+            return _ll_0_raw_malloc_fixedsize
+        return build_ll_0_raw_malloc_fixedsize
+
+    build_ll_0_raw_malloc_fixedsize = (
+        build_raw_malloc_fixedsize_builder())
+    build_ll_0_raw_malloc_fixedsize_zero = (
+        build_raw_malloc_fixedsize_builder(zero=True))
+    build_ll_0_raw_malloc_fixedsize_zero_add_memory_pressure = (
+        build_raw_malloc_fixedsize_builder(zero=True, add_memory_pressure=True))
+    build_ll_0_raw_malloc_fixedsize_add_memory_pressure = (
+        build_raw_malloc_fixedsize_builder(add_memory_pressure=True))
+    build_ll_0_raw_malloc_fixedsize_no_track_allocation = (
+        build_raw_malloc_fixedsize_builder(track_allocation=False))
+    build_ll_0_raw_malloc_fixedsize_zero_no_track_allocation = (
+        build_raw_malloc_fixedsize_builder(zero=True, track_allocation=False))
+    build_ll_0_raw_malloc_fixedsize_zero_add_memory_pressure_no_track_allocation = (
+        build_raw_malloc_fixedsize_builder(zero=True, add_memory_pressure=True, track_allocation=False))
+    build_ll_0_raw_malloc_fixedsize_add_memory_pressure_no_track_allocation = (
+        build_raw_malloc_fixedsize_builder(add_memory_pressure=True, track_allocation=False))
+
+    def build_raw_free_builder(track_allocation=True):
+        def build_ll_1_raw_free(ARRAY):
+            def _ll_1_raw_free(p):
+                lltype.free(p, flavor='raw',
+                            track_allocation=track_allocation)
+            return _ll_1_raw_free
+        return build_ll_1_raw_free
+
+    build_ll_1_raw_free = (
+        build_raw_free_builder())
+    build_ll_1_raw_free_no_track_allocation = (
+        build_raw_free_builder(track_allocation=False))
+
 
 class OOtypeHelpers:
 
diff --git a/pypy/jit/codewriter/test/test_codewriter.py b/pypy/jit/codewriter/test/test_codewriter.py
--- a/pypy/jit/codewriter/test/test_codewriter.py
+++ b/pypy/jit/codewriter/test/test_codewriter.py
@@ -217,7 +217,7 @@
     cw.make_jitcodes(verbose=True)
     #
     s = jitdriver_sd.mainjitcode.dump()
-    assert 'residual_call_ir_i $<* fn _ll_1_raw_malloc__Signed>' in s
+    assert 'residual_call_ir_i $<* fn _ll_1_raw_malloc_varsize__Signed>' in s
     assert 'setarrayitem_raw_i' in s
     assert 'getarrayitem_raw_i' in s
     assert 'residual_call_ir_v $<* fn _ll_1_raw_free__arrayPtr>' in s
diff --git a/pypy/jit/codewriter/test/test_jtransform.py b/pypy/jit/codewriter/test/test_jtransform.py
--- a/pypy/jit/codewriter/test/test_jtransform.py
+++ b/pypy/jit/codewriter/test/test_jtransform.py
@@ -550,7 +550,7 @@
     tr = Transformer(FakeCPU(), FakeResidualCallControl())
     op0, op1 = tr.rewrite_operation(op)
     assert op0.opname == 'residual_call_ir_i'
-    assert op0.args[0].value == 'raw_malloc'    # pseudo-function as a str
+    assert op0.args[0].value == 'raw_malloc_varsize' # pseudo-function as a str
     assert op1.opname == '-live-'
     assert op1.args == []
 
@@ -564,7 +564,7 @@
     tr = Transformer(FakeCPU(), FakeResidualCallControl())
     op0, op1 = tr.rewrite_operation(op)
     assert op0.opname == 'residual_call_ir_i'
-    assert op0.args[0].value == 'raw_malloc_zero'    # pseudo-function as a str
+    assert op0.args[0].value == 'raw_malloc_varsize_zero'  # pseudo-fn as a str
     assert op1.opname == '-live-'
     assert op1.args == []
 
@@ -578,6 +578,35 @@
     tr = Transformer(FakeCPU(), FakeResidualCallControl())
     py.test.raises(UnsupportedMallocFlags, tr.rewrite_operation, op)
 
+def test_raw_malloc_fixedsize():
+    S = lltype.Struct('dummy', ('x', lltype.Signed))
+    v = varoftype(lltype.Ptr(S))
+    flags = Constant({'flavor': 'raw', 'zero': True}, lltype.Void)
+    op = SpaceOperation('malloc', [Constant(S, lltype.Void), flags], v)
+    tr = Transformer(FakeCPU(), FakeResidualCallControl())
+    op0, op1 = tr.rewrite_operation(op)
+    assert op0.opname == 'residual_call_r_i'
+    assert op0.args[0].value == 'raw_malloc_fixedsize_zero' #pseudo-fn as a str
+    assert op1.opname == '-live-'
+    assert op1.args == []
+
+def test_raw_free():
+    S = lltype.Struct('dummy', ('x', lltype.Signed))
+    for flag in [True, False]:
+        flags = Constant({'flavor': 'raw', 'track_allocation': flag},
+                         lltype.Void)
+        op = SpaceOperation('free', [varoftype(lltype.Ptr(S)), flags],
+                            varoftype(lltype.Void))
+        tr = Transformer(FakeCPU(), FakeResidualCallControl())
+        op0, op1 = tr.rewrite_operation(op)
+        assert op0.opname == 'residual_call_ir_v'
+        if flag:
+            pseudo_op_name = 'raw_free'
+        else:
+            pseudo_op_name = 'raw_free_no_track_allocation'
+        assert op0.args[0].value == pseudo_op_name   # pseudo-function as a str
+        assert op1.opname == '-live-'
+
 def test_rename_on_links():
     v1 = Variable()
     v2 = Variable(); v2.concretetype = llmemory.Address
diff --git a/pypy/jit/metainterp/blackhole.py b/pypy/jit/metainterp/blackhole.py
--- a/pypy/jit/metainterp/blackhole.py
+++ b/pypy/jit/metainterp/blackhole.py
@@ -1504,7 +1504,6 @@
                         all_virtuals=None):
     from pypy.jit.metainterp.resume import blackhole_from_resumedata
     #debug_start('jit-blackhole')
-    metainterp_sd.profiler.start_blackhole()
     blackholeinterp = blackhole_from_resumedata(
         metainterp_sd.blackholeinterpbuilder,
         jitdriver_sd,
@@ -1518,10 +1517,9 @@
     current_exc = blackholeinterp._prepare_resume_from_failure(
         resumedescr.guard_opnum, dont_change_position)
 
-    try:
-        _run_forever(blackholeinterp, current_exc)
-    finally:
-        metainterp_sd.profiler.end_blackhole()
+    #try:
+    _run_forever(blackholeinterp, current_exc)
+    #finally:
         #debug_stop('jit-blackhole')
 
 def convert_and_run_from_pyjitpl(metainterp, raising_exception=False):
@@ -1529,7 +1527,6 @@
     # 'metainterp.framestack'.
     #debug_start('jit-blackhole')
     metainterp_sd = metainterp.staticdata
-    metainterp_sd.profiler.start_blackhole()
     nextbh = None
     for frame in metainterp.framestack:
         curbh = metainterp_sd.blackholeinterpbuilder.acquire_interp()
@@ -1546,8 +1543,7 @@
         firstbh.exception_last_value = current_exc
         current_exc = lltype.nullptr(rclass.OBJECTPTR.TO)
     #
-    try:
-        _run_forever(firstbh, current_exc)
-    finally:
-        metainterp_sd.profiler.end_blackhole()
+    #try:
+    _run_forever(firstbh, current_exc)
+    #finally:
         #debug_stop('jit-blackhole')
diff --git a/pypy/jit/metainterp/compile.py b/pypy/jit/metainterp/compile.py
--- a/pypy/jit/metainterp/compile.py
+++ b/pypy/jit/metainterp/compile.py
@@ -9,12 +9,13 @@
 from pypy.tool.sourcetools import func_with_new_name
 
 from pypy.jit.metainterp.resoperation import ResOperation, rop, get_deep_immutable_oplist
-from pypy.jit.metainterp.history import TreeLoop, Box, History, LoopToken
+from pypy.jit.metainterp.history import TreeLoop, Box, History, JitCellToken, TargetToken
 from pypy.jit.metainterp.history import AbstractFailDescr, BoxInt
-from pypy.jit.metainterp.history import BoxPtr, BoxObj, BoxFloat, Const
+from pypy.jit.metainterp.history import BoxPtr, BoxObj, BoxFloat, Const, ConstInt
 from pypy.jit.metainterp import history
 from pypy.jit.metainterp.typesystem import llhelper, oohelper
 from pypy.jit.metainterp.optimize import InvalidLoop
+from pypy.jit.metainterp.inliner import Inliner
 from pypy.jit.metainterp.resume import NUMBERING, PENDINGFIELDSP
 from pypy.jit.codewriter import heaptracker, longlong
 
@@ -23,7 +24,7 @@
     from pypy.jit.metainterp.jitprof import ABORT_BRIDGE
     raise SwitchToBlackhole(ABORT_BRIDGE)
 
-def show_loop(metainterp_sd, loop=None, error=None):
+def show_procedures(metainterp_sd, procedure=None, error=None):
     # debugging
     if option.view or option.viewloops:
         if error:
@@ -32,11 +33,12 @@
                 errmsg += ': ' + str(error)
         else:
             errmsg = None
-        if loop is None: # or type(loop) is TerminatingLoop:
-            extraloops = []
+        if procedure is None:
+            extraprocedures = []
         else:
-            extraloops = [loop]
-        metainterp_sd.stats.view(errmsg=errmsg, extraloops=extraloops)
+            extraprocedures = [procedure]
+        metainterp_sd.stats.view(errmsg=errmsg,
+                                 extraprocedures=extraprocedures)
 
 def create_empty_loop(metainterp, name_prefix=''):
     name = metainterp.staticdata.stats.name_for_new_loop()
@@ -45,131 +47,261 @@
     return loop
 
 
-def make_loop_token(nb_args, jitdriver_sd):
-    loop_token = LoopToken()
-    loop_token.outermost_jitdriver_sd = jitdriver_sd
-    return loop_token
+def make_jitcell_token(jitdriver_sd):
+    jitcell_token = JitCellToken()
+    jitcell_token.outermost_jitdriver_sd = jitdriver_sd
+    return jitcell_token
 
 def record_loop_or_bridge(metainterp_sd, loop):
     """Do post-backend recordings and cleanups on 'loop'.
     """
-    # get the original loop token (corresponding to 'loop', or if that is
-    # a bridge, to the loop that this bridge belongs to)
-    looptoken = loop.token
-    assert looptoken is not None
+    # get the original jitcell token corresponding to the jitcell from which
+    # this trace starts
+    original_jitcell_token = loop.original_jitcell_token
+    assert original_jitcell_token is not None
     if metainterp_sd.warmrunnerdesc is not None:    # for tests
-        assert looptoken.generation > 0     # has been registered with memmgr
-    wref = weakref.ref(looptoken)
+        assert original_jitcell_token.generation > 0     # has been registered with memmgr
+    wref = weakref.ref(original_jitcell_token)
     for op in loop.operations:
         descr = op.getdescr()
         if isinstance(descr, ResumeDescr):
             descr.wref_original_loop_token = wref   # stick it there
             n = descr.index
             if n >= 0:       # we also record the resumedescr number
-                looptoken.compiled_loop_token.record_faildescr_index(n)
-        elif isinstance(descr, LoopToken):
-            # for a JUMP or a CALL_ASSEMBLER: record it as a potential jump.
+                original_jitcell_token.compiled_loop_token.record_faildescr_index(n)
+        elif isinstance(descr, JitCellToken):
+            # for a CALL_ASSEMBLER: record it as a potential jump.
+            if descr is not original_jitcell_token:
+                original_jitcell_token.record_jump_to(descr)
+            descr.exported_state = None
+            op._descr = None    # clear reference, mostly for tests
+        elif isinstance(descr, TargetToken):
+            # for a JUMP: record it as a potential jump.
             # (the following test is not enough to prevent more complicated
             # cases of cycles, but at least it helps in simple tests of
             # test_memgr.py)
-            if descr is not looptoken:
-                looptoken.record_jump_to(descr)
-            op._descr = None    # clear reference, mostly for tests
+            if descr.original_jitcell_token is not original_jitcell_token:
+                assert descr.original_jitcell_token is not None
+                original_jitcell_token.record_jump_to(descr.original_jitcell_token)
+            # exported_state is cleared by optimizeopt when the short preamble
+            # is constructed. If that did not happen, the label should not
+            # show up in a trace that will be used
+            assert descr.exported_state is None 
             if not we_are_translated():
-                op._jumptarget_number = descr.number
+                op._descr_wref = weakref.ref(op._descr)
+            op._descr = None    # clear reference to prevent the history.Stats
+                                # from keeping the loop alive during tests
     # record this looptoken on the QuasiImmut used in the code
     if loop.quasi_immutable_deps is not None:
         for qmut in loop.quasi_immutable_deps:
             qmut.register_loop_token(wref)
         # XXX maybe we should clear the dictionary here
     # mostly for tests: make sure we don't keep a reference to the LoopToken
-    loop.token = None
+    loop.original_jitcell_token = None
     if not we_are_translated():
-        loop._looptoken_number = looptoken.number
+        loop._looptoken_number = original_jitcell_token.number
 
 # ____________________________________________________________
 
-def compile_new_loop(metainterp, old_loop_tokens, greenkey, start,
-                     start_resumedescr, full_preamble_needed=True):
-    """Try to compile a new loop by closing the current history back
+def compile_loop(metainterp, greenkey, start,
+                 inputargs, jumpargs,
+                 start_resumedescr, full_preamble_needed=True):
+    """Try to compile a new procedure by closing the current history back
     to the first operation.
     """
-    from pypy.jit.metainterp.optimize import optimize_loop
+    from pypy.jit.metainterp.optimizeopt import optimize_trace
 
     history = metainterp.history
-    loop = create_empty_loop(metainterp)
-    loop.inputargs = history.inputargs[:]
+    metainterp_sd = metainterp.staticdata
+    jitdriver_sd = metainterp.jitdriver_sd
+
+    if False:
+        part = partial_trace
+        assert False
+        procedur_token = metainterp.get_procedure_token(greenkey)
+        assert procedure_token
+        all_target_tokens = []
+    else:
+        jitcell_token = make_jitcell_token(jitdriver_sd)
+        part = create_empty_loop(metainterp)
+        part.inputargs = inputargs[:]
+        h_ops = history.operations
+        part.start_resumedescr = start_resumedescr
+        part.operations = [ResOperation(rop.LABEL, inputargs, None, descr=TargetToken(jitcell_token))] + \
+                          [h_ops[i].clone() for i in range(start, len(h_ops))] + \
+                          [ResOperation(rop.JUMP, jumpargs, None, descr=jitcell_token)]
+        try:
+            optimize_trace(metainterp_sd, part, jitdriver_sd.warmstate.enable_opts)
+        except InvalidLoop:
+            return None
+        target_token = part.operations[0].getdescr()
+        assert isinstance(target_token, TargetToken)
+        all_target_tokens = [target_token]
+
+    loop = create_empty_loop(metainterp)        
+    loop.inputargs = part.inputargs
+    loop.operations = part.operations
+    loop.quasi_immutable_deps = {}
+    if part.quasi_immutable_deps:
+        loop.quasi_immutable_deps.update(part.quasi_immutable_deps)
+    while part.operations[-1].getopnum() == rop.LABEL:
+        inliner = Inliner(inputargs, jumpargs)
+        part.quasi_immutable_deps = None
+        part.operations = [part.operations[-1]] + \
+                          [inliner.inline_op(h_ops[i]) for i in range(start, len(h_ops))] + \
+                          [ResOperation(rop.JUMP, [inliner.inline_arg(a) for a in jumpargs],
+                                        None, descr=jitcell_token)]
+        target_token = part.operations[0].getdescr()
+        assert isinstance(target_token, TargetToken)
+        all_target_tokens.append(target_token)
+        inputargs = jumpargs
+        jumpargs = part.operations[-1].getarglist()
+
+        try:
+            optimize_trace(metainterp_sd, part, jitdriver_sd.warmstate.enable_opts)
+        except InvalidLoop:
+            return None
+            
+        loop.operations = loop.operations[:-1] + part.operations
+        if part.quasi_immutable_deps:
+            loop.quasi_immutable_deps.update(part.quasi_immutable_deps)
+
+    if not loop.quasi_immutable_deps:
+        loop.quasi_immutable_deps = None
     for box in loop.inputargs:
         assert isinstance(box, Box)
-    # make a copy, because optimize_loop can mutate the ops and descrs
-    h_ops = history.operations
-    loop.operations = [h_ops[i].clone() for i in range(start, len(h_ops))]
+
+    loop.original_jitcell_token = jitcell_token
+    for label in all_target_tokens:
+        assert isinstance(label, TargetToken)
+        label.original_jitcell_token = jitcell_token
+        if label.virtual_state and label.short_preamble:
+            metainterp_sd.logger_ops.log_short_preamble([], label.short_preamble)
+    jitcell_token.target_tokens = all_target_tokens
+    send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop, "loop")
+    record_loop_or_bridge(metainterp_sd, loop)
+    return all_target_tokens[0]
+
+def compile_retrace(metainterp, greenkey, start,
+                    inputargs, jumpargs,
+                    start_resumedescr, partial_trace, resumekey):
+    """Try to compile a new procedure by closing the current history back
+    to the first operation.
+    """
+    from pypy.jit.metainterp.optimizeopt import optimize_trace
+
+    history = metainterp.history
     metainterp_sd = metainterp.staticdata
     jitdriver_sd = metainterp.jitdriver_sd
-    loop_token = make_loop_token(len(loop.inputargs), jitdriver_sd)
-    loop.token = loop_token
-    loop.operations[-1].setdescr(loop_token)     # patch the target of the JUMP
 
-    loop.preamble = create_empty_loop(metainterp, 'Preamble ')
-    loop.preamble.inputargs = loop.inputargs
-    loop.preamble.token = make_loop_token(len(loop.inputargs), jitdriver_sd)
-    loop.preamble.start_resumedescr = start_resumedescr
+    loop_jitcell_token = metainterp.get_procedure_token(greenkey)
+    assert loop_jitcell_token
+    assert partial_trace.operations[-1].getopnum() == rop.LABEL
 
+    part = create_empty_loop(metainterp)
+    part.inputargs = inputargs[:]
+    part.start_resumedescr = start_resumedescr
+    h_ops = history.operations
+
+    part.operations = [partial_trace.operations[-1]] + \
+                      [h_ops[i].clone() for i in range(start, len(h_ops))] + \
+                      [ResOperation(rop.JUMP, jumpargs, None, descr=loop_jitcell_token)]
+    label = part.operations[0]
+    orignial_label = label.clone()
+    assert label.getopnum() == rop.LABEL
     try:
-        old_loop_token = optimize_loop(metainterp_sd, old_loop_tokens, loop,
-                                       jitdriver_sd.warmstate.enable_opts)
+        optimize_trace(metainterp_sd, part, jitdriver_sd.warmstate.enable_opts)
     except InvalidLoop:
-        debug_print("compile_new_loop: got an InvalidLoop")
-        return None
-    if old_loop_token is not None:
-        metainterp.staticdata.log("reusing old loop")
-        return old_loop_token
+        #return None # XXX: Disabled for now
+        # Fall back on jumping to preamble
+        target_token = label.getdescr()
+        assert isinstance(target_token, TargetToken)
+        assert target_token.exported_state
+        part.operations = [orignial_label] + \
+                          [ResOperation(rop.JUMP, target_token.exported_state.jump_args,
+                                        None, descr=loop_jitcell_token)]
+        try:
+            optimize_trace(metainterp_sd, part, jitdriver_sd.warmstate.enable_opts,
+                           inline_short_preamble=False)
+            
+        except InvalidLoop:
+            return None
+    assert part.operations[-1].getopnum() != rop.LABEL
+    target_token = label.getdescr()
+    assert isinstance(target_token, TargetToken)
+    assert loop_jitcell_token.target_tokens
+    loop_jitcell_token.target_tokens.append(target_token)
 
-    if loop.preamble.operations is not None:
-        send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop,
-                             "loop")
-        record_loop_or_bridge(metainterp_sd, loop)
-        token = loop.preamble.token
-        if full_preamble_needed:
-            send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd,
-                                 loop.preamble, "entry bridge")
-            insert_loop_token(old_loop_tokens, loop.preamble.token)
-            jitdriver_sd.warmstate.attach_unoptimized_bridge_from_interp(
-                greenkey, loop.preamble.token)
-            record_loop_or_bridge(metainterp_sd, loop.preamble)
-        elif token.short_preamble:
-            short = token.short_preamble[-1]
-            metainterp_sd.logger_ops.log_short_preamble(short.inputargs,
-                                                        short.operations)
-        return token
-    else:
-        send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop,
-                             "loop")
-        insert_loop_token(old_loop_tokens, loop_token)
-        jitdriver_sd.warmstate.attach_unoptimized_bridge_from_interp(
-            greenkey, loop.token)
-        record_loop_or_bridge(metainterp_sd, loop)
-        return loop_token
+    loop = partial_trace
+    loop.operations = loop.operations[:-1] + part.operations
 
-def insert_loop_token(old_loop_tokens, loop_token):
-    # Find where in old_loop_tokens we should insert this new loop_token.
-    # The following algo means "as late as possible, but before another
-    # loop token that would be more general and so completely mask off
-    # the new loop_token".
-    # XXX do we still need a list?
-    old_loop_tokens.append(loop_token)
+    quasi_immutable_deps = {}
+    if loop.quasi_immutable_deps:
+        quasi_immutable_deps.update(loop.quasi_immutable_deps)
+    if part.quasi_immutable_deps:
+        quasi_immutable_deps.update(part.quasi_immutable_deps)
+    if quasi_immutable_deps:
+        loop.quasi_immutable_deps = quasi_immutable_deps
+
+    for box in loop.inputargs:
+        assert isinstance(box, Box)
+
+    target_token = loop.operations[-1].getdescr()
+    resumekey.compile_and_attach(metainterp, loop)
+    target_token = label.getdescr()
+    assert isinstance(target_token, TargetToken)
+    target_token.original_jitcell_token = loop.original_jitcell_token
+    record_loop_or_bridge(metainterp_sd, loop)
+    return target_token
+
+def patch_new_loop_to_load_virtualizable_fields(loop, jitdriver_sd):
+    vinfo = jitdriver_sd.virtualizable_info
+    extra_ops = []
+    inputargs = loop.inputargs
+    vable_box = inputargs[jitdriver_sd.index_of_virtualizable]
+    i = jitdriver_sd.num_red_args
+    loop.inputargs = inputargs[:i]
+    for descr in vinfo.static_field_descrs:
+        assert i < len(inputargs)
+        box = inputargs[i]
+        extra_ops.append(
+            ResOperation(rop.GETFIELD_GC, [vable_box], box, descr))
+        i += 1
+    arrayindex = 0
+    for descr in vinfo.array_field_descrs:
+        vable = vable_box.getref_base()
+        arraylen = vinfo.get_array_length(vable, arrayindex)
+        arraybox = BoxPtr()
+        extra_ops.append(
+            ResOperation(rop.GETFIELD_GC, [vable_box], arraybox, descr))
+        arraydescr = vinfo.array_descrs[arrayindex]
+        assert i + arraylen <= len(inputargs)
+        for index in range(arraylen):
+            box = inputargs[i]
+            extra_ops.append(
+                ResOperation(rop.GETARRAYITEM_GC,
+                             [arraybox, ConstInt(index)],
+                             box, descr=arraydescr))
+            i += 1
+        arrayindex += 1
+    assert i == len(inputargs)
+    loop.operations = extra_ops + loop.operations
 
 def send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop, type):
-    jitdriver_sd.on_compile(metainterp_sd.logger_ops, loop.token,
+    vinfo = jitdriver_sd.virtualizable_info
+    if vinfo is not None:
+        patch_new_loop_to_load_virtualizable_fields(loop, jitdriver_sd)
+
+    original_jitcell_token = loop.original_jitcell_token
+    jitdriver_sd.on_compile(metainterp_sd.logger_ops, original_jitcell_token,
                             loop.operations, type, greenkey)
     loopname = jitdriver_sd.warmstate.get_location_str(greenkey)
     globaldata = metainterp_sd.globaldata
-    loop_token = loop.token
-    loop_token.number = n = globaldata.loopnumbering
+    original_jitcell_token.number = n = globaldata.loopnumbering
     globaldata.loopnumbering += 1
 
     if not we_are_translated():
-        show_loop(metainterp_sd, loop)
+        show_procedures(metainterp_sd, loop)
         loop.check_consistency()
 
     operations = get_deep_immutable_oplist(loop.operations)
@@ -177,26 +309,19 @@
     debug_start("jit-backend")
     try:
         ops_offset = metainterp_sd.cpu.compile_loop(loop.inputargs, operations,
-                                                    loop.token, name=loopname)
+                                                    original_jitcell_token, name=loopname)
     finally:
         debug_stop("jit-backend")
     metainterp_sd.profiler.end_backend()
     metainterp_sd.stats.add_new_loop(loop)
     if not we_are_translated():
-        if type != "entry bridge":
-            metainterp_sd.stats.compiled()
-        else:
-            loop._ignore_during_counting = True
+        metainterp_sd.stats.compiled()
     metainterp_sd.log("compiled new " + type)
     #
     metainterp_sd.logger_ops.log_loop(loop.inputargs, loop.operations, n, type, ops_offset)
-    short = loop.token.short_preamble
-    if short:
-        metainterp_sd.logger_ops.log_short_preamble(short[-1].inputargs,
-                                                    short[-1].operations)
     #
     if metainterp_sd.warmrunnerdesc is not None:    # for tests
-        metainterp_sd.warmrunnerdesc.memory_manager.keep_loop_alive(loop.token)
+        metainterp_sd.warmrunnerdesc.memory_manager.keep_loop_alive(original_jitcell_token)
 
 def send_bridge_to_backend(jitdriver_sd, metainterp_sd, faildescr, inputargs,
                            operations, original_loop_token):
@@ -204,8 +329,9 @@
     jitdriver_sd.on_compile_bridge(metainterp_sd.logger_ops,
                                    original_loop_token, operations, n)
     if not we_are_translated():
-        show_loop(metainterp_sd)
-        TreeLoop.check_consistency_of(inputargs, operations)
+        show_procedures(metainterp_sd)
+        seen = dict.fromkeys(inputargs)
+        TreeLoop.check_consistency_of_branch(operations, seen)
     metainterp_sd.profiler.start_backend()
     operations = get_deep_immutable_oplist(operations)
     debug_start("jit-backend")
@@ -221,9 +347,9 @@
     #
     metainterp_sd.logger_ops.log_bridge(inputargs, operations, n, ops_offset)
     #
-    if metainterp_sd.warmrunnerdesc is not None:    # for tests
-        metainterp_sd.warmrunnerdesc.memory_manager.keep_loop_alive(
-            original_loop_token)
+    #if metainterp_sd.warmrunnerdesc is not None:    # for tests
+    #    metainterp_sd.warmrunnerdesc.memory_manager.keep_loop_alive(
+    #        original_loop_token)
 
 # ____________________________________________________________
 
@@ -263,7 +389,7 @@
         raise metainterp_sd.ExitFrameWithExceptionRef(cpu, value)
 
 
-class TerminatingLoopToken(LoopToken):
+class TerminatingLoopToken(JitCellToken): # FIXME: kill?
     terminating = True
 
     def __init__(self, nargs, finishdescr):
@@ -346,14 +472,14 @@
         if self.must_compile(metainterp_sd, jitdriver_sd):
             self.start_compiling()
             try:
-                return self._trace_and_compile_from_bridge(metainterp_sd,
-                                                           jitdriver_sd)
+                self._trace_and_compile_from_bridge(metainterp_sd,
+                                                    jitdriver_sd)
             finally:
                 self.done_compiling()
         else:
             from pypy.jit.metainterp.blackhole import resume_in_blackhole
             resume_in_blackhole(metainterp_sd, jitdriver_sd, self)
-            assert 0, "unreachable"
+        assert 0, "unreachable"
 
     def _trace_and_compile_from_bridge(self, metainterp_sd, jitdriver_sd):
         # 'jitdriver_sd' corresponds to the outermost one, i.e. the one
@@ -362,7 +488,7 @@
         # jitdrivers.
         from pypy.jit.metainterp.pyjitpl import MetaInterp
         metainterp = MetaInterp(metainterp_sd, jitdriver_sd)
-        return metainterp.handle_guard_failure(self)
+        metainterp.handle_guard_failure(self)
     _trace_and_compile_from_bridge._dont_inline_ = True
 
     def must_compile(self, metainterp_sd, jitdriver_sd):
@@ -427,13 +553,13 @@
         # We managed to create a bridge.  Attach the new operations
         # to the corresponding guard_op and compile from there
         assert metainterp.resumekey_original_loop_token is not None
-        new_loop.token = metainterp.resumekey_original_loop_token
+        new_loop.original_jitcell_token = metainterp.resumekey_original_loop_token
         inputargs = metainterp.history.inputargs
         if not we_are_translated():
             self._debug_suboperations = new_loop.operations
         send_bridge_to_backend(metainterp.jitdriver_sd, metainterp.staticdata,
                                self, inputargs, new_loop.operations,
-                               new_loop.token)
+                               new_loop.original_jitcell_token)
 
     def copy_all_attributes_into(self, res):
         # XXX a bit ugly to have to list them all here
@@ -616,41 +742,32 @@
         metainterp_sd = metainterp.staticdata
         jitdriver_sd = metainterp.jitdriver_sd
         redargs = new_loop.inputargs
-        # We make a new LoopToken for this entry bridge, and stick it
-        # to every guard in the loop.
-        new_loop_token = make_loop_token(len(redargs), jitdriver_sd)
-        new_loop.token = new_loop_token
+        new_loop.original_jitcell_token = jitcell_token = make_jitcell_token(jitdriver_sd)
         send_loop_to_backend(self.original_greenkey, metainterp.jitdriver_sd,
                              metainterp_sd, new_loop, "entry bridge")
         # send the new_loop to warmspot.py, to be called directly the next time
-        jitdriver_sd.warmstate.attach_unoptimized_bridge_from_interp(
-            self.original_greenkey,
-            new_loop_token)
-        # store the new loop in compiled_merge_points_wref too
-        old_loop_tokens = metainterp.get_compiled_merge_points(
-            self.original_greenkey)
-        # it always goes at the end of the list, as it is the most
-        # general loop token
-        old_loop_tokens.append(new_loop_token)
-        metainterp.set_compiled_merge_points(self.original_greenkey,
-                                             old_loop_tokens)
+        jitdriver_sd.warmstate.attach_procedure_to_interp(
+            self.original_greenkey, jitcell_token)
+        metainterp_sd.stats.add_jitcell_token(jitcell_token)
 
 
-def compile_new_bridge(metainterp, old_loop_tokens, resumekey, retraced=False):
+def compile_trace(metainterp, resumekey, start_resumedescr=None):
     """Try to compile a new bridge leading from the beginning of the history
     to some existing place.
     """
-    from pypy.jit.metainterp.optimize import optimize_bridge
+    from pypy.jit.metainterp.optimizeopt import optimize_trace
     
     # The history contains new operations to attach as the code for the
     # failure of 'resumekey.guard_op'.
-    #
+    # 
     # Attempt to use optimize_bridge().  This may return None in case
     # it does not work -- i.e. none of the existing old_loop_tokens match.
-    new_loop = create_empty_loop(metainterp)
-    new_loop.inputargs = metainterp.history.inputargs[:]
+    new_trace = create_empty_loop(metainterp)
+    new_trace.inputargs = inputargs = metainterp.history.inputargs[:]
     # clone ops, as optimize_bridge can mutate the ops
-    new_loop.operations = [op.clone() for op in metainterp.history.operations]
+
+    new_trace.operations = [op.clone() for op in metainterp.history.operations]
+    new_trace.start_resumedescr = start_resumedescr
     metainterp_sd = metainterp.staticdata
     state = metainterp.jitdriver_sd.warmstate
     if isinstance(resumekey, ResumeAtPositionDescr):
@@ -658,38 +775,25 @@
     else:
         inline_short_preamble = True
     try:
-        target_loop_token = optimize_bridge(metainterp_sd, old_loop_tokens,
-                                            new_loop, state.enable_opts,
-                                            inline_short_preamble, retraced)
+        optimize_trace(metainterp_sd, new_trace, state.enable_opts, inline_short_preamble)
     except InvalidLoop:
         debug_print("compile_new_bridge: got an InvalidLoop")
         # XXX I am fairly convinced that optimize_bridge cannot actually raise
         # InvalidLoop
         debug_print('InvalidLoop in compile_new_bridge')
         return None
-    # Did it work?
-    if target_loop_token is not None:
-        # Yes, we managed to create a bridge.  Dispatch to resumekey to
+
+    if new_trace.operations[-1].getopnum() != rop.LABEL:
+        # We managed to create a bridge.  Dispatch to resumekey to
         # know exactly what we must do (ResumeGuardDescr/ResumeFromInterpDescr)
-        prepare_last_operation(new_loop, target_loop_token)
-        resumekey.compile_and_attach(metainterp, new_loop)
-        record_loop_or_bridge(metainterp_sd, new_loop)
-    return target_loop_token
-
-def prepare_last_operation(new_loop, target_loop_token):
-    op = new_loop.operations[-1]
-    if not isinstance(target_loop_token, TerminatingLoopToken):
-        # normal case
-        #op.setdescr(target_loop_token)     # patch the jump target
-        pass
+        target_token = new_trace.operations[-1].getdescr()
+        resumekey.compile_and_attach(metainterp, new_trace)
+        record_loop_or_bridge(metainterp_sd, new_trace)
+        return target_token
     else:
-        # The target_loop_token is a pseudo loop token,
-        # e.g. loop_tokens_done_with_this_frame_void[0]
-        # Replace the operation with the real operation we want, i.e. a FINISH
-        descr = target_loop_token.finishdescr
-        args = op.getarglist()
-        new_op = ResOperation(rop.FINISH, args, None, descr=descr)
-        new_loop.operations[-1] = new_op
+        metainterp.retrace_needed(new_trace)
+        return None
+        
 
 # ____________________________________________________________
 
@@ -700,21 +804,25 @@
         assert exception, "PropagateExceptionDescr: no exception??"
         raise metainterp_sd.ExitFrameWithExceptionRef(cpu, exception)
 
-def compile_tmp_callback(cpu, jitdriver_sd, greenboxes, redboxes,
+def compile_tmp_callback(cpu, jitdriver_sd, greenboxes, redargtypes,
                          memory_manager=None):
     """Make a LoopToken that corresponds to assembler code that just
     calls back the interpreter.  Used temporarily: a fully compiled
     version of the code may end up replacing it.
     """
-    # 'redboxes' is only used to know the types of red arguments.
-    inputargs = [box.clonebox() for box in redboxes]
-    loop_token = make_loop_token(len(inputargs), jitdriver_sd)
-    # 'nb_red_args' might be smaller than len(redboxes),
-    # because it doesn't include the virtualizable boxes.
+    jitcell_token = make_jitcell_token(jitdriver_sd)
     nb_red_args = jitdriver_sd.num_red_args
+    assert len(redargtypes) == nb_red_args
+    inputargs = []
+    for kind in redargtypes:
+        if   kind == history.INT:   box = BoxInt()
+        elif kind == history.REF:   box = BoxPtr()
+        elif kind == history.FLOAT: box = BoxFloat()
+        else: raise AssertionError
+        inputargs.append(box)
     k = jitdriver_sd.portal_runner_adr
     funcbox = history.ConstInt(heaptracker.adr2int(k))
-    callargs = [funcbox] + greenboxes + inputargs[:nb_red_args]
+    callargs = [funcbox] + greenboxes + inputargs
     #
     result_type = jitdriver_sd.result_type
     if result_type == history.INT:
@@ -741,7 +849,7 @@
         ]
     operations[1].setfailargs([])
     operations = get_deep_immutable_oplist(operations)
-    cpu.compile_loop(inputargs, operations, loop_token, log=False)
+    cpu.compile_loop(inputargs, operations, jitcell_token, log=False)
     if memory_manager is not None:    # for tests
-        memory_manager.keep_loop_alive(loop_token)
-    return loop_token
+        memory_manager.keep_loop_alive(jitcell_token)
+    return jitcell_token
diff --git a/pypy/jit/metainterp/executor.py b/pypy/jit/metainterp/executor.py
--- a/pypy/jit/metainterp/executor.py
+++ b/pypy/jit/metainterp/executor.py
@@ -344,6 +344,7 @@
                          rop.SETINTERIORFIELD_RAW,
                          rop.CALL_RELEASE_GIL,
                          rop.QUASIIMMUT_FIELD,
+                         rop.LABEL,
                          ):      # list of opcodes never executed by pyjitpl
                 continue
             raise AssertionError("missing %r" % (key,))
diff --git a/pypy/jit/metainterp/graphpage.py b/pypy/jit/metainterp/graphpage.py
--- a/pypy/jit/metainterp/graphpage.py
+++ b/pypy/jit/metainterp/graphpage.py
@@ -12,8 +12,9 @@
     def get_display_text(self):
         return None
 
-def display_loops(loops, errmsg=None, highlight_loops={}):
-    graphs = [(loop, highlight_loops.get(loop, 0)) for loop in loops]    
+def display_procedures(procedures, errmsg=None, highlight_procedures={}):
+    graphs = [(procedure, highlight_procedures.get(procedure, 0))
+              for procedure in procedures]
     for graph, highlight in graphs:
         for op in graph.get_operations():
             if is_interesting_guard(op):
@@ -25,18 +26,19 @@
 def is_interesting_guard(op):
     return hasattr(op.getdescr(), '_debug_suboperations')
 
+def getdescr(op):
+    if op._descr is not None:
+        return op._descr
+    if hasattr(op, '_descr_wref'):
+        return op._descr_wref()
+    return None
+
 
 class ResOpGraphPage(GraphPage):
 
     def compute(self, graphs, errmsg=None):
         resopgen = ResOpGen()
         for graph, highlight in graphs:
-            if getattr(graph, 'token', None) is not None:
-                resopgen.jumps_to_graphs[graph.token] = graph
-            if getattr(graph, '_looptoken_number', None) is not None:
-                resopgen.jumps_to_graphs[graph._looptoken_number] = graph
-        
-        for graph, highlight in graphs:
             resopgen.add_graph(graph, highlight)
         if errmsg:
             resopgen.set_errmsg(errmsg)
@@ -54,7 +56,7 @@
         self.block_starters = {}    # {graphindex: {set-of-operation-indices}}
         self.all_operations = {}
         self.errmsg = None
-        self.jumps_to_graphs = {}
+        self.target_tokens = {}
 
     def op_name(self, graphindex, opindex):
         return 'g%dop%d' % (graphindex, opindex)
@@ -73,16 +75,21 @@
         for graphindex in range(len(self.graphs)):
             self.block_starters[graphindex] = {0: True}
         for graphindex, graph in enumerate(self.graphs):
-            last_was_mergepoint = False
+            mergepointblock = None
             for i, op in enumerate(graph.get_operations()):
                 if is_interesting_guard(op):
                     self.mark_starter(graphindex, i+1)
                 if op.getopnum() == rop.DEBUG_MERGE_POINT:
-                    if not last_was_mergepoint:
-                        last_was_mergepoint = True
-                        self.mark_starter(graphindex, i)
+                    if mergepointblock is None:
+                        mergepointblock = i
+                elif op.getopnum() == rop.LABEL:
+                    self.mark_starter(graphindex, i)
+                    self.target_tokens[getdescr(op)] = (graphindex, i)
+                    mergepointblock = i
                 else:
-                    last_was_mergepoint = False
+                    if mergepointblock is not None:
+                        self.mark_starter(graphindex, mergepointblock)
+                        mergepointblock = None
 
     def set_errmsg(self, errmsg):
         self.errmsg = errmsg
@@ -172,24 +179,10 @@
                              (graphindex, opindex))
                 break
         if op.getopnum() == rop.JUMP:
-            tgt_g = -1
-            tgt = None
-            tgt_number = getattr(op, '_jumptarget_number', None)
-            if tgt_number is not None:
-                tgt = self.jumps_to_graphs.get(tgt_number)
-            else:
-                tgt_descr = op.getdescr()
-                if tgt_descr is None:
-                    tgt_g = graphindex
-                else:
-                    tgt = self.jumps_to_graphs.get(tgt_descr.number)
-                    if tgt is None:
-                        tgt = self.jumps_to_graphs.get(tgt_descr)
-            if tgt is not None:
-                tgt_g = self.graphs.index(tgt)
-            if tgt_g != -1:
+            tgt_descr = getdescr(op)
+            if tgt_descr is not None and tgt_descr in self.target_tokens:
                 self.genedge((graphindex, opstartindex),
-                             (tgt_g, 0),
+                             self.target_tokens[tgt_descr],
                              weight="0")
         lines.append("")
         label = "\\l".join(lines)
diff --git a/pypy/jit/metainterp/history.py b/pypy/jit/metainterp/history.py
--- a/pypy/jit/metainterp/history.py
+++ b/pypy/jit/metainterp/history.py
@@ -11,6 +11,7 @@
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.codewriter import heaptracker, longlong
 from pypy.rlib.objectmodel import compute_identity_hash
+import weakref
 
 # ____________________________________________________________
 
@@ -124,9 +125,6 @@
     def sort_key(self):
         raise NotImplementedError
 
-    def set_future_value(self, cpu, j):
-        raise NotImplementedError
-
     def nonnull(self):
         raise NotImplementedError
 
@@ -289,9 +287,6 @@
     def _get_hash_(self):
         return make_hashable_int(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_int(j, self.value)
-
     def same_constant(self, other):
         if isinstance(other, ConstInt):
             return self.value == other.value
@@ -329,9 +324,6 @@
     def _get_hash_(self):
         return longlong.gethash(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_float(j, self.value)
-
     def same_constant(self, other):
         if isinstance(other, ConstFloat):
             return self.value == other.value
@@ -378,9 +370,6 @@
     def getaddr(self):
         return llmemory.cast_ptr_to_adr(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_ref(j, self.value)
-
     def same_constant(self, other):
         if isinstance(other, ConstPtr):
             return self.value == other.value
@@ -432,9 +421,6 @@
         else:
             return 0
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_ref(j, self.value)
-
 ##    def getaddr(self):
 ##        # so far this is used only when calling
 ##        # CodeWriter.IndirectCallset.bytecode_for_address.  We don't need a
@@ -540,9 +526,6 @@
     def _get_hash_(self):
         return make_hashable_int(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_int(j, self.value)
-
     def nonnull(self):
         return self.value != 0
 
@@ -575,9 +558,6 @@
     def _get_hash_(self):
         return longlong.gethash(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_float(j, self.value)
-
     def nonnull(self):
         return self.value != longlong.ZEROF
 
@@ -620,9 +600,6 @@
         else:
             return 0
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_ref(j, self.value)
-
     def nonnull(self):
         return bool(self.value)
 
@@ -667,19 +644,12 @@
     def nonnull(self):
         return bool(self.value)
 
-    def set_future_value(self, cpu, j):
-        cpu.set_future_value_ref(j, self.value)
-
     def repr_rpython(self):
         return repr_rpython(self, 'bo')
 
     _getrepr_ = repr_object
 
 
-def set_future_values(cpu, boxes):
-    for j in range(len(boxes)):
-        boxes[j].set_future_value(cpu, j)
-
 # ____________________________________________________________
 
 
@@ -724,18 +694,17 @@
 
 # ____________________________________________________________
 
-# The TreeLoop class contains a loop or a generalized loop, i.e. a tree
-# of operations.  Each branch ends in a jump which can go either to
-# the top of the same loop, or to another TreeLoop; or it ends in a FINISH.
+# The JitCellToken class is the root of a tree of traces.  Each branch ends
+# in a jump which goes to a LABEL operation; or it ends in a FINISH.
 
-class LoopToken(AbstractDescr):
+class JitCellToken(AbstractDescr):
     """Used for rop.JUMP, giving the target of the jump.
     This is different from TreeLoop: the TreeLoop class contains the
     whole loop, including 'operations', and goes away after the loop
     was compiled; but the LoopDescr remains alive and points to the
     generated assembler.
     """
-    short_preamble = None
+    target_tokens = None
     failed_states = None
     retraced_count = 0
     terminating = False # see TerminatingLoopToken in compile.py
@@ -752,10 +721,11 @@
 
     def __init__(self):
         # For memory management of assembled loops
-        self._keepalive_target_looktokens = {}      # set of other LoopTokens
+        self._keepalive_jitcell_tokens = {}      # set of other JitCellToken
 
-    def record_jump_to(self, target_loop_token):
-        self._keepalive_target_looktokens[target_loop_token] = None
+    def record_jump_to(self, jitcell_token):
+        assert isinstance(jitcell_token, JitCellToken)
+        self._keepalive_jitcell_tokens[jitcell_token] = None
 
     def __repr__(self):
         return '<Loop %d, gen=%d>' % (self.number, self.generation)
@@ -766,17 +736,49 @@
     def dump(self):
         self.compiled_loop_token.cpu.dump_loop_token(self)
 
+class TargetToken(AbstractDescr):
+    def __init__(self, targeting_jitcell_token=None):
+        # Warning, two different jitcell_tokens here!
+        #
+        # * 'targeting_jitcell_token' is only useful for the front-end,
+        #   and it means: consider the LABEL that uses this TargetToken.
+        #   At this position, the state is logically the one given
+        #   by targeting_jitcell_token.  So e.g. if we want to enter the
+        #   JIT with some given green args, if the jitcell matches, then
+        #   we can jump to this LABEL.
+        #
+        # * 'original_jitcell_token' is information from the backend's
+        #   point of view: it means that this TargetToken is used in
+        #   a LABEL that belongs to either:
+        #   - a loop; then 'original_jitcell_token' is this loop
+        #   - or a bridge; then 'original_jitcell_token' is the loop
+        #     out of which we made this bridge
+        #
+        self.targeting_jitcell_token = targeting_jitcell_token
+        self.original_jitcell_token = None
+
+        self.virtual_state = None
+        self.exported_state = None
+        
 class TreeLoop(object):
     inputargs = None
     operations = None
-    token = None
     call_pure_results = None
     logops = None
     quasi_immutable_deps = None
+    start_resumedescr = None
+
+    def _token(*args):
+        raise Exception("TreeLoop.token is killed")
+    token = property(_token, _token)
+
+    # This is the jitcell where the trace starts. Labels within the trace might
+    # belong to some other jitcells in the sense that jumping to this other
+    # jitcell will result in a jump to the label.
+    original_jitcell_token = None
 
     def __init__(self, name):
         self.name = name
-        # self.inputargs = list of distinct Boxes
         # self.operations = list of ResOperations
         #   ops of the kind 'guard_xxx' contain a further list of operations,
         #   which may itself contain 'guard_xxx' and so on, making a tree.
@@ -809,6 +811,10 @@
     def check_consistency(self):     # for testing
         "NOT_RPYTHON"
         self.check_consistency_of(self.inputargs, self.operations)
+        for op in self.operations:
+            descr = op.getdescr()
+            if op.getopnum() == rop.LABEL and isinstance(descr, TargetToken):
+                assert descr.original_jitcell_token is self.original_jitcell_token
 
     @staticmethod
     def check_consistency_of(inputargs, operations):
@@ -843,15 +849,23 @@
                 assert isinstance(box, Box)
                 assert box not in seen
                 seen[box] = True
+            if op.getopnum() == rop.LABEL:
+                inputargs = op.getarglist()
+                for box in inputargs:
+                    assert isinstance(box, Box), "LABEL contains %r" % (box,)
+                seen = dict.fromkeys(inputargs)
+                assert len(seen) == len(inputargs), (
+                    "duplicate Box in the LABEL arguments")
+                
         assert operations[-1].is_final()
         if operations[-1].getopnum() == rop.JUMP:
             target = operations[-1].getdescr()
             if target is not None:
-                assert isinstance(target, LoopToken)
+                assert isinstance(target, TargetToken)
 
     def dump(self):
         # RPython-friendly
-        print '%r: inputargs =' % self, self._dump_args(self.inputargs)
+        print '%r: inputargs =' % self, self._dump_args(self.inputargs)        
         for op in self.operations:
             args = op.getarglist()
             print '\t', op.getopname(), self._dump_args(args), \
@@ -933,6 +947,9 @@
     def clear(self):
         pass
 
+    def add_jitcell_token(self, token):
+        pass
+
 class Stats(object):
     """For tests."""
 
@@ -945,17 +962,26 @@
         self.loops = []
         self.locations = []
         self.aborted_keys = []
-        self.invalidated_token_numbers = set()
+        self.invalidated_token_numbers = set()    # <- not RPython
+        self.jitcell_token_wrefs = []
+        self.jitcell_dicts = []                   # <- not RPython
 
     def clear(self):
         del self.loops[:]
         del self.locations[:]
         del self.aborted_keys[:]
+        del self.jitcell_token_wrefs[:]
         self.invalidated_token_numbers.clear()
         self.compiled_count = 0
         self.enter_count = 0
         self.aborted_count = 0
+        for dict in self.jitcell_dicts:
+            dict.clear()
 
+    def add_jitcell_token(self, token):
+        assert isinstance(token, JitCellToken)
+        self.jitcell_token_wrefs.append(weakref.ref(token))
+        
     def set_history(self, history):
         self.operations = history.operations
 
@@ -985,6 +1011,15 @@
     def get_all_loops(self):
         return self.loops
 
+    def get_all_jitcell_tokens(self):
+        tokens = [t() for t in self.jitcell_token_wrefs]
+        if None in tokens:
+            assert False, "get_all_jitcell_tokens will not work as "+\
+                          "loops have been freed"
+        return tokens
+            
+        
+
     def check_history(self, expected=None, **check):
         insns = {}
         for op in self.operations:
@@ -1002,13 +1037,14 @@
 
     def check_resops(self, expected=None, **check):
         insns = {}
-        for loop in self.loops:
+        for loop in self.get_all_loops():
             insns = loop.summary(adding_insns=insns)
         return self._check_insns(insns, expected, check)
 
     def _check_insns(self, insns, expected, check):
         if expected is not None:
             insns.pop('debug_merge_point', None)
+            insns.pop('label', None)
             assert insns == expected
         for insn, expected_count in check.items():
             getattr(rop, insn.upper())  # fails if 'rop.INSN' does not exist
@@ -1035,29 +1071,83 @@
             opname = op.getopname()
             insns[opname] = insns.get(opname, 0) + 1
         return self._check_insns(insns, expected, check)
+
+    def check_simple_loop(self, expected=None, **check):
+        # Useful in the simplest case when we have only one trace ending with
+        # a jump back to itself and possibly a few bridges ending with finish.
+        # Only the operations within the loop formed by that single jump will
+        # be counted.
+        loops = self.get_all_loops()
+        assert len(loops) == 1
+        loop = loops[0]
+        jumpop = loop.operations[-1]
+        assert jumpop.getopnum() == rop.JUMP
+        assert self.check_resops(jump=1)
+        labels = [op for op in loop.operations if op.getopnum() == rop.LABEL]
+        targets = [op._descr_wref() for op in labels]
+        assert None not in targets # TargetToken was freed, give up
+        target = jumpop._descr_wref()
+        assert target
+        assert targets.count(target) == 1
+        i = loop.operations.index(labels[targets.index(target)])
+        insns = {}
+        for op in loop.operations[i:]:
+            opname = op.getopname()
+            insns[opname] = insns.get(opname, 0) + 1
+        return self._check_insns(insns, expected, check)
         
+    def check_loops(self, expected=None, everywhere=False, **check):
+        insns = {}
+        for loop in self.get_all_loops():
+            #if not everywhere:
+            #    if getattr(loop, '_ignore_during_counting', False):
+            #        continue
+            insns = loop.summary(adding_insns=insns)
+        if expected is not None:
+            insns.pop('debug_merge_point', None)
+            print
+            print
+            print "        self.check_resops(%s)" % str(insns)
+            print
+            import pdb; pdb.set_trace()
+        else:
+            chk = ['%s=%d' % (i, insns.get(i, 0)) for i in check]
+            print
+            print
+            print "        self.check_resops(%s)" % ', '.join(chk)
+            print
+            import pdb; pdb.set_trace()
+        return
+        
+        for insn, expected_count in check.items():
+            getattr(rop, insn.upper())  # fails if 'rop.INSN' does not exist
+            found = insns.get(insn, 0)
+            assert found == expected_count, (
+                "found %d %r, expected %d" % (found, insn, expected_count))
+        return insns
+
     def check_consistency(self):
         "NOT_RPYTHON"
-        for loop in self.loops:
+        for loop in self.get_all_loops():
             loop.check_consistency()
 
     def maybe_view(self):
         if option.view:
             self.view()
 
-    def view(self, errmsg=None, extraloops=[]):
-        from pypy.jit.metainterp.graphpage import display_loops
-        loops = self.get_all_loops()[:]
-        for loop in extraloops:
-            if loop in loops:
-                loops.remove(loop)
-            loops.append(loop)
-        highlight_loops = dict.fromkeys(extraloops, 1)
-        for loop in loops:
-            if hasattr(loop, '_looptoken_number') and (
-                    loop._looptoken_number in self.invalidated_token_numbers):
-                highlight_loops.setdefault(loop, 2)
-        display_loops(loops, errmsg, highlight_loops)
+    def view(self, errmsg=None, extraprocedures=[]):
+        from pypy.jit.metainterp.graphpage import display_procedures
+        procedures = self.get_all_loops()[:]
+        for procedure in extraprocedures:
+            if procedure in procedures:
+                procedures.remove(procedure)
+            procedures.append(procedure)
+        highlight_procedures = dict.fromkeys(extraprocedures, 1)
+        for procedure in procedures:
+            if hasattr(procedure, '_looptoken_number') and (
+               procedure._looptoken_number in self.invalidated_token_numbers):
+                highlight_procedures.setdefault(procedure, 2)
+        display_procedures(procedures, errmsg, highlight_procedures)
 
 # ----------------------------------------------------------------
 
diff --git a/pypy/jit/metainterp/inliner.py b/pypy/jit/metainterp/inliner.py
new file mode 100644
--- /dev/null
+++ b/pypy/jit/metainterp/inliner.py
@@ -0,0 +1,57 @@
+from pypy.jit.metainterp.history import Const
+from pypy.jit.metainterp.resume import Snapshot
+
+class Inliner(object):
+    def __init__(self, inputargs, jump_args):
+        assert len(inputargs) == len(jump_args)
+        self.argmap = {}
+        for i in range(len(inputargs)):
+            if inputargs[i] in self.argmap:
+                assert self.argmap[inputargs[i]] == jump_args[i]
+            else:
+                self.argmap[inputargs[i]] = jump_args[i]
+        self.snapshot_map = {None: None}
+
+    def inline_op(self, newop, ignore_result=False, clone=True,
+                  ignore_failargs=False):
+        if clone:
+            newop = newop.clone()
+        args = newop.getarglist()
+        newop.initarglist([self.inline_arg(a) for a in args])
+
+        if newop.is_guard():
+            args = newop.getfailargs()
+            if args and not ignore_failargs:
+                newop.setfailargs([self.inline_arg(a) for a in args])
+            else:
+                newop.setfailargs([])
+
+        if newop.result and not ignore_result:
+            old_result = newop.result
+            newop.result = newop.result.clonebox()
+            self.argmap[old_result] = newop.result
+
+        self.inline_descr_inplace(newop.getdescr())
+
+        return newop
+
+    def inline_descr_inplace(self, descr):
+        from pypy.jit.metainterp.compile import ResumeGuardDescr
+        if isinstance(descr, ResumeGuardDescr):
+            descr.rd_snapshot = self.inline_snapshot(descr.rd_snapshot)
+
+    def inline_arg(self, arg):
+        if arg is None:
+            return None
+        if isinstance(arg, Const):
+            return arg
+        return self.argmap[arg]
+
+    def inline_snapshot(self, snapshot):
+        if snapshot in self.snapshot_map:
+            return self.snapshot_map[snapshot]
+        boxes = [self.inline_arg(a) for a in snapshot.boxes]
+        new_snapshot = Snapshot(self.inline_snapshot(snapshot.prev), boxes)
+        self.snapshot_map[snapshot] = new_snapshot
+        return new_snapshot
+
diff --git a/pypy/jit/metainterp/jitdriver.py b/pypy/jit/metainterp/jitdriver.py
--- a/pypy/jit/metainterp/jitdriver.py
+++ b/pypy/jit/metainterp/jitdriver.py
@@ -11,6 +11,7 @@
     #    self.portal_calldescr  ... pypy.jit.metainterp.warmspot
     #    self.num_green_args    ... pypy.jit.metainterp.warmspot
     #    self.num_red_args      ... pypy.jit.metainterp.warmspot
+    #    self.red_args_types    ... pypy.jit.metainterp.warmspot
     #    self.result_type       ... pypy.jit.metainterp.warmspot
     #    self.virtualizable_info... pypy.jit.metainterp.warmspot
     #    self.greenfield_info   ... pypy.jit.metainterp.warmspot
diff --git a/pypy/jit/metainterp/jitprof.py b/pypy/jit/metainterp/jitprof.py
--- a/pypy/jit/metainterp/jitprof.py
+++ b/pypy/jit/metainterp/jitprof.py
@@ -10,8 +10,6 @@
 counters="""
 TRACING
 BACKEND
-RUNNING
-BLACKHOLE
 OPS
 RECORDED_OPS
 GUARDS
@@ -67,18 +65,6 @@
     def end_backend(self):
         pass
 
-    def start_running(self):
-        pass
-
-    def end_running(self):
-        pass
-
-    def start_blackhole(self):
-        pass
-
-    def end_blackhole(self):
-        pass
-
     def count(self, kind, inc=1):
         pass
 
@@ -134,16 +120,6 @@
     def start_backend(self):   self._start(BACKEND)
     def end_backend(self):     self._end  (BACKEND)
 
-    # Don't record times for 'running' and 'blackhole' because there are
-    # too many of them: calling time.time() is a major blocker.
-    # If you are interested in these numbers, use 'PYPYLOG=file' and
-    # look at the resulting file with pypy/tool/logparser.py.
-    def start_running(self): self.count(RUNNING)
-    def end_running(self):   pass
-
-    def start_blackhole(self): self.count(BLACKHOLE)
-    def end_blackhole(self):   pass
-
     def count(self, kind, inc=1):
         self.counters[kind] += inc        
     
@@ -165,8 +141,6 @@
         calls = self.calls
         self._print_line_time("Tracing", cnt[TRACING],   tim[TRACING])
         self._print_line_time("Backend", cnt[BACKEND],   tim[BACKEND])
-        self._print_intline("Running asm", cnt[RUNNING])
-        self._print_intline("Blackhole", cnt[BLACKHOLE])
         line = "TOTAL:      \t\t%f" % (self.tk - self.starttime, )
         debug_print(line)
         self._print_intline("ops", cnt[OPS])
diff --git a/pypy/jit/metainterp/optimizeopt/__init__.py b/pypy/jit/metainterp/optimizeopt/__init__.py
--- a/pypy/jit/metainterp/optimizeopt/__init__.py
+++ b/pypy/jit/metainterp/optimizeopt/__init__.py
@@ -4,13 +4,15 @@
 from pypy.jit.metainterp.optimizeopt.virtualize import OptVirtualize
 from pypy.jit.metainterp.optimizeopt.heap import OptHeap
 from pypy.jit.metainterp.optimizeopt.vstring import OptString
-from pypy.jit.metainterp.optimizeopt.unroll import optimize_unroll, OptInlineShortPreamble
+from pypy.jit.metainterp.optimizeopt.unroll import optimize_unroll
 from pypy.jit.metainterp.optimizeopt.fficall import OptFfiCall
 from pypy.jit.metainterp.optimizeopt.simplify import OptSimplify
 from pypy.jit.metainterp.optimizeopt.pure import OptPure
 from pypy.jit.metainterp.optimizeopt.earlyforce import OptEarlyForce
 from pypy.rlib.jit import PARAMETERS
 from pypy.rlib.unroll import unrolling_iterable
+from pypy.rlib.debug import debug_start, debug_stop, debug_print
+
 
 ALL_OPTS = [('intbounds', OptIntBounds),
             ('rewrite', OptRewrite),
@@ -28,8 +30,7 @@
 ALL_OPTS_LIST = [name for name, _ in ALL_OPTS]
 ALL_OPTS_NAMES = ':'.join([name for name, _ in ALL_OPTS])
 
-def build_opt_chain(metainterp_sd, enable_opts,
-                    inline_short_preamble=True, retraced=False):
+def build_opt_chain(metainterp_sd, enable_opts):
     config = metainterp_sd.config
     optimizations = []
     unroll = 'unroll' in enable_opts    # 'enable_opts' is normally a dict
@@ -45,12 +46,9 @@
                 optimizations.append(OptFfiCall())
 
     if ('rewrite' not in enable_opts or 'virtualize' not in enable_opts
-        or 'heap' not in enable_opts):
+        or 'heap' not in enable_opts or 'unroll' not in enable_opts):
         optimizations.append(OptSimplify())
 
-    if inline_short_preamble:
-        optimizations = [OptInlineShortPreamble(retraced)] + optimizations
-
     return optimizations, unroll
 
 
@@ -80,3 +78,21 @@
 
 if __name__ == '__main__':
     print ALL_OPTS_NAMES
+
+def optimize_trace(metainterp_sd, loop, enable_opts, inline_short_preamble=True):
+    """Optimize loop.operations to remove internal overheadish operations.
+    """
+
+    debug_start("jit-optimize")
+    try:
+        loop.logops = metainterp_sd.logger_noopt.log_loop(loop.inputargs,
+                                                          loop.operations)
+        optimizations, unroll = build_opt_chain(metainterp_sd, enable_opts)
+        if unroll:
+            optimize_unroll(metainterp_sd, loop, optimizations, inline_short_preamble)
+        else:
+            optimizer = Optimizer(metainterp_sd, loop, optimizations)
+            optimizer.propagate_all_forward()
+    finally:
+        debug_stop("jit-optimize")
+        
diff --git a/pypy/jit/metainterp/optimizeopt/fficall.py b/pypy/jit/metainterp/optimizeopt/fficall.py
--- a/pypy/jit/metainterp/optimizeopt/fficall.py
+++ b/pypy/jit/metainterp/optimizeopt/fficall.py
@@ -234,6 +234,9 @@
             # longlongs are treated as floats, see
             # e.g. llsupport/descr.py:getDescrClass
             is_float = True
+        elif kind == 'u':
+            # they're all False
+            pass
         else:
             assert False, "unsupported ffitype or kind"
         #
diff --git a/pypy/jit/metainterp/optimizeopt/optimizer.py b/pypy/jit/metainterp/optimizeopt/optimizer.py
--- a/pypy/jit/metainterp/optimizeopt/optimizer.py
+++ b/pypy/jit/metainterp/optimizeopt/optimizer.py
@@ -500,8 +500,9 @@
         else:
             return CVAL_ZERO
 
-    def propagate_all_forward(self):
-        self.clear_newoperations()
+    def propagate_all_forward(self, clear=True):
+        if clear:
+            self.clear_newoperations()
         for op in self.loop.operations:
             self.first_optimization.propagate_forward(op)
         self.loop.operations = self.get_newoperations()
diff --git a/pypy/jit/metainterp/optimizeopt/simplify.py b/pypy/jit/metainterp/optimizeopt/simplify.py
--- a/pypy/jit/metainterp/optimizeopt/simplify.py
+++ b/pypy/jit/metainterp/optimizeopt/simplify.py
@@ -1,9 +1,12 @@
 from pypy.jit.metainterp.optimizeopt.optimizer import Optimization
 from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
 from pypy.jit.metainterp.resoperation import ResOperation, rop
-
+from pypy.jit.metainterp.history import TargetToken, JitCellToken
 
 class OptSimplify(Optimization):
+    def __init__(self):
+        self.last_label_descr = None
+        
     def optimize_CALL_PURE(self, op):
         args = op.getarglist()
         self.emit_operation(ResOperation(rop.CALL, args, op.result,
@@ -31,6 +34,23 @@
     def optimize_RECORD_KNOWN_CLASS(self, op):
         pass
 
+    def optimize_LABEL(self, op):
+        self.last_label_descr = op.getdescr()
+        self.emit_operation(op)
+        
+    def optimize_JUMP(self, op):
+        descr = op.getdescr()
+        assert isinstance(descr, JitCellToken)
+        if not descr.target_tokens:
+            assert self.last_label_descr is not None
+            target_token = self.last_label_descr
+            assert isinstance(target_token, TargetToken)
+            assert target_token.targeting_jitcell_token is descr
+            op.setdescr(self.last_label_descr)
+        else:
+            assert len(descr.target_tokens) == 1
+            op.setdescr(descr.target_tokens[0])
+        self.emit_operation(op)
 
 dispatch_opt = make_dispatcher_method(OptSimplify, 'optimize_',
         default=OptSimplify.emit_operation)
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_multilabel.py b/pypy/jit/metainterp/optimizeopt/test/test_multilabel.py
new file mode 100644
--- /dev/null
+++ b/pypy/jit/metainterp/optimizeopt/test/test_multilabel.py
@@ -0,0 +1,200 @@
+from pypy.jit.metainterp.optimizeopt.test.test_util import (
+    LLtypeMixin, BaseTest, Storage, _sortboxes, FakeDescrWithSnapshot)
+from pypy.jit.metainterp.history import TreeLoop, JitCellToken, TargetToken
+from pypy.jit.metainterp.resoperation import rop, opname, ResOperation
+from pypy.jit.metainterp.optimize import InvalidLoop
+from py.test import raises
+
+class BaseTestMultiLabel(BaseTest):  # optimizer tests for traces that contain label ops in the middle
+    enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll"
+
+    def optimize_loop(self, ops, expected):  # split ops at each LABEL, optimize part by part, compare the concatenation with expected
+        loop = self.parse(ops)
+        if expected != "crash!":  # "crash!" is a sentinel meaning there is no expected trace to parse
+            expected = self.parse(expected)
+
+        part = TreeLoop('part')  # scratch loop reused for every part
+        part.inputargs = loop.inputargs
+        part.start_resumedescr = FakeDescrWithSnapshot()
+        token = loop.original_jitcell_token
+
+        optimized = TreeLoop('optimized')  # accumulates the optimized operations of all parts
+        optimized.inputargs = loop.inputargs
+        optimized.operations = []
+        
+        labels = [i for i, op in enumerate(loop.operations) \
+                  if op.getopnum()==rop.LABEL]  # indices of all LABEL ops in the parsed trace
+        prv = 0
+        last_label = []  # LABEL popped off the end of the previous optimized part, prepended to the next one
+        for nxt in labels + [len(loop.operations)]:  # one part per label, plus the tail after the last label
+            assert prv != nxt
+            operations = last_label + loop.operations[prv:nxt]
+            if nxt < len(loop.operations):  # part stops at a label: close it with a JUMP carrying the label's arguments
+                label = loop.operations[nxt]
+                assert label.getopnum() == rop.LABEL
+                jumpop = ResOperation(rop.JUMP, label.getarglist(),
+                                      None, descr=token)
+                operations.append(jumpop)
+            part.operations = operations
+            self._do_optimize_loop(part, None)
+            if part.operations[-1].getopnum() == rop.LABEL:  # carry a trailing LABEL over instead of emitting it now
+                last_label = [part.operations.pop()]
+            else:
+                last_label = []
+            optimized.operations.extend(part.operations)
+            prv = nxt + 1  # skip the original LABEL op itself
+        
+        #
+        print
+        print "Optimized:"
+        if optimized.operations:
+            print '\n'.join([str(o) for o in optimized.operations])
+        else:
+            print 'Failed!'
+        print
+
+        assert expected != "crash!", "should have raised an exception"
+        self.assert_equal(optimized, expected)
+
+        return optimized
+
+    def test_simple(self):  # expected: the int_add after the label disappears and i2 is passed through the label
+        ops = """
+        [i1]
+        i2 = int_add(i1, 1)
+        escape(i2)
+        label(i1)
+        i3 = int_add(i1, 1)
+        escape(i3)
+        jump(i1)
+        """
+        expected = """
+        [i1]
+        i2 = int_add(i1, 1)
+        escape(i2)
+        label(i1, i2)
+        escape(i2)
+        jump(i1, i2)
+        """
+        self.optimize_loop(ops, expected)
+
+    def test_forced_virtual(self):  # the object passed at the label escapes afterwards; must raise InvalidLoop
+        ops = """
+        [p1]
+        p3 = new_with_vtable(ConstClass(node_vtable))
+        label(p3)
+        escape(p3)
+        jump(p3)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+
+    def test_virtuals_with_nonmatching_fields(self):  # valuedescr set before the label, nextdescr before the jump; must raise InvalidLoop
+        ops = """
+        [p1]
+        p3 = new_with_vtable(ConstClass(node_vtable))
+        setfield_gc(p3, 1, descr=valuedescr)
+        label(p3)
+        p4 = new_with_vtable(ConstClass(node_vtable))
+        setfield_gc(p4, 1, descr=nextdescr)
+        jump(p4)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+
+    def test_virtual_arrays_with_nonmatching_lens(self):  # array of length 3 at the label, length 2 at the jump; must raise InvalidLoop
+        ops = """
+        [p1]
+        p2 = new_array(3, descr=arraydescr)
+        label(p2)
+        p4 = new_array(2, descr=arraydescr)        
+        jump(p4)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+        
+    def test_nonmatching_arraystruct_1(self):  # same length, but different interior field descrs are written; must raise InvalidLoop
+        ops = """
+        [p1, f0]
+        p2 = new_array(3, descr=complexarraydescr)
+        setinteriorfield_gc(p2, 2, f0, descr=complexrealdescr)
+        label(p2, f0)
+        p4 = new_array(3, descr=complexarraydescr)
+        setinteriorfield_gc(p4, 2, f0, descr=compleximagdescr)
+        jump(p4, f0)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+        
+    def test_nonmatching_arraystruct_2(self):  # same interior field descr, but different array lengths; must raise InvalidLoop
+        ops = """
+        [p1, f0]
+        p2 = new_array(3, descr=complexarraydescr)
+        setinteriorfield_gc(p2, 2, f0, descr=complexrealdescr)
+        label(p2, f0)
+        p4 = new_array(2, descr=complexarraydescr)
+        setinteriorfield_gc(p4, 0, f0, descr=complexrealdescr)        
+        jump(p4, f0)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+
+    def test_not_virtual(self):  # a fresh object at the label, an escape() result at the jump; must raise InvalidLoop
+        ops = """
+        [p1]
+        p3 = new_with_vtable(ConstClass(node_vtable))
+        label(p3)
+        p4 = escape()
+        jump(p4)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+
+    def test_not_virtual_array(self):  # as test_not_virtual, with a new_array using arraydescr
+        ops = """
+        [p1]
+        p3 = new_array(3, descr=arraydescr)
+        label(p3)
+        p4 = escape()
+        jump(p4)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+
+    def test_not_virtual_arraystruct(self):  # as test_not_virtual, with a new_array using complexarraydescr
+        ops = """
+        [p1]
+        p3 = new_array(3, descr=complexarraydescr)
+        label(p3)
+        p4 = escape()
+        jump(p4)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+
+    def test_virtual_turns_constant(self):  # after the label the object is guarded against a constant pointer; must raise InvalidLoop
+        ops = """
+        [p1]
+        p3 = new_with_vtable(ConstClass(node_vtable))
+        label(p3)
+        guard_value(p3, ConstPtr(myptr)) []
+        jump(p3)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+        
+    def test_virtuals_turns_not_equal(self):  # label receives the same object twice, jump passes two distinct ones; must raise InvalidLoop
+        ops = """
+        [p1, p2]
+        p3 = new_with_vtable(ConstClass(node_vtable))
+        label(p3, p3)
+        p4 = new_with_vtable(ConstClass(node_vtable))
+        jump(p3, p4)
+        """
+        with raises(InvalidLoop):
+            self.optimize_loop(ops, ops)
+        
+    
+class TestLLtype(BaseTestMultiLabel, LLtypeMixin):  # concrete test class: the multi-label tests combined with LLtypeMixin
+    pass
+
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
@@ -1,7 +1,8 @@
 import py
 from pypy.rlib.objectmodel import instantiate
 from pypy.jit.metainterp.optimizeopt.test.test_util import (
-    LLtypeMixin, BaseTest, FakeMetaInterpStaticData)
+    LLtypeMixin, BaseTest, FakeMetaInterpStaticData, convert_old_style_to_targets)
+from pypy.jit.metainterp.history import TargetToken, JitCellToken
 from pypy.jit.metainterp.test.test_compile import FakeLogger
 import pypy.jit.metainterp.optimizeopt.optimizer as optimizeopt
 import pypy.jit.metainterp.optimizeopt.virtualize as virtualize
@@ -11,7 +12,6 @@
 from pypy.jit.metainterp.resoperation import rop, opname, ResOperation
 from pypy.rlib.rarithmetic import LONG_BIT
 
-
 def test_store_final_boxes_in_guard():
     from pypy.jit.metainterp.compile import ResumeGuardDescr
     from pypy.jit.metainterp.resume import tag, TAGBOX
@@ -116,9 +116,13 @@
     enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap"
 
     def optimize_loop(self, ops, optops, call_pure_results=None):
-
         loop = self.parse(ops)
-        expected = self.parse(optops)
+        token = JitCellToken() 
+        loop.operations = [ResOperation(rop.LABEL, loop.inputargs, None, descr=TargetToken(token))] + \
+                          loop.operations
+        if loop.operations[-1].getopnum() == rop.JUMP:
+            loop.operations[-1].setdescr(token)
+        expected = convert_old_style_to_targets(self.parse(optops), jump=True)
         self._do_optimize_loop(loop, call_pure_results)
         print '\n'.join([str(o) for o in loop.operations])
         self.assert_equal(loop, expected)
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
@@ -1,13 +1,13 @@
 import py
 from pypy.rlib.objectmodel import instantiate
 from pypy.jit.metainterp.optimizeopt.test.test_util import (
-    LLtypeMixin, BaseTest, Storage, _sortboxes)
+    LLtypeMixin, BaseTest, Storage, _sortboxes, convert_old_style_to_targets)
 import pypy.jit.metainterp.optimizeopt.optimizer as optimizeopt
 import pypy.jit.metainterp.optimizeopt.virtualize as virtualize
 from pypy.jit.metainterp.optimizeopt import optimize_loop_1, ALL_OPTS_DICT, build_opt_chain
 from pypy.jit.metainterp.optimize import InvalidLoop
 from pypy.jit.metainterp.history import AbstractDescr, ConstInt, BoxInt
-from pypy.jit.metainterp.history import TreeLoop, LoopToken
+from pypy.jit.metainterp.history import TreeLoop, JitCellToken, TargetToken
 from pypy.jit.metainterp.jitprof import EmptyProfiler
 from pypy.jit.metainterp import executor, compile, resume, history
 from pypy.jit.metainterp.resoperation import rop, opname, ResOperation
@@ -15,7 +15,7 @@
 from pypy.jit.metainterp.optimizeopt.util import args_dict
 from pypy.jit.metainterp.optimizeopt.test.test_optimizebasic import FakeMetaInterpStaticData
 from pypy.config.pypyoption import get_pypy_config
-
+from pypy.jit.metainterp.optimizeopt.unroll import Inliner
 
 def test_build_opt_chain():
     def check(chain, expected_names):
@@ -23,49 +23,37 @@
         assert names == expected_names
     #
     metainterp_sd = FakeMetaInterpStaticData(None)
-    chain, _ = build_opt_chain(metainterp_sd, "", inline_short_preamble=False)
+    chain, _ = build_opt_chain(metainterp_sd, "")
     check(chain, ["OptSimplify"])
     #
     chain, _ = build_opt_chain(metainterp_sd, "")
-    check(chain, ["OptInlineShortPreamble", "OptSimplify"])
+    check(chain, ["OptSimplify"])
     #
     chain, _ = build_opt_chain(metainterp_sd, "")
-    check(chain, ["OptInlineShortPreamble", "OptSimplify"])
+    check(chain, ["OptSimplify"])
     #
     chain, _ = build_opt_chain(metainterp_sd, "heap:intbounds")
-    check(chain, ["OptInlineShortPreamble", "OptIntBounds", "OptHeap", "OptSimplify"])
+    check(chain, ["OptIntBounds", "OptHeap", "OptSimplify"])
     #
     chain, unroll = build_opt_chain(metainterp_sd, "unroll")
-    check(chain, ["OptInlineShortPreamble", "OptSimplify"])
+    check(chain, ["OptSimplify"])
     assert unroll
     #
-    chain, _ = build_opt_chain(metainterp_sd, "aaa:bbb", inline_short_preamble=False)
+    chain, _ = build_opt_chain(metainterp_sd, "aaa:bbb")
     check(chain, ["OptSimplify"])
     #
-    chain, _ = build_opt_chain(metainterp_sd, "ffi", inline_short_preamble=False)
+    chain, _ = build_opt_chain(metainterp_sd, "ffi")
     check(chain, ["OptFfiCall", "OptSimplify"])
     #
     metainterp_sd.config = get_pypy_config(translating=True)
     assert not metainterp_sd.config.translation.jit_ffi
-    chain, _ = build_opt_chain(metainterp_sd, "ffi", inline_short_preamble=False)
+    chain, _ = build_opt_chain(metainterp_sd, "ffi")
     check(chain, ["OptSimplify"])
 
 
 # ____________________________________________________________
 
 
-class FakeDescr(compile.ResumeGuardDescr):
-    class rd_snapshot:
-        class prev:
-            prev = None
-            boxes = []
-        boxes = []
-    def clone_if_mutable(self):
-        return FakeDescr()
-    def __eq__(self, other):
-        return isinstance(other, Storage) or isinstance(other, FakeDescr)
-
-
 class BaseTestWithUnroll(BaseTest):
 
     enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll"
@@ -79,40 +67,41 @@
             expected_preamble = self.parse(expected_preamble)
         if expected_short:
             expected_short = self.parse(expected_short)
-        loop.preamble = TreeLoop('preamble')
-        loop.preamble.inputargs = loop.inputargs
-        loop.preamble.token = LoopToken()
-        loop.preamble.start_resumedescr = FakeDescr()
-        #
-        self._do_optimize_loop(loop, call_pure_results)
+
+        preamble = self.unroll_and_optimize(loop, call_pure_results)
+        
         #
         print
         print "Preamble:"
-        print loop.preamble.inputargs
-        if loop.preamble.operations:
-            print '\n'.join([str(o) for o in loop.preamble.operations])
+        if preamble.operations:
+            print '\n'.join([str(o) for o in preamble.operations])
         else:
             print 'Failed!'
         print
         print "Loop:"
-        print loop.inputargs
         print '\n'.join([str(o) for o in loop.operations])
         print
         if expected_short:
             print "Short Preamble:"
-            short = loop.preamble.token.short_preamble[0]
-            print short.inputargs
-            print '\n'.join([str(o) for o in short.operations])
+            short = loop.operations[0].getdescr().short_preamble
+            print '\n'.join([str(o) for o in short])
             print
 
         assert expected != "crash!", "should have raised an exception"
-        self.assert_equal(loop, expected)
+        self.assert_equal(loop, convert_old_style_to_targets(expected, jump=True))
+        assert loop.operations[0].getdescr() == loop.operations[-1].getdescr()
         if expected_preamble:
-            self.assert_equal(loop.preamble, expected_preamble,
+            self.assert_equal(preamble, convert_old_style_to_targets(expected_preamble, jump=False),
                               text_right='expected preamble')
+            assert preamble.operations[-1].getdescr() == loop.operations[0].getdescr()
         if expected_short:
-            self.assert_equal(short, expected_short,
+            short_preamble = TreeLoop('short preamble')
+            assert short[0].getopnum() == rop.LABEL
+            short_preamble.inputargs = short[0].getarglist()
+            short_preamble.operations = short
+            self.assert_equal(short_preamble, convert_old_style_to_targets(expected_short, jump=True),
                               text_right='expected short preamble')
+            assert short[-1].getdescr() == loop.operations[0].getdescr()
 
         return loop
 
@@ -234,7 +223,7 @@
             """ % expected_value
             self.optimize_loop(ops, expected)
 
-    def test_reverse_of_cast(self):
+    def test_reverse_of_cast_1(self):
         ops = """
         [i0]
         p0 = cast_int_to_ptr(i0)
@@ -246,6 +235,8 @@
         jump(i0)
         """
         self.optimize_loop(ops, expected)
+
+    def test_reverse_of_cast_2(self):        
         ops = """
         [p0]
         i1 = cast_ptr_to_int(p0)
@@ -1181,6 +1172,7 @@
         i1 = getfield_gc(p0, descr=valuedescr)
         i2 = int_sub(i1, 1)
         i3 = int_add(i0, i1)
+        i4 = same_as(i2) # This same_as should be killed by backend
         jump(i3, i2, i1)
         """
         expected = """
@@ -1252,10 +1244,10 @@
         i1 = int_add(i0, 1)
         p1 = new_with_vtable(ConstClass(node_vtable2))
         p2 = new_with_vtable(ConstClass(node_vtable2))
-        setfield_gc(p0, p1, descr=nextdescr)
+        setfield_gc(p2, i1, descr=valuedescr)
         setfield_gc(p2, p1, descr=nextdescr)
         setfield_gc(p1, p2, descr=nextdescr)
-        setfield_gc(p2, i1, descr=valuedescr)
+        setfield_gc(p0, p1, descr=nextdescr)
         jump(p1)
         """
         self.optimize_loop(ops, loop, preamble)
@@ -1317,6 +1309,7 @@
         p30 = new_with_vtable(ConstClass(node_vtable))
         setfield_gc(p30, i28, descr=nextdescr)
         setfield_gc(p3, p30, descr=valuedescr)
+        p46 = same_as(p30) # This same_as should be killed by backend        
         jump(i29, p30, p3)
         """
         expected = """
@@ -1324,8 +1317,8 @@
         i28 = int_add(i0, 1)
         i29 = int_add(i28, 1)
         p30 = new_with_vtable(ConstClass(node_vtable))
+        setfield_gc(p30, i28, descr=nextdescr)
         setfield_gc(p3, p30, descr=valuedescr)
-        setfield_gc(p30, i28, descr=nextdescr)
         jump(i29, p30, p3)
         """
         self.optimize_loop(ops, expected, preamble)
@@ -2118,7 +2111,9 @@
         guard_true(i3) []
         i4 = int_neg(i2)
         setfield_gc(p1, i2, descr=valuedescr)
-        jump(p1, i1, i2, i4, i4)
+        i7 = same_as(i2) # This same_as should be killed by backend
+        i6 = same_as(i4)
+        jump(p1, i1, i2, i4, i6)
         """
         expected = """
         [p1, i1, i2, i4, i5]
@@ -2148,7 +2143,8 @@
         i4 = int_neg(i2)
         setfield_gc(p1, NULL, descr=nextdescr)
         escape()
-        jump(p1, i2, i4, i4)
+        i5 = same_as(i4)
+        jump(p1, i2, i4, i5)
         """
         expected = """
         [p1, i2, i4, i5]
@@ -2177,7 +2173,8 @@
         i4 = int_neg(i2)
         setfield_gc(p1, NULL, descr=nextdescr)
         escape()
-        jump(p1, i2, i4, i4)
+        i5 = same_as(i4)
+        jump(p1, i2, i4, i5)
         """
         expected = """
         [p1, i2, i4, i5]
@@ -2207,7 +2204,9 @@
         guard_true(i5) []
         i4 = int_neg(i2)
         setfield_gc(p1, i2, descr=valuedescr)
-        jump(p1, i1, i2, i4, i4)
+        i8 = same_as(i2) # This same_as should be killed by backend
+        i7 = same_as(i4)
+        jump(p1, i1, i2, i4, i7)
         """
         expected = """
         [p1, i1, i2, i4, i7]
@@ -2433,7 +2432,8 @@
         p2 = new_with_vtable(ConstClass(node_vtable))
         setfield_gc(p2, p4, descr=nextdescr)
         setfield_gc(p1, p2, descr=nextdescr)
-        jump(p1, i2, i4, p4, i4)
+        i101 = same_as(i4) 
+        jump(p1, i2, i4, p4, i101)
         """
         expected = """
         [p1, i2, i4, p4, i5]
@@ -3276,7 +3276,15 @@
         setfield_gc(p1, i3, descr=valuedescr)
         jump(p1, i4, i3)
         '''
-        self.optimize_loop(ops, ops, ops)
+        preamble = '''
+        [p1, i1, i4]
+        setfield_gc(p1, i1, descr=valuedescr)
+        i3 = call_assembler(i1, descr=asmdescr)
+        setfield_gc(p1, i3, descr=valuedescr)
+        i143 = same_as(i3) # Should be killed by backend        
+        jump(p1, i4, i3)
+        '''
+        self.optimize_loop(ops, ops, preamble)
 
     def test_call_assembler_invalidates_heap_knowledge(self):
         ops = '''
@@ -3307,7 +3315,9 @@
         setfield_gc(p1, i1, descr=valuedescr)
         i3 = call(p1, descr=plaincalldescr)
         setfield_gc(p1, i3, descr=valuedescr)
-        jump(p1, i4, i3, i3)
+        i148 = same_as(i3)
+        i147 = same_as(i3)
+        jump(p1, i4, i3, i148)
         '''
         self.optimize_loop(ops, expected, preamble)
 
@@ -3330,7 +3340,8 @@
         setfield_gc(p1, i1, descr=valuedescr)
         i3 = call(p1, descr=plaincalldescr)
         setfield_gc(p1, i1, descr=valuedescr)
-        jump(p1, i4, i3, i3)
+        i151 = same_as(i3)
+        jump(p1, i4, i3, i151)
         '''
         self.optimize_loop(ops, expected, preamble)
 
@@ -3350,7 +3361,8 @@
         escape(i1)
         escape(i2)
         i4 = call(123456, 4, i0, 6, descr=plaincalldescr)
-        jump(i0, i4, i4)
+        i153 = same_as(i4)
+        jump(i0, i4, i153)
         '''
         expected = '''
         [i0, i4, i5]
@@ -3380,7 +3392,8 @@
         escape(i2)
         i4 = call(123456, 4, i0, 6, descr=plaincalldescr)
         guard_no_exception() []
-        jump(i0, i4, i4)
+        i155 = same_as(i4)        
+        jump(i0, i4, i155)
         '''
         expected = '''
         [i0, i2, i3]
@@ -4198,6 +4211,7 @@
         preamble = """
         [p0]
         i0 = strlen(p0)
+        i3 = same_as(i0) # Should be killed by backend        
         jump(p0)
         """
         expected = """
@@ -5418,6 +5432,7 @@
         [p0]
         p1 = getfield_gc(p0, descr=valuedescr)
         setfield_gc(p0, p0, descr=valuedescr)
+        p4450 = same_as(p0) # Should be killed by backend
         jump(p0)
         """
         expected = """
@@ -5653,7 +5668,8 @@
         p3 = newstr(i3)
         copystrcontent(p1, p3, 0, 0, i1)
         copystrcontent(p2, p3, 0, i1, i2)
-        jump(p2, p3, i2)
+        i7 = same_as(i2)        
+        jump(p2, p3, i7)
         """
         expected = """
         [p1, p2, i1]
@@ -5728,7 +5744,9 @@
         copystrcontent(p1, p5, 0, 0, i1)
         copystrcontent(p2, p5, 0, i1, i2)
         copystrcontent(p3, p5, 0, i12, i3)
-        jump(p2, p3, p5, i2, i3)
+        i129 = same_as(i2)
+        i130 = same_as(i3)
+        jump(p2, p3, p5, i129, i130)
         """
         expected = """
         [p1, p2, p3, i1, i2]
@@ -5788,7 +5806,8 @@
         [p1, i1, i2, i3]
         escape(i3)
         i4 = int_sub(i2, i1)
-        jump(p1, i1, i2, i4, i4)
+        i5 = same_as(i4)        
+        jump(p1, i1, i2, i4, i5)
         """
         expected = """
         [p1, i1, i2, i3, i4]
@@ -5813,7 +5832,8 @@
         escape(i5)
         i4 = int_sub(i2, i1)
         setfield_gc(p2, i4, descr=valuedescr)
-        jump(p1, i1, i2, p2, i4, i4)
+        i8 = same_as(i4)
+        jump(p1, i1, i2, p2, i8, i4)
         """
         expected = """
         [p1, i1, i2, p2, i5, i6]
@@ -5939,7 +5959,8 @@
         p4 = newstr(i5)
         copystrcontent(p1, p4, i1, 0, i3)
         copystrcontent(p2, p4, 0, i3, i4)
-        jump(p4, i1, i2, p2, i5, i3, i4)
+        i9 = same_as(i4)
+        jump(p4, i1, i2, p2, i5, i3, i9)
         """
         expected = """
         [p1, i1, i2, p2, i5, i3, i4]
@@ -6061,7 +6082,9 @@
         copystrcontent(p2, p4, 0, i1, i2)
         i0 = call(0, p3, p4, descr=strequaldescr)
         escape(i0)
-        jump(p1, p2, p3, i3, i1, i2)
+        i11 = same_as(i1)
+        i12 = same_as(i2)
+        jump(p1, p2, p3, i3, i11, i12)
         """
         expected = """
         [p1, p2, p3, i3, i1, i2]
@@ -6281,6 +6304,7 @@
         i1 = strlen(p1)
         i0 = int_eq(i1, 0)
         escape(i0)
+        i3 = same_as(i1)        
         jump(p1, i0)
         """
         self.optimize_strunicode_loop_extradescrs(ops, expected, preamble)
@@ -6326,7 +6350,9 @@
         copystrcontent(p2, p4, 0, i1, i2)
         i0 = call(0, s"hello world", p4, descr=streq_nonnull_descr)
         escape(i0)
-        jump(p1, p2, i3, i1, i2)
+        i11 = same_as(i1)
+        i12 = same_as(i2)
+        jump(p1, p2, i3, i11, i12)
         """
         expected = """
         [p1, p2, i3, i1, i2]
@@ -6629,7 +6655,8 @@
         p188 = getarrayitem_gc(p187, 42, descr=<GcPtrArrayDescr>)
         guard_value(p188, ConstPtr(myptr)) []
         p25 = getfield_gc(ConstPtr(myptr), descr=otherdescr)
-        jump(p25, p187, i184, p25)
+        p26 = same_as(p25)
+        jump(p25, p187, i184, p26)
         """
         short = """
         [p1, p187, i184]
@@ -6898,7 +6925,8 @@
         [p9]
         i843 = strlen(p9)
         call(i843, descr=nonwritedescr)
-        jump(p9, i843)
+        i0 = same_as(i843)
+        jump(p9, i0)
         """
         short = """
         [p9]
@@ -7014,6 +7042,40 @@
         """
         self.optimize_loop(ops, expected)
 
+    def test_duplicated_aliased_virtual(self):  # p4 is read back from p3's own self-referencing field; the expected loop carries no arguments
+        ops = """
+        [p1, p2]
+        p3 = new_with_vtable(ConstClass(node_vtable))
+        setfield_gc(p3, p3, descr=nextdescr)
+        p4 = getfield_gc(p3, descr=nextdescr)
+        jump(p3, p4)
+        """
+        expected = """
+        []
+        jump()
+        """
+        self.optimize_loop(ops, expected)
+
+    def test_imported_aliased_virtual_in_failargs(self):  # same aliasing, plus a guard whose failargs [p1, p2] are expected to become empty
+        ops = """
+        [p1, p2, i0]
+        i2 = int_lt(i0, 10)
+        guard_true(i2) [p1, p2]
+        p3 = new_with_vtable(ConstClass(node_vtable))
+        setfield_gc(p3, p3, descr=nextdescr)
+        p4 = getfield_gc(p3, descr=nextdescr)
+        i1 = int_add(i0, 1)
+        jump(p3, p4, i1)
+        """
+        expected = """
+        [i0]
+        i2 = int_lt(i0, 10)
+        guard_true(i2) []
+        i1 = int_add(i0, 1)        
+        jump(i1)
+        """
+        self.optimize_loop(ops, expected)
+
     def test_chained_virtuals(self):
         ops = """
         [p0, p1]
@@ -7590,7 +7652,8 @@
         call(i2, descr=nonwritedescr)
         setfield_gc(p22, i1, descr=valuedescr)
         guard_nonnull_class(p18, ConstClass(node_vtable)) []
-        jump(p22, p18, i1, i1)
+        i10 = same_as(i1)
+        jump(p22, p18, i1, i10)
         """
         short = """
         [p22, p18, i1]
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_util.py b/pypy/jit/metainterp/optimizeopt/test/test_util.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_util.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_util.py
@@ -8,7 +8,8 @@
 from pypy.jit.backend.llgraph import runner
 from pypy.jit.metainterp.history import (BoxInt, BoxPtr, ConstInt, ConstPtr,
                                          Const, TreeLoop, BoxObj,
-                                         ConstObj, AbstractDescr)
+                                         ConstObj, AbstractDescr,
+                                         JitCellToken, TargetToken)
 from pypy.jit.metainterp.optimizeopt.util import sort_descrs, equaloplists
 from pypy.jit.metainterp.optimize import InvalidLoop
 from pypy.jit.codewriter.effectinfo import EffectInfo
@@ -18,6 +19,8 @@
 from pypy.jit.metainterp import compile, resume, history
 from pypy.jit.metainterp.jitprof import EmptyProfiler
 from pypy.config.pypyoption import get_pypy_config
+from pypy.jit.metainterp.resoperation import rop, opname, ResOperation
+from pypy.jit.metainterp.optimizeopt.unroll import Inliner
 
 def test_sort_descrs():
     class PseudoDescr(AbstractDescr):
@@ -344,6 +347,11 @@
         self.config = get_pypy_config(translating=True)
         self.config.translation.jit_ffi = True
 
+    class logger_noopt:  # stub logger for tests
+        @classmethod
+        def log_loop(*args):  # accepts any arguments (including the class itself) and does nothing
+            pass
+
     class warmrunnerdesc:
         class memory_manager:
             retrace_limit = 5
@@ -394,7 +402,7 @@
                             expected.operations, False, remap, text_right)
 
     def _do_optimize_loop(self, loop, call_pure_results):
-        from pypy.jit.metainterp.optimizeopt import optimize_loop_1
+        from pypy.jit.metainterp.optimizeopt import optimize_trace
         from pypy.jit.metainterp.optimizeopt.util import args_dict
 
         self.loop = loop
@@ -408,7 +416,83 @@
         if hasattr(self, 'callinfocollection'):
             metainterp_sd.callinfocollection = self.callinfocollection
         #
-        optimize_loop_1(metainterp_sd, loop, self.enable_opts)
+        optimize_trace(metainterp_sd, loop, self.enable_opts)
+
+    def unroll_and_optimize(self, loop, call_pure_results=None):  # optimize a preamble built from loop, then loop itself as the next iteration; returns the preamble
+        operations =  loop.operations
+        jumpop = operations[-1]
+        assert jumpop.getopnum() == rop.JUMP  # the input trace must end in a JUMP
+        inputargs = loop.inputargs
+
+        jump_args = jumpop.getarglist()[:]
+        operations = operations[:-1]  # body without the final JUMP
+        cloned_operations = [op.clone() for op in operations]  # cloned before optimizing the preamble; the clones become the loop body below
+
+        preamble = TreeLoop('preamble')
+        preamble.inputargs = inputargs
+        preamble.start_resumedescr = FakeDescrWithSnapshot()
+
+        token = JitCellToken() 
+        preamble.operations = [ResOperation(rop.LABEL, inputargs, None, descr=TargetToken(token))] + \
+                              operations +  \
+                              [ResOperation(rop.JUMP, jump_args, None, descr=token)]
+        self._do_optimize_loop(preamble, call_pure_results)
+
+        assert preamble.operations[-1].getopnum() == rop.LABEL  # the optimized preamble must end in a LABEL, reused as the loop's first op
+
+        inliner = Inliner(inputargs, jump_args)  # NOTE(review): presumably maps each input arg to its jump arg -- confirm against Inliner
+        loop.start_resumedescr = preamble.start_resumedescr
+        loop.operations = [preamble.operations[-1]] + \
+                          [inliner.inline_op(op, clone=False) for op in cloned_operations] + \
+                          [ResOperation(rop.JUMP, [inliner.inline_arg(a) for a in jump_args],
+                                        None, descr=token)] 
+                          #[inliner.inline_op(jumpop)]
+        assert loop.operations[-1].getopnum() == rop.JUMP
+        assert loop.operations[0].getopnum() == rop.LABEL
+        loop.inputargs = loop.operations[0].getarglist()  # the loop's inputs are the arguments of its leading LABEL
+
+        self._do_optimize_loop(loop, call_pure_results)
+        extra_same_as = []
+        while loop.operations[0].getopnum() != rop.LABEL:  # strip whatever the optimizer emitted ahead of the LABEL
+            extra_same_as.append(loop.operations[0])
+            del loop.operations[0]
+
+        # Hack to prevent random order of same_as ops
+        extra_same_as.sort(key=lambda op: str(preamble.operations).find(str(op.getarg(0))))  # order by where the first argument's text first occurs in the printed preamble
+
+        for op in extra_same_as:
+            preamble.operations.insert(-1, op)  # insert(-1, ...) places each op just before the preamble's last operation
+
+        return preamble
+        
+
+class FakeDescr(compile.ResumeGuardDescr):  # test descr without any snapshot data
+    def clone_if_mutable(self):  # "cloning" just creates a fresh FakeDescr
+        return FakeDescr()
+    def __eq__(self, other):  # equal to any other FakeDescr instance
+        return isinstance(other, FakeDescr)
+
+class FakeDescrWithSnapshot(compile.ResumeGuardDescr):  # test descr carrying a stub rd_snapshot
+    class rd_snapshot:  # two-level stub chain: this snapshot and its prev, both with empty boxes
+        class prev:
+            prev = None
+            boxes = []
+        boxes = []
+    def clone_if_mutable(self):  # "cloning" just creates a fresh FakeDescrWithSnapshot
+        return FakeDescrWithSnapshot()
+    def __eq__(self, other):  # equal to any Storage instance as well as to any FakeDescrWithSnapshot
+        return isinstance(other, Storage) or isinstance(other, FakeDescrWithSnapshot)
+
+
+def convert_old_style_to_targets(loop, jump):  # build a new TreeLoop: loop's operations with a LABEL(inputargs) prepended
+    newloop = TreeLoop(loop.name)
+    newloop.inputargs = loop.inputargs
+    newloop.operations = [ResOperation(rop.LABEL, loop.inputargs, None, descr=FakeDescr())] + \
+                      loop.operations
+    if not jump:  # jump=False: the final JUMP is replaced by a LABEL with the same arguments
+        assert newloop.operations[-1].getopnum() == rop.JUMP
+        newloop.operations[-1] = ResOperation(rop.LABEL, newloop.operations[-1].getarglist(), None, descr=FakeDescr())
+    return newloop
 
 # ____________________________________________________________
 
diff --git a/pypy/jit/metainterp/optimizeopt/unroll.py b/pypy/jit/metainterp/optimizeopt/unroll.py
--- a/pypy/jit/metainterp/optimizeopt/unroll.py
+++ b/pypy/jit/metainterp/optimizeopt/unroll.py
@@ -1,11 +1,12 @@
 from pypy.jit.codewriter.effectinfo import EffectInfo
-from pypy.jit.metainterp.optimizeopt.virtualstate import VirtualStateAdder, ShortBoxes
+from pypy.jit.metainterp.optimizeopt.virtualstate import VirtualStateAdder, ShortBoxes, BadVirtualState
 from pypy.jit.metainterp.compile import ResumeGuardDescr
-from pypy.jit.metainterp.history import TreeLoop, LoopToken
+from pypy.jit.metainterp.history import TreeLoop, TargetToken, JitCellToken
 from pypy.jit.metainterp.jitexc import JitException
 from pypy.jit.metainterp.optimize import InvalidLoop, RetraceLoop
 from pypy.jit.metainterp.optimizeopt.optimizer import *
 from pypy.jit.metainterp.optimizeopt.generalize import KillHugeIntBounds
+from pypy.jit.metainterp.inliner import Inliner
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.metainterp.resume import Snapshot
 from pypy.rlib.debug import debug_print
@@ -13,63 +14,11 @@
 
 # FIXME: Introduce some VirtualOptimizer super class instead
 
-def optimize_unroll(metainterp_sd, loop, optimizations):
+def optimize_unroll(metainterp_sd, loop, optimizations, inline_short_preamble=True):
     opt = UnrollOptimizer(metainterp_sd, loop, optimizations)
+    opt.inline_short_preamble = inline_short_preamble
     opt.propagate_all_forward()
 
-class Inliner(object):
-    def __init__(self, inputargs, jump_args):
-        assert len(inputargs) == len(jump_args)
-        self.argmap = {}
-        for i in range(len(inputargs)):
-            if inputargs[i] in self.argmap:
-                assert self.argmap[inputargs[i]] == jump_args[i]
-            else:
-                self.argmap[inputargs[i]] = jump_args[i]
-        self.snapshot_map = {None: None}
-
-    def inline_op(self, newop, ignore_result=False, clone=True,
-                  ignore_failargs=False):
-        if clone:
-            newop = newop.clone()
-        args = newop.getarglist()
-        newop.initarglist([self.inline_arg(a) for a in args])
-
-        if newop.is_guard():
-            args = newop.getfailargs()
-            if args and not ignore_failargs:
-                newop.setfailargs([self.inline_arg(a) for a in args])
-            else:
-                newop.setfailargs([])
-
-        if newop.result and not ignore_result:
-            old_result = newop.result
-            newop.result = newop.result.clonebox()
-            self.argmap[old_result] = newop.result
-
-        self.inline_descr_inplace(newop.getdescr())
-
-        return newop
-
-    def inline_descr_inplace(self, descr):
-        if isinstance(descr, ResumeGuardDescr):
-            descr.rd_snapshot = self.inline_snapshot(descr.rd_snapshot)
-
-    def inline_arg(self, arg):
-        if arg is None:
-            return None
-        if isinstance(arg, Const):
-            return arg
-        return self.argmap[arg]
-
-    def inline_snapshot(self, snapshot):
-        if snapshot in self.snapshot_map:
-            return self.snapshot_map[snapshot]
-        boxes = [self.inline_arg(a) for a in snapshot.boxes]
-        new_snapshot = Snapshot(self.inline_snapshot(snapshot.prev), boxes)
-        self.snapshot_map[snapshot] = new_snapshot
-        return new_snapshot
-
 class UnrollableOptimizer(Optimizer):
     def setup(self):
         self.importable_values = {}
@@ -101,14 +50,13 @@
     become the preamble or entry bridge (don't think there is a
     distinction anymore)"""
 
+    inline_short_preamble = True
+    did_import = False
+    
     def __init__(self, metainterp_sd, loop, optimizations):
         self.optimizer = UnrollableOptimizer(metainterp_sd, loop, optimizations)
-        self.cloned_operations = []
-        for op in self.optimizer.loop.operations:
-            newop = op.clone()
-            self.cloned_operations.append(newop)
 
-    def fix_snapshot(self, loop, jump_args, snapshot):
+    def fix_snapshot(self, jump_args, snapshot):
         if snapshot is None:
             return None
         snapshot_args = snapshot.boxes 
@@ -116,116 +64,348 @@
         for a in snapshot_args:
             a = self.getvalue(a).get_key_box()
             new_snapshot_args.append(a)
-        prev = self.fix_snapshot(loop, jump_args, snapshot.prev)
+        prev = self.fix_snapshot(jump_args, snapshot.prev)
         return Snapshot(prev, new_snapshot_args)
             
     def propagate_all_forward(self):
         loop = self.optimizer.loop
+        self.optimizer.clear_newoperations()
+
+
+        start_label = loop.operations[0]
+        if start_label.getopnum() == rop.LABEL:
+            loop.operations = loop.operations[1:]
+            # We need to emit the label op before import_state() as emitting it
+            # will clear heap caches
+            self.optimizer.send_extra_operation(start_label)
+        else:
+            start_label = None            
+
         jumpop = loop.operations[-1]
         if jumpop.getopnum() == rop.JUMP:
             loop.operations = loop.operations[:-1]
         else:
-            loopop = None
+            jumpop = None
 
-        self.optimizer.propagate_all_forward()
+        self.import_state(start_label)
+        self.optimizer.propagate_all_forward(clear=False)
 
+        if not jumpop:
+            return 
+        if self.jump_to_already_compiled_trace(jumpop):
+            # Found a compiled trace to jump to
+            if self.did_import:
 
-        if jumpop:
-            assert jumpop.getdescr() is loop.token
-            jump_args = jumpop.getarglist()
-            jumpop.initarglist([])
+                self.close_bridge(start_label)
+                self.finilize_short_preamble(start_label)
+            return
+
+        cell_token = jumpop.getdescr()
+        assert isinstance(cell_token, JitCellToken)
+        stop_label = ResOperation(rop.LABEL, jumpop.getarglist(), None, TargetToken(cell_token))
+
+        if not self.did_import: # Enforce the previous behaviour of always peeling  exactly one iteration (for now)
             self.optimizer.flush()
+            KillHugeIntBounds(self.optimizer).apply()
 
-            KillHugeIntBounds(self.optimizer).apply()
+            loop.operations = self.optimizer.get_newoperations()
+            self.export_state(stop_label)
+            loop.operations.append(stop_label)            
+        else:
+            assert stop_label
+            assert start_label
+            stop_target = stop_label.getdescr()
+            start_target = start_label.getdescr()
+            assert isinstance(stop_target, TargetToken)
+            assert isinstance(start_target, TargetToken)
+            assert stop_target.targeting_jitcell_token is start_target.targeting_jitcell_token
+            jumpop = ResOperation(rop.JUMP, stop_label.getarglist(), None, descr=start_label.getdescr())
+
+            self.close_loop(jumpop)
+            self.finilize_short_preamble(start_label)
+
+    def export_state(self, targetop):
+        original_jump_args = targetop.getarglist()
+        jump_args = [self.getvalue(a).get_key_box() for a in original_jump_args]
+
+        assert self.optimizer.loop.start_resumedescr
+        start_resumedescr = self.optimizer.loop.start_resumedescr.clone_if_mutable()
+        assert isinstance(start_resumedescr, ResumeGuardDescr)
+        start_resumedescr.rd_snapshot = self.fix_snapshot(jump_args, start_resumedescr.rd_snapshot)
+        # FIXME: I don't think we need fix_snapshot anymore
+
+        modifier = VirtualStateAdder(self.optimizer)
+        virtual_state = modifier.get_virtual_state(jump_args)
             
-            loop.preamble.operations = self.optimizer.get_newoperations()
-            jump_args = [self.getvalue(a).get_key_box() for a in jump_args]
+        values = [self.getvalue(arg) for arg in jump_args]
+        inputargs = virtual_state.make_inputargs(values, self.optimizer)
+        short_inputargs = virtual_state.make_inputargs(values, self.optimizer, keyboxes=True)
 
-            start_resumedescr = loop.preamble.start_resumedescr.clone_if_mutable()
-            self.start_resumedescr = start_resumedescr
-            assert isinstance(start_resumedescr, ResumeGuardDescr)
-            start_resumedescr.rd_snapshot = self.fix_snapshot(loop, jump_args,
-                                                              start_resumedescr.rd_snapshot)
+        constant_inputargs = {}
+        for box in jump_args: 
+            const = self.get_constant_box(box)
+            if const:
+                constant_inputargs[box] = const
 
-            modifier = VirtualStateAdder(self.optimizer)
-            virtual_state = modifier.get_virtual_state(jump_args)
+        short_boxes = ShortBoxes(self.optimizer, inputargs + constant_inputargs.keys())
+        aliased_vrituals = {}
+        for i in range(len(original_jump_args)):
+            if original_jump_args[i] is not jump_args[i]:
+                if values[i].is_virtual():
+                    aliased_vrituals[original_jump_args[i]] = jump_args[i] 
+                else:
+                    short_boxes.alias(original_jump_args[i], jump_args[i])
+
+        self.optimizer.clear_newoperations()
+        for box in short_inputargs:
+            value = self.getvalue(box)
+            if value.is_virtual():
+                value.force_box(self.optimizer)
+        inputarg_setup_ops = self.optimizer.get_newoperations()
+
+        target_token = targetop.getdescr()
+        assert isinstance(target_token, TargetToken)
+        targetop.initarglist(inputargs)
+        target_token.virtual_state = virtual_state
+        target_token.short_preamble = [ResOperation(rop.LABEL, short_inputargs, None)]
+        target_token.start_resumedescr = start_resumedescr
+        target_token.exported_state = ExportedState(constant_inputargs, short_boxes,
+                                                    inputarg_setup_ops, self.optimizer,
+                                                    aliased_vrituals, jump_args)
+
+    def import_state(self, targetop):
+        self.did_import = False
+        if not targetop:
+            # FIXME: Set up some sort of empty state with no virtuals?
+            return
+        target_token = targetop.getdescr()
+        if not target_token:
+            return
+        assert isinstance(target_token, TargetToken)
+        exported_state = target_token.exported_state
+        if not exported_state:
+            # FIXME: Set up some sort of empty state with no virtuals
+            return
+        self.did_import = True
+        
+        self.short = target_token.short_preamble[:]
+        self.short_seen = {}
+        self.short_boxes = exported_state.short_boxes.clone()
+        for box, const in exported_state.constant_inputargs.items():
+            self.short_seen[box] = True
+        self.imported_state = exported_state
+        self.inputargs = targetop.getarglist()
+        self.initial_virtual_state = target_token.virtual_state
+        self.start_resumedescr = target_token.start_resumedescr
+
+        seen = {}
+        for box in self.inputargs:
+            if box in seen:
+                continue
+            seen[box] = True
+            preamble_value = exported_state.optimizer.getvalue(box)
+            value = self.optimizer.getvalue(box)
+            value.import_from(preamble_value, self.optimizer)
+
+        for newbox, oldbox in self.short_boxes.aliases.items():
+            self.optimizer.make_equal_to(newbox, self.optimizer.getvalue(oldbox))
+        
+        # Set up the state of the new optimizer by emitting the
+        # short operations and discarding the result
+        self.optimizer.emitting_dissabled = True
+        for op in exported_state.inputarg_setup_ops:
+            self.optimizer.send_extra_operation(op)
+        seen = {}
+        
+        for op in self.short_boxes.operations():
+            self.ensure_short_op_emitted(op, self.optimizer, seen)
+            if op and op.result:
+                preamble_value = exported_state.optimizer.getvalue(op.result)
+                value = self.optimizer.getvalue(op.result)
+                if not value.is_virtual():
+                    imp = ValueImporter(self, preamble_value, op)
+                    self.optimizer.importable_values[value] = imp
+                newvalue = self.optimizer.getvalue(op.result)
+                newresult = newvalue.get_key_box()
+                if newresult is not op.result and not newvalue.is_constant():
+                    self.short_boxes.alias(newresult, op.result)
+                    op = ResOperation(rop.SAME_AS, [op.result], newresult)
+                    self.optimizer._newoperations = [op] + self.optimizer._newoperations # XXX
+                    #self.optimizer.getvalue(op.result).box = op.result # FIXME: HACK!!!
+        self.optimizer.flush()
+        self.optimizer.emitting_dissabled = False
+
+        for box, key_box in exported_state.aliased_vrituals.items():
+            self.optimizer.make_equal_to(box, self.getvalue(key_box))
+
+    def close_bridge(self, start_label):
+        inputargs = self.inputargs        
+        short_jumpargs = inputargs[:]
+
+        # We don't need to inline the short preamble we are creating, as we are connecting
+        # the bridge to a different trace with a different short preamble
+        self.short_inliner = None
+        
+        newoperations = self.optimizer.get_newoperations()
+        self.boxes_created_this_iteration = {}
+        i = 0
+        while newoperations[i].getopnum() != rop.LABEL:
+            i += 1
+        while i < len(newoperations):
+            op = newoperations[i]
+            self.boxes_created_this_iteration[op.result] = True
+            args = op.getarglist()
+            if op.is_guard():
+                args = args + op.getfailargs()
+            for a in args:
+                self.import_box(a, inputargs, short_jumpargs, [])
+            i += 1
+            newoperations = self.optimizer.get_newoperations()
+        self.short.append(ResOperation(rop.JUMP, short_jumpargs, None, descr=start_label.getdescr()))
+        
+    def close_loop(self, jumpop):
+        virtual_state = self.initial_virtual_state
+        short_inputargs = self.short[0].getarglist()
+        constant_inputargs = self.imported_state.constant_inputargs
+        inputargs = self.inputargs
+        short_jumpargs = inputargs[:]
+
+        # Construct jumpargs from the virtual state
+        original_jumpargs = jumpop.getarglist()[:]
+        values = [self.getvalue(arg) for arg in jumpop.getarglist()]
+        try:
+            jumpargs = virtual_state.make_inputargs(values, self.optimizer)
+        except BadVirtualState:
+            raise InvalidLoop
+        jumpop.initarglist(jumpargs)
+
+        # Inline the short preamble at the end of the loop
+        jmp_to_short_args = virtual_state.make_inputargs(values, self.optimizer, keyboxes=True)
+        assert len(short_inputargs) == len(jmp_to_short_args)
+        args = {}
+        for i in range(len(short_inputargs)):
+            if short_inputargs[i] in args:
+                if args[short_inputargs[i]] != jmp_to_short_args[i]:
+                    raise InvalidLoop
+            args[short_inputargs[i]] = jmp_to_short_args[i]
+        self.short_inliner = Inliner(short_inputargs, jmp_to_short_args)
+        for box, const in constant_inputargs.items():
+            self.short_inliner.argmap[box] = const
+        for op in self.short[1:]:
+            newop = self.short_inliner.inline_op(op)
+            self.optimizer.send_extra_operation(newop)
+
+        # Import boxes produced in the preamble but used in the loop
+        newoperations = self.optimizer.get_newoperations()
+        self.boxes_created_this_iteration = {}
+        i = j = 0
+        while newoperations[i].getopnum() != rop.LABEL:
+            i += 1
+        while i < len(newoperations) or j < len(jumpargs):
+            if i == len(newoperations):
+                while j < len(jumpargs):
+                    a = jumpargs[j]
+                    if self.optimizer.loop.logops:
+                        debug_print('J:  ' + self.optimizer.loop.logops.repr_of_arg(a))
+                    self.import_box(a, inputargs, short_jumpargs, jumpargs)
+                    j += 1
+            else:
+                op = newoperations[i]
+
+                self.boxes_created_this_iteration[op.result] = True
+                args = op.getarglist()
+                if op.is_guard():
+                    args = args + op.getfailargs()
+
+                if self.optimizer.loop.logops:
+                    debug_print('OP: ' + self.optimizer.loop.logops.repr_of_resop(op))
+                for a in args:
+                    if self.optimizer.loop.logops:
+                        debug_print('A:  ' + self.optimizer.loop.logops.repr_of_arg(a))
+                    self.import_box(a, inputargs, short_jumpargs, jumpargs)
+                i += 1
+            newoperations = self.optimizer.get_newoperations()
+
+        jumpop.initarglist(jumpargs)
+        self.optimizer.send_extra_operation(jumpop)
+        self.short.append(ResOperation(rop.JUMP, short_jumpargs, None, descr=jumpop.getdescr()))
+
+        # Verify that the virtual state at the end of the loop is one
+        # that is compatible with the virtual state at the start of the loop
+        modifier = VirtualStateAdder(self.optimizer)
+        final_virtual_state = modifier.get_virtual_state(original_jumpargs)
+        debug_start('jit-log-virtualstate')
+        virtual_state.debug_print('Closed loop with ')
+        bad = {}
+        if not virtual_state.generalization_of(final_virtual_state, bad):
+            # We ended up with a virtual state that is not compatible
+            # and we are thus unable to jump to the start of the loop
+            final_virtual_state.debug_print("Bad virtual state at end of loop, ",
+                                            bad)
+            debug_stop('jit-log-virtualstate')
+            raise InvalidLoop
             
-            values = [self.getvalue(arg) for arg in jump_args]
-            inputargs = virtual_state.make_inputargs(values, self.optimizer)
-            short_inputargs = virtual_state.make_inputargs(values, self.optimizer,
-                                                           keyboxes=True)
+        debug_stop('jit-log-virtualstate')
 
-            self.constant_inputargs = {}
-            for box in jump_args: 
-                const = self.get_constant_box(box)
-                if const:
-                    self.constant_inputargs[box] = const
+        maxguards = self.optimizer.metainterp_sd.warmrunnerdesc.memory_manager.max_retrace_guards
+        if self.optimizer.emitted_guards > maxguards:
+            target_token = jumpop.getdescr()
+            assert isinstance(target_token, TargetToken)
+            target_token.targeting_jitcell_token.retraced_count = sys.maxint
+            
+    def finilize_short_preamble(self, start_label):
+        short = self.short
+        assert short[-1].getopnum() == rop.JUMP
+        target_token = start_label.getdescr()
+        assert isinstance(target_token, TargetToken)
 
-            sb = ShortBoxes(self.optimizer, inputargs + self.constant_inputargs.keys())
-            self.short_boxes = sb
+        # Turn guards into conditional jumps to the preamble
+        for i in range(len(short)):
+            op = short[i]
+            if op.is_guard():
+                op = op.clone()
+                op.setfailargs(None)
+                descr = target_token.start_resumedescr.clone_if_mutable()
+                op.setdescr(descr)
+                short[i] = op
+
+        # Clone ops and boxes to get private versions
+        short_inputargs = short[0].getarglist()
+        boxmap = {}
+        newargs = [None] * len(short_inputargs)
+        for i in range(len(short_inputargs)):
+            a = short_inputargs[i]
+            if a in boxmap:
+                newargs[i] = boxmap[a]
+            else:
+                newargs[i] = a.clonebox()
+                boxmap[a] = newargs[i]
+        inliner = Inliner(short_inputargs, newargs)
+        for box, const in self.imported_state.constant_inputargs.items():
+            inliner.argmap[box] = const
+        for i in range(len(short)):
+            short[i] = inliner.inline_op(short[i])
+
+        target_token.start_resumedescr = self.start_resumedescr.clone_if_mutable()            
+        inliner.inline_descr_inplace(target_token.start_resumedescr)
+
+        # Forget the values to allow them to be freed
+        for box in short[0].getarglist():
+            box.forget_value()
+        for op in short:
+            if op.result:
+                op.result.forget_value()
+        target_token.short_preamble = self.short
+        target_token.exported_state = None
+
+        
+    def FIXME_old_stuff():
             preamble_optimizer = self.optimizer
             loop.preamble.quasi_immutable_deps = (
                 self.optimizer.quasi_immutable_deps)
             self.optimizer = self.optimizer.new()
             loop.quasi_immutable_deps = self.optimizer.quasi_immutable_deps
 
-            logops = self.optimizer.loop.logops
-            if logops:
-                args = ", ".join([logops.repr_of_arg(arg) for arg in inputargs])
-                debug_print('inputargs:       ' + args)
-                args = ", ".join([logops.repr_of_arg(arg) for arg in short_inputargs])
-                debug_print('short inputargs: ' + args)
-                self.short_boxes.debug_print(logops)
-                
-
-            # Force virtuals amoung the jump_args of the preamble to get the
-            # operations needed to setup the proper state of those virtuals
-            # in the peeled loop
-            inputarg_setup_ops = []
-            preamble_optimizer.clear_newoperations()
-            seen = {}
-            for box in inputargs:
-                if box in seen:
-                    continue
-                seen[box] = True
-                preamble_value = preamble_optimizer.getvalue(box)
-                value = self.optimizer.getvalue(box)
-                value.import_from(preamble_value, self.optimizer)
-            for box in short_inputargs:
-                if box in seen:
-                    continue
-                seen[box] = True
-                value = preamble_optimizer.getvalue(box)
-                value.force_box(preamble_optimizer)
-            inputarg_setup_ops += preamble_optimizer.get_newoperations()
-
-            # Setup the state of the new optimizer by emiting the
-            # short preamble operations and discarding the result
-            self.optimizer.emitting_dissabled = True
-            for op in inputarg_setup_ops:
-                self.optimizer.send_extra_operation(op)
-            seen = {}
-            for op in self.short_boxes.operations():
-                self.ensure_short_op_emitted(op, self.optimizer, seen)
-                if op and op.result:
-                    preamble_value = preamble_optimizer.getvalue(op.result)
-                    value = self.optimizer.getvalue(op.result)
-                    if not value.is_virtual():
-                        imp = ValueImporter(self, preamble_value, op)
-                        self.optimizer.importable_values[value] = imp
-                    newresult = self.optimizer.getvalue(op.result).get_key_box()
-                    if newresult is not op.result:
-                        self.short_boxes.alias(newresult, op.result)
-            self.optimizer.flush()
-            self.optimizer.emitting_dissabled = False
-
-            initial_inputargs_len = len(inputargs)
-            self.inliner = Inliner(loop.inputargs, jump_args)
-
-
-            short = self.inline(inputargs, self.cloned_operations,
-                                loop.inputargs, short_inputargs,
-                                virtual_state)
             
             loop.inputargs = inputargs
             args = [preamble_optimizer.getvalue(self.short_boxes.original(a)).force_box(preamble_optimizer)\
@@ -241,149 +421,7 @@
                 loop.preamble.token.retraced_count = sys.maxint
 
             if short:
-                assert short[-1].getopnum() == rop.JUMP
-                short[-1].setdescr(loop.token)
-
-                # Turn guards into conditional jumps to the preamble
-                for i in range(len(short)):
-                    op = short[i]
-                    if op.is_guard():
-                        op = op.clone()
-                        op.setfailargs(None)
-                        descr = self.start_resumedescr.clone_if_mutable()
-                        op.setdescr(descr)
-                        short[i] = op
-
-                short_loop = TreeLoop('short preamble')
-                short_loop.inputargs = short_inputargs
-                short_loop.operations = short
-
-                # Clone ops and boxes to get private versions and
-                boxmap = {}
-                newargs = [None] * len(short_loop.inputargs)
-                for i in range(len(short_loop.inputargs)):
-                    a = short_loop.inputargs[i]
-                    if a in boxmap:
-                        newargs[i] = boxmap[a]
-                    else:
-                        newargs[i] = a.clonebox()
-                        boxmap[a] = newargs[i]
-                inliner = Inliner(short_loop.inputargs, newargs)
-                for box, const in self.constant_inputargs.items():
-                    inliner.argmap[box] = const
-                short_loop.inputargs = newargs
-                ops = [inliner.inline_op(op) for op in short_loop.operations]
-                short_loop.operations = ops
-                descr = self.start_resumedescr.clone_if_mutable()
-                inliner.inline_descr_inplace(descr)
-                short_loop.start_resumedescr = descr
-
-                assert isinstance(loop.preamble.token, LoopToken)
-                if loop.preamble.token.short_preamble:
-                    loop.preamble.token.short_preamble.append(short_loop)
-                else:
-                    loop.preamble.token.short_preamble = [short_loop]
-                short_loop.virtual_state = virtual_state
-
-                # Forget the values to allow them to be freed
-                for box in short_loop.inputargs:
-                    box.forget_value()
-                for op in short_loop.operations:
-                    if op.result:
-                        op.result.forget_value()
-
-    def inline(self, inputargs, loop_operations, loop_args, short_inputargs, virtual_state):
-        inliner = self.inliner
-
-        short_jumpargs = inputargs[:]
-
-        short = self.short = []
-        short_seen = self.short_seen = {}
-        for box, const in self.constant_inputargs.items():
-            short_seen[box] = True
-
-        # This loop is equivalent to the main optimization loop in
-        # Optimizer.propagate_all_forward
-        jumpop = None
-        for newop in loop_operations:
-            newop = inliner.inline_op(newop, clone=False)
-            if newop.getopnum() == rop.JUMP:
-                jumpop = newop
-                break
-
-            #self.optimizer.first_optimization.propagate_forward(newop)
-            self.optimizer.send_extra_operation(newop)
-
-        self.boxes_created_this_iteration = {}
-
-        assert jumpop
-        original_jumpargs = jumpop.getarglist()[:]
-        values = [self.getvalue(arg) for arg in jumpop.getarglist()]
-        jumpargs = virtual_state.make_inputargs(values, self.optimizer)
-        jumpop.initarglist(jumpargs)
-        jmp_to_short_args = virtual_state.make_inputargs(values, self.optimizer,
-                                                         keyboxes=True)
-        self.short_inliner = Inliner(short_inputargs, jmp_to_short_args)
-        
-        for box, const in self.constant_inputargs.items():
-            self.short_inliner.argmap[box] = const
-
-        for op in short:
-            newop = self.short_inliner.inline_op(op)
-            self.optimizer.send_extra_operation(newop)
-        
-        newoperations = self.optimizer.get_newoperations()
-
-        i = j = 0
-        while i < len(newoperations) or j < len(jumpargs):
-            if i == len(newoperations):
-                while j < len(jumpargs):
-                    a = jumpargs[j]
-                    if self.optimizer.loop.logops:
-                        debug_print('J:  ' + self.optimizer.loop.logops.repr_of_arg(a))
-                    self.import_box(a, inputargs, short, short_jumpargs,
-                                    jumpargs, short_seen)
-                    j += 1
-            else:
-                op = newoperations[i]
-
-                self.boxes_created_this_iteration[op.result] = True
-                args = op.getarglist()
-                if op.is_guard():
-                    args = args + op.getfailargs()
-
-                if self.optimizer.loop.logops:
-                    debug_print('OP: ' + self.optimizer.loop.logops.repr_of_resop(op))
-                for a in args:
-                    if self.optimizer.loop.logops:
-                        debug_print('A:  ' + self.optimizer.loop.logops.repr_of_arg(a))
-                    self.import_box(a, inputargs, short, short_jumpargs,
-                                    jumpargs, short_seen)
-                i += 1
-            newoperations = self.optimizer.get_newoperations()
-
-        jumpop.initarglist(jumpargs)
-        self.optimizer.send_extra_operation(jumpop)
-        short.append(ResOperation(rop.JUMP, short_jumpargs, None))
-
-        modifier = VirtualStateAdder(self.optimizer)
-        final_virtual_state = modifier.get_virtual_state(original_jumpargs)
-        debug_start('jit-log-virtualstate')
-        virtual_state.debug_print('Closed loop with ')
-        bad = {}
-        if not virtual_state.generalization_of(final_virtual_state, bad):
-            # We ended up with a virtual state that is not compatible
-            # and we are thus unable to jump to the start of the loop
-            # XXX Is it possible to end up here? If so, consider:
-            #    - Fallback on having the preamble jump to itself?
-            #    - Would virtual_state.generate_guards make sense here?
-            final_virtual_state.debug_print("Bad virtual state at end of loop, ",
-                                            bad)
-            debug_stop('jit-log-virtualstate')
-            raise InvalidLoop
-        debug_stop('jit-log-virtualstate')
-        
-        return short
+                pass
 
     def ensure_short_op_emitted(self, op, optimizer, seen):
         if op is None:
@@ -399,19 +437,18 @@
             guard = ResOperation(rop.GUARD_NO_OVERFLOW, [], None)
             optimizer.send_extra_operation(guard)
 
-    def add_op_to_short(self, op, short, short_seen, emit=True, guards_needed=False):
+    def add_op_to_short(self, op, emit=True, guards_needed=False):
         if op is None:
             return None
-        if op.result is not None and op.result in short_seen:
-            if emit:
+        if op.result is not None and op.result in self.short_seen:
+            if emit and self.short_inliner:                
                 return self.short_inliner.inline_arg(op.result)
             else:
                 return None
         
         for a in op.getarglist():
-            if not isinstance(a, Const) and a not in short_seen:
-                self.add_op_to_short(self.short_boxes.producer(a), short, short_seen,
-                                     emit, guards_needed)
+            if not isinstance(a, Const) and a not in self.short_seen:
+                self.add_op_to_short(self.short_boxes.producer(a), emit, guards_needed)
         if op.is_guard():
             descr = self.start_resumedescr.clone_if_mutable()
             op.setdescr(descr)
@@ -421,9 +458,9 @@
         else:
             value_guards = []            
 
-        short.append(op)
-        short_seen[op.result] = True
-        if emit:
+        self.short.append(op)
+        self.short_seen[op.result] = True
+        if emit and self.short_inliner:
             newop = self.short_inliner.inline_op(op)
             self.optimizer.send_extra_operation(newop)
         else:
@@ -432,23 +469,22 @@
         if op.is_ovf():
             # FIXME: ensure that GUARD_OVERFLOW:ed ops not end up here
             guard = ResOperation(rop.GUARD_NO_OVERFLOW, [], None)
-            self.add_op_to_short(guard, short, short_seen, emit, guards_needed)
+            self.add_op_to_short(guard, emit, guards_needed)
         for guard in value_guards:
-            self.add_op_to_short(guard, short, short_seen, emit, guards_needed)
+            self.add_op_to_short(guard, emit, guards_needed)
 
         if newop:
             return newop.result
         return None
         
-    def import_box(self, box, inputargs, short, short_jumpargs,
-                   jumpargs, short_seen):
+    def import_box(self, box, inputargs, short_jumpargs, jumpargs):
         if isinstance(box, Const) or box in inputargs:
             return
         if box in self.boxes_created_this_iteration:
             return
 
         short_op = self.short_boxes.producer(box)
-        newresult = self.add_op_to_short(short_op, short, short_seen)
+        newresult = self.add_op_to_short(short_op)
 
         short_jumpargs.append(short_op.result)
         inputargs.append(box)
@@ -456,98 +492,94 @@
         if box in self.optimizer.values:
             box = self.optimizer.values[box].force_box(self.optimizer)
         jumpargs.append(box)
-        
 
-class OptInlineShortPreamble(Optimization):
-    def __init__(self, retraced):
-        self.retraced = retraced
+    def jump_to_already_compiled_trace(self, jumpop):
+        assert jumpop.getopnum() == rop.JUMP
+        cell_token = jumpop.getdescr()
 
-    def new(self):
-        return OptInlineShortPreamble(self.retraced)
+        assert isinstance(cell_token, JitCellToken)
+        if not cell_token.target_tokens:
+            return False
 
-    def propagate_forward(self, op):
-        if op.getopnum() == rop.JUMP:
-            loop_token = op.getdescr()
-            assert isinstance(loop_token, LoopToken)
-            short = loop_token.short_preamble
-            if short:
-                args = op.getarglist()
-                modifier = VirtualStateAdder(self.optimizer)
-                virtual_state = modifier.get_virtual_state(args)
-                debug_start('jit-log-virtualstate')
-                virtual_state.debug_print("Looking for ")
+        if not self.inline_short_preamble:
+            assert cell_token.target_tokens[0].virtual_state is None
+            jumpop.setdescr(cell_token.target_tokens[0])
+            self.optimizer.send_extra_operation(jumpop)
+            return True
 
-                for sh in short:
-                    ok = False
-                    extra_guards = []
+        args = jumpop.getarglist()
+        modifier = VirtualStateAdder(self.optimizer)
+        virtual_state = modifier.get_virtual_state(args)
+        debug_start('jit-log-virtualstate')
+        virtual_state.debug_print("Looking for ")
 
-                    bad = {}
-                    debugmsg = 'Did not match '
-                    if sh.virtual_state.generalization_of(virtual_state, bad):
-                        ok = True
-                        debugmsg = 'Matched '
-                    else:
-                        try:
-                            cpu = self.optimizer.cpu
-                            sh.virtual_state.generate_guards(virtual_state,
-                                                             args, cpu,
-                                                             extra_guards)
+        for target in cell_token.target_tokens:
+            if not target.virtual_state:
+                continue
+            ok = False
+            extra_guards = []
 
-                            ok = True
-                            debugmsg = 'Guarded to match '
-                        except InvalidLoop:
-                            pass
-                    sh.virtual_state.debug_print(debugmsg, bad)
-                    
-                    if ok:
-                        debug_stop('jit-log-virtualstate')
+            bad = {}
+            debugmsg = 'Did not match '
+            if target.virtual_state.generalization_of(virtual_state, bad):
+                ok = True
+                debugmsg = 'Matched '
+            else:
+                try:
+                    cpu = self.optimizer.cpu
+                    target.virtual_state.generate_guards(virtual_state,
+                                                         args, cpu,
+                                                         extra_guards)
 
-                        values = [self.getvalue(arg)
-                                  for arg in op.getarglist()]
-                        args = sh.virtual_state.make_inputargs(values, self.optimizer,
-                                                               keyboxes=True)
-                        inliner = Inliner(sh.inputargs, args)
-                        
-                        for guard in extra_guards:
-                            if guard.is_guard():
-                                descr = sh.start_resumedescr.clone_if_mutable()
-                                inliner.inline_descr_inplace(descr)
-                                guard.setdescr(descr)
-                            self.emit_operation(guard)
-                        
-                        try:
-                            for shop in sh.operations:
-                                newop = inliner.inline_op(shop)
-                                self.emit_operation(newop)
-                        except InvalidLoop:
-                            debug_print("Inlining failed unexpectedly",
-                                        "jumping to preamble instead")
-                            self.emit_operation(op)
-                        return
+                    ok = True
+                    debugmsg = 'Guarded to match '
+                except InvalidLoop:
+                    pass
+            target.virtual_state.debug_print(debugmsg, bad)
+
+            if ok:
                 debug_stop('jit-log-virtualstate')
-                retraced_count = loop_token.retraced_count
-                limit = self.optimizer.metainterp_sd.warmrunnerdesc.memory_manager.retrace_limit
-                if not self.retraced and retraced_count<limit:
-                    loop_token.retraced_count += 1
-                    if not loop_token.failed_states:
-                        debug_print("Retracing (%d of %d)" % (retraced_count,
-                                                              limit))
-                        raise RetraceLoop
-                    for failed in loop_token.failed_states:
-                        if failed.generalization_of(virtual_state):
-                            # Retracing once more will most likely fail again
-                            break
-                    else:
-                        debug_print("Retracing (%d of %d)" % (retraced_count,
-                                                              limit))
 
-                        raise RetraceLoop
-                else:
-                    if not loop_token.failed_states:
-                        loop_token.failed_states=[virtual_state]
-                    else:
-                        loop_token.failed_states.append(virtual_state)
-        self.emit_operation(op)
+                values = [self.getvalue(arg)
+                          for arg in jumpop.getarglist()]
+                args = target.virtual_state.make_inputargs(values, self.optimizer,
+                                                           keyboxes=True)
+                short_inputargs = target.short_preamble[0].getarglist()
+                inliner = Inliner(short_inputargs, args)
+
+                for guard in extra_guards:
+                    if guard.is_guard():
+                        descr = target.start_resumedescr.clone_if_mutable()
+                        inliner.inline_descr_inplace(descr)
+                        guard.setdescr(descr)
+                    self.optimizer.send_extra_operation(guard)
+
+                try:
+                    for shop in target.short_preamble[1:]:
+                        newop = inliner.inline_op(shop)
+                        self.optimizer.send_extra_operation(newop)
+                except InvalidLoop:
+                    debug_print("Inlining failed unexpectedly",
+                                "jumping to preamble instead")
+                    assert cell_token.target_tokens[0].virtual_state is None
+                    jumpop.setdescr(cell_token.target_tokens[0])
+                    self.optimizer.send_extra_operation(jumpop)
+                return True
+        debug_stop('jit-log-virtualstate')
+
+        if self.did_import:
+            return False
+        limit = self.optimizer.metainterp_sd.warmrunnerdesc.memory_manager.retrace_limit
+        if cell_token.retraced_count<limit:
+            cell_token.retraced_count += 1
+            debug_print('Retracing (%d/%d)' % (cell_token.retraced_count, limit))
+            return False
+        else:
+            debug_print("Retrace count reached, jumping to preamble")
+            assert cell_token.target_tokens[0].virtual_state is None
+            jumpop.setdescr(cell_token.target_tokens[0])
+            self.optimizer.send_extra_operation(jumpop)
+            return True
 
 class ValueImporter(object):
     def __init__(self, unroll, value, op):
@@ -557,5 +589,15 @@
 
     def import_value(self, value):
         value.import_from(self.preamble_value, self.unroll.optimizer)
-        self.unroll.add_op_to_short(self.op, self.unroll.short, self.unroll.short_seen, False, True)        
-        
+        self.unroll.add_op_to_short(self.op, False, True)        
+
+class ExportedState(object):
+    def __init__(self, constant_inputargs,
+                 short_boxes, inputarg_setup_ops, optimizer, aliased_vrituals,
+                 jump_args):
+        self.constant_inputargs = constant_inputargs
+        self.short_boxes = short_boxes
+        self.inputarg_setup_ops = inputarg_setup_ops
+        self.optimizer = optimizer
+        self.aliased_vrituals = aliased_vrituals
+        self.jump_args = jump_args
diff --git a/pypy/jit/metainterp/optimizeopt/util.py b/pypy/jit/metainterp/optimizeopt/util.py
--- a/pypy/jit/metainterp/optimizeopt/util.py
+++ b/pypy/jit/metainterp/optimizeopt/util.py
@@ -148,7 +148,7 @@
                 assert op1.result.same_box(remap[op2.result])
         else:
             remap[op2.result] = op1.result
-        if op1.getopnum() != rop.JUMP:      # xxx obscure
+        if op1.getopnum() not in (rop.JUMP, rop.LABEL):      # xxx obscure
             assert op1.getdescr() == op2.getdescr()
         if op1.getfailargs() or op2.getfailargs():
             assert len(op1.getfailargs()) == len(op2.getfailargs())
@@ -171,3 +171,4 @@
     assert len(oplist1) == len(oplist2)
     print '-'*totwidth
     return True
+
diff --git a/pypy/jit/metainterp/optimizeopt/virtualstate.py b/pypy/jit/metainterp/optimizeopt/virtualstate.py
--- a/pypy/jit/metainterp/optimizeopt/virtualstate.py
+++ b/pypy/jit/metainterp/optimizeopt/virtualstate.py
@@ -14,6 +14,9 @@
 from pypy.rlib.objectmodel import we_are_translated
 import os
 
+class BadVirtualState(Exception):
+    pass
+
 class AbstractVirtualStateInfo(resume.AbstractVirtualInfo):
     position = -1
 
@@ -103,10 +106,15 @@
         raise NotImplementedError
 
     def enum_forced_boxes(self, boxes, value, optimizer):
-        assert isinstance(value, virtualize.AbstractVirtualStructValue)
-        assert value.is_virtual()
+        if not isinstance(value, virtualize.AbstractVirtualStructValue):
+            raise BadVirtualState
+        if not value.is_virtual():
+            raise BadVirtualState
         for i in range(len(self.fielddescrs)):
-            v = value._fields[self.fielddescrs[i]]
+            try:
+                v = value._fields[self.fielddescrs[i]]
+            except KeyError:
+                raise BadVirtualState
             s = self.fieldstate[i]
             if s.position > self.position:
                 s.enum_forced_boxes(boxes, v, optimizer)
@@ -180,10 +188,15 @@
             self.arraydescr is other.arraydescr)
 
     def enum_forced_boxes(self, boxes, value, optimizer):
-        assert isinstance(value, virtualize.VArrayValue)
-        assert value.is_virtual()
+        if not isinstance(value, virtualize.VArrayValue):
+            raise BadVirtualState
+        if not value.is_virtual():
+            raise BadVirtualState
         for i in range(len(self.fieldstate)):
-            v = value._items[i]
+            try:
+                v = value._items[i]
+            except IndexError:
+                raise BadVirtualState
             s = self.fieldstate[i]
             if s.position > self.position:
                 s.enum_forced_boxes(boxes, v, optimizer)
@@ -248,12 +261,19 @@
             s.enum(virtual_state)
 
     def enum_forced_boxes(self, boxes, value, optimizer):
-        assert isinstance(value, virtualize.VArrayStructValue)
-        assert value.is_virtual()
+        if not isinstance(value, virtualize.VArrayStructValue):
+            raise BadVirtualState
+        if not value.is_virtual():
+            raise BadVirtualState
         p = 0
         for i in range(len(self.fielddescrs)):
             for j in range(len(self.fielddescrs[i])):
-                v = value._items[i][self.fielddescrs[i][j]]
+                try:
+                    v = value._items[i][self.fielddescrs[i][j]]
+                except IndexError:
+                    raise BadVirtualState
+                except KeyError:
+                    raise BadVirtualState
                 s = self.fieldstate[p]
                 if s.position > self.position:
                     s.enum_forced_boxes(boxes, v, optimizer)
@@ -546,18 +566,27 @@
         self.aliases = {}
         self.rename = {}
         self.optimizer = optimizer
-        for box in surviving_boxes:
-            self.potential_ops[box] = None
-        optimizer.produce_potential_short_preamble_ops(self)
 
-        self.short_boxes = {}
-        self.short_boxes_in_production = {}
+        if surviving_boxes is not None:
+            for box in surviving_boxes:
+                self.potential_ops[box] = None
+            optimizer.produce_potential_short_preamble_ops(self)
 
-        for box in self.potential_ops.keys():
-            try:
-                self.produce_short_preamble_box(box)
-            except BoxNotProducable:
-                pass
+            self.short_boxes = {}
+            self.short_boxes_in_production = {}
+
+            for box in self.potential_ops.keys():
+                try:
+                    self.produce_short_preamble_box(box)
+                except BoxNotProducable:
+                    pass
+
+    def clone(self):
+        sb = ShortBoxes(self.optimizer, None)
+        sb.aliases.update(self.aliases)
+        sb.short_boxes = {}
+        sb.short_boxes.update(self.short_boxes)
+        return sb
 
     def prioritized_alternatives(self, box):
         if box not in self.alternatives:
@@ -598,6 +627,7 @@
                 newbox = newop.result = op.result.clonebox()
                 self.short_boxes[newop.result] = newop
             value = self.optimizer.getvalue(box)
+            self.optimizer.emit_operation(ResOperation(rop.SAME_AS, [box], newbox))
             self.optimizer.make_equal_to(newbox, value)
         else:
             self.short_boxes[box] = op
diff --git a/pypy/jit/metainterp/pyjitpl.py b/pypy/jit/metainterp/pyjitpl.py
--- a/pypy/jit/metainterp/pyjitpl.py
+++ b/pypy/jit/metainterp/pyjitpl.py
@@ -8,7 +8,7 @@
 
 from pypy.jit.metainterp import history, compile, resume
 from pypy.jit.metainterp.history import Const, ConstInt, ConstPtr, ConstFloat
-from pypy.jit.metainterp.history import Box
+from pypy.jit.metainterp.history import Box, TargetToken
 from pypy.jit.metainterp.resoperation import rop
 from pypy.jit.metainterp import executor
 from pypy.jit.metainterp.logger import Logger
@@ -22,7 +22,6 @@
 from pypy.jit.codewriter.jitcode import JitCode, SwitchDictDescr
 from pypy.jit.codewriter import heaptracker
 from pypy.jit.metainterp.optimizeopt.util import args_dict_box
-from pypy.jit.metainterp.optimize import RetraceLoop
 
 # ____________________________________________________________
 
@@ -1567,10 +1566,17 @@
         self.portal_trace_positions = []
         self.free_frames_list = []
         self.last_exc_value_box = None
-        self.retracing_loop_from = None
+        self.partial_trace = None
+        self.retracing_from = -1
         self.call_pure_results = args_dict_box()
         self.heapcache = HeapCache()
 
+    def retrace_needed(self, trace):
+        self.partial_trace = trace
+        self.retracing_from = len(self.history.operations) - 1
+        self.heapcache.reset()
+        
+
     def perform_call(self, jitcode, boxes, greenkey=None):
         # causes the metainterp to enter the given subfunction
         f = self.newframe(jitcode, greenkey)
@@ -1804,7 +1810,7 @@
 
     def _interpret(self):
         # Execute the frames forward until we raise a DoneWithThisFrame,
-        # a ExitFrameWithException, or a GenerateMergePoint exception.
+        # a ExitFrameWithException, or a ContinueRunningNormally exception.
         self.staticdata.stats.entered()
         while True:
             self.framestack[-1].run_one_step()
@@ -1852,8 +1858,6 @@
         self.seen_loop_header_for_jdindex = -1
         try:
             self.interpret()
-        except GenerateMergePoint, gmp:
-            return self.designate_target_loop(gmp)
         except SwitchToBlackhole, stb:
             self.run_blackhole_interp_to_cancel_tracing(stb)
         assert False, "should always raise"
@@ -1888,8 +1892,6 @@
             if self.resumekey_original_loop_token is None:   # very rare case
                 raise SwitchToBlackhole(ABORT_BRIDGE)
             self.interpret()
-        except GenerateMergePoint, gmp:
-            return self.designate_target_loop(gmp)
         except SwitchToBlackhole, stb:
             self.run_blackhole_interp_to_cancel_tracing(stb)
         assert False, "should always raise"
@@ -1937,14 +1939,9 @@
         #   that failed;
         # - if self.resumekey is a ResumeFromInterpDescr, it starts directly
         #   from the interpreter.
-        if not self.retracing_loop_from:
-            try:
-                self.compile_bridge(live_arg_boxes)
-            except RetraceLoop:
-                start = len(self.history.operations)
-                self.current_merge_points.append((live_arg_boxes, start))
-                self.retracing_loop_from = RetraceState(self, live_arg_boxes)
-                return
+        if not self.partial_trace:
+            # FIXME: Support a retrace to be a bridge as well as a loop
+            self.compile_trace(live_arg_boxes, resumedescr)
 
         # raises in case it works -- which is the common case, hopefully,
         # at least for bridges starting from a guard.
@@ -1966,14 +1963,10 @@
             else:
                 # Found!  Compile it as a loop.
                 # raises in case it works -- which is the common case
-                if self.retracing_loop_from and \
-                   self.retracing_loop_from.merge_point == j:
-                    bridge_arg_boxes = self.retracing_loop_from.live_arg_boxes
-                    self.compile_bridge_and_loop(original_boxes, \
-                                                 live_arg_boxes, start,
-                                                 bridge_arg_boxes, resumedescr)
-                else:
-                    self.compile(original_boxes, live_arg_boxes, start, resumedescr)
+                if self.partial_trace:
+                    if  start != self.retracing_from: 
+                        raise SwitchToBlackhole(ABORT_BAD_LOOP) # For now
+                self.compile_loop(original_boxes, live_arg_boxes, start, resumedescr)
                 # creation of the loop was cancelled!
                 self.staticdata.log('cancelled, tracing more...')
                 #self.staticdata.log('cancelled, stopping tracing')
@@ -1983,12 +1976,48 @@
         start = len(self.history.operations)
         self.current_merge_points.append((live_arg_boxes, start))
 
-    def designate_target_loop(self, gmp):
-        loop_token = gmp.target_loop_token
+    def _unpack_boxes(self, boxes, start, stop):
+        ints = []; refs = []; floats = []
+        for i in range(start, stop):
+            box = boxes[i]
+            if   box.type == history.INT: ints.append(box.getint())
+            elif box.type == history.REF: refs.append(box.getref_base())
+            elif box.type == history.FLOAT:floats.append(box.getfloatstorage())
+            else: assert 0
+        return ints[:], refs[:], floats[:]
+
+    def raise_continue_running_normally(self, live_arg_boxes, loop_token):
+        self.history.inputargs = None
+        self.history.operations = None
+        # For simplicity, we just raise ContinueRunningNormally here and
+        # ignore the loop_token passed in.  It means that we go back to
+        # interpreted mode, but it should come back very quickly to the
+        # JIT, find probably the same 'loop_token', and execute it.
+        if we_are_translated():
+            num_green_args = self.jitdriver_sd.num_green_args
+            gi, gr, gf = self._unpack_boxes(live_arg_boxes, 0, num_green_args)
+            ri, rr, rf = self._unpack_boxes(live_arg_boxes, num_green_args,
+                                            len(live_arg_boxes))
+            CRN = self.staticdata.ContinueRunningNormally
+            raise CRN(gi, gr, gf, ri, rr, rf)
+        else:
+            # However, in order to keep the existing tests working
+            # (which are based on the assumption that 'loop_token' is
+            # directly used here), a bit of custom non-translatable code...
+            self._nontranslated_run_directly(live_arg_boxes, loop_token)
+            assert 0, "unreachable"
+
+    def _nontranslated_run_directly(self, live_arg_boxes, loop_token):
+        "NOT_RPYTHON"
+        args = []
         num_green_args = self.jitdriver_sd.num_green_args
-        residual_args = gmp.argboxes[num_green_args:]
-        history.set_future_values(self.cpu, residual_args)
-        return loop_token
+        num_red_args = self.jitdriver_sd.num_red_args
+        for box in live_arg_boxes[num_green_args:num_green_args+num_red_args]:
+            if   box.type == history.INT: args.append(box.getint())
+            elif box.type == history.REF: args.append(box.getref_base())
+            elif box.type == history.FLOAT: args.append(box.getfloatstorage())
+            else: assert 0
+        self.jitdriver_sd.warmstate.execute_assembler(loop_token, *args)
 
     def prepare_resume_from_failure(self, opnum, dont_change_position=False):
         frame = self.framestack[-1]
@@ -2029,54 +2058,57 @@
             from pypy.jit.metainterp.resoperation import opname
             raise NotImplementedError(opname[opnum])
 
-    def get_compiled_merge_points(self, greenkey):
-        """Get the list of looptokens corresponding to the greenkey.
-        Turns the (internal) list of weakrefs into regular refs.
-        """
+    def get_procedure_token(self, greenkey):
         cell = self.jitdriver_sd.warmstate.jit_cell_at_key(greenkey)
-        return cell.get_compiled_merge_points()
+        return cell.get_procedure_token()
+        
+    def compile_loop(self, original_boxes, live_arg_boxes, start, start_resumedescr):
+        num_green_args = self.jitdriver_sd.num_green_args
+        greenkey = original_boxes[:num_green_args]
+        if not self.partial_trace:
+            assert self.get_procedure_token(greenkey) is None or \
+                   self.get_procedure_token(greenkey).target_tokens is None
+        if self.partial_trace:
+            target_token = compile.compile_retrace(self, greenkey, start,
+                                                   original_boxes[num_green_args:],
+                                                   live_arg_boxes[num_green_args:],
+                                                   start_resumedescr, self.partial_trace,
+                                                   self.resumekey)
+        else:
+            target_token = compile.compile_loop(self, greenkey, start,
+                                                original_boxes[num_green_args:],
+                                                live_arg_boxes[num_green_args:],
+                                                start_resumedescr)
+            if target_token is not None:
+                assert isinstance(target_token, TargetToken)
+                self.jitdriver_sd.warmstate.attach_procedure_to_interp(greenkey, target_token.targeting_jitcell_token)
+                self.staticdata.stats.add_jitcell_token(target_token.targeting_jitcell_token)
 
-    def set_compiled_merge_points(self, greenkey, looptokens):
-        cell = self.jitdriver_sd.warmstate.jit_cell_at_key(greenkey)
-        cell.set_compiled_merge_points(looptokens)
 
-    def compile(self, original_boxes, live_arg_boxes, start, start_resumedescr):
-        num_green_args = self.jitdriver_sd.num_green_args
-        original_inputargs = self.history.inputargs
-        self.history.inputargs = original_boxes[num_green_args:]
-        greenkey = original_boxes[:num_green_args]
-        old_loop_tokens = self.get_compiled_merge_points(greenkey)
-        self.history.record(rop.JUMP, live_arg_boxes[num_green_args:], None)
-        loop_token = compile.compile_new_loop(self, old_loop_tokens,
-                                              greenkey, start, start_resumedescr)
-        if loop_token is not None: # raise if it *worked* correctly
-            self.set_compiled_merge_points(greenkey, old_loop_tokens)
-            self.history.inputargs = None
-            self.history.operations = None
-            raise GenerateMergePoint(live_arg_boxes, loop_token)
+        if target_token is not None: # raise if it *worked* correctly
+            assert isinstance(target_token, TargetToken)
+            jitcell_token = target_token.targeting_jitcell_token
+            self.raise_continue_running_normally(live_arg_boxes, jitcell_token)
 
-        self.history.inputargs = original_inputargs
-        self.history.operations.pop()     # remove the JUMP
-
-    def compile_bridge(self, live_arg_boxes):
+    def compile_trace(self, live_arg_boxes, start_resumedescr):
         num_green_args = self.jitdriver_sd.num_green_args
         greenkey = live_arg_boxes[:num_green_args]
-        old_loop_tokens = self.get_compiled_merge_points(greenkey)
-        if len(old_loop_tokens) == 0:
+        target_jitcell_token = self.get_procedure_token(greenkey)
+        if not target_jitcell_token:
             return
-        #if self.resumekey.guard_opnum == rop.GUARD_CLASS:
-        #    return # Kepp tracing for another iteration
-        self.history.record(rop.JUMP, live_arg_boxes[num_green_args:], None)
+        if not target_jitcell_token.target_tokens:
+            return
+
+        self.history.record(rop.JUMP, live_arg_boxes[num_green_args:], None,
+                            descr=target_jitcell_token)
         try:
-            target_loop_token = compile.compile_new_bridge(self,
-                                                           old_loop_tokens,
-                                                           self.resumekey)
+            target_token = compile.compile_trace(self, self.resumekey, start_resumedescr)
         finally:
             self.history.operations.pop()     # remove the JUMP
-        if target_loop_token is not None: # raise if it *worked* correctly
-            self.history.inputargs = None
-            self.history.operations = None
-            raise GenerateMergePoint(live_arg_boxes, target_loop_token)
+        if target_token is not None: # raise if it *worked* correctly
+            assert isinstance(target_token, TargetToken)
+            jitcell_token = target_token.targeting_jitcell_token
+            self.raise_continue_running_normally(live_arg_boxes, jitcell_token)
 
     def compile_bridge_and_loop(self, original_boxes, live_arg_boxes, start,
                                 bridge_arg_boxes, start_resumedescr):
@@ -2112,10 +2144,8 @@
         except RetraceLoop:
             assert False
         assert target_loop_token is not None
-
-        self.history.inputargs = None
-        self.history.operations = None
-        raise GenerateMergePoint(live_arg_boxes, old_loop_tokens[0])
+        self.raise_continue_running_normally(live_arg_boxes,
+                                             old_loop_tokens[0])
 
     def compile_done_with_this_frame(self, exitbox):
         self.gen_store_back_in_virtualizable()
@@ -2137,21 +2167,21 @@
             loop_tokens = sd.loop_tokens_done_with_this_frame_float
         else:
             assert False
-        self.history.record(rop.JUMP, exits, None)
-        target_loop_token = compile.compile_new_bridge(self, loop_tokens,
-                                                       self.resumekey)
-        if target_loop_token is not loop_tokens[0]:
+        # FIXME: kill TerminatingLoopToken?
+        # FIXME: can we call compile_trace?
+        token = loop_tokens[0].finishdescr
+        self.history.record(rop.FINISH, exits, None, descr=token)
+        target_token = compile.compile_trace(self, self.resumekey)
+        if target_token is not token:
             compile.giveup()
 
     def compile_exit_frame_with_exception(self, valuebox):
         self.gen_store_back_in_virtualizable()
-        # temporarily put a JUMP to a pseudo-loop
-        self.history.record(rop.JUMP, [valuebox], None)
         sd = self.staticdata
-        loop_tokens = sd.loop_tokens_exit_frame_with_exception_ref
-        target_loop_token = compile.compile_new_bridge(self, loop_tokens,
-                                                       self.resumekey)
-        if target_loop_token is not loop_tokens[0]:
+        token = sd.loop_tokens_exit_frame_with_exception_ref[0].finishdescr
+        self.history.record(rop.FINISH, [valuebox], None, descr=token)
+        target_token = compile.compile_trace(self, self.resumekey)
+        if target_token is not token:
             compile.giveup()
 
     @specialize.arg(1)
@@ -2393,22 +2423,6 @@
                                             abox, ConstInt(j), itembox)
             assert i + 1 == len(self.virtualizable_boxes)
 
-    def gen_load_from_other_virtualizable(self, vinfo, vbox):
-        boxes = []
-        assert vinfo is not None
-        for i in range(vinfo.num_static_extra_boxes):
-            descr = vinfo.static_field_descrs[i]
-            boxes.append(self.execute_and_record(rop.GETFIELD_GC, descr, vbox))
-        virtualizable = vinfo.unwrap_virtualizable_box(vbox)
-        for k in range(vinfo.num_arrays):
-            descr = vinfo.array_field_descrs[k]
-            abox = self.execute_and_record(rop.GETFIELD_GC, descr, vbox)
-            descr = vinfo.array_descrs[k]
-            for j in range(vinfo.get_array_length(virtualizable, k)):
-                boxes.append(self.execute_and_record(rop.GETARRAYITEM_GC, descr,
-                                                     abox, ConstInt(j)))
-        return boxes
-
     def replace_box(self, oldbox, newbox):
         assert isinstance(oldbox, Box)
         for frame in self.framestack:
@@ -2480,25 +2494,13 @@
         greenargs = arglist[1:num_green_args+1]
         args = arglist[num_green_args+1:]
         assert len(args) == targetjitdriver_sd.num_red_args
-        vinfo = targetjitdriver_sd.virtualizable_info
-        if vinfo is not None:
-            index = targetjitdriver_sd.index_of_virtualizable
-            vbox = args[index]
-            args = args + self.gen_load_from_other_virtualizable(vinfo, vbox)
-            # ^^^ and not "+=", which makes 'args' a resizable list
         warmrunnerstate = targetjitdriver_sd.warmstate
-        token = warmrunnerstate.get_assembler_token(greenargs, args)
+        token = warmrunnerstate.get_assembler_token(greenargs)
         op = op.copy_and_change(rop.CALL_ASSEMBLER, args=args, descr=token)
         self.history.operations.append(op)
 
 # ____________________________________________________________
 
-class GenerateMergePoint(JitException):
-    def __init__(self, args, target_loop_token):
-        assert target_loop_token is not None
-        self.argboxes = args
-        self.target_loop_token = target_loop_token
-
 class ChangeFrame(JitException):
     """Raised after we mutated metainterp.framestack, in order to force
     it to reload the current top-of-stack frame that gets interpreted."""
diff --git a/pypy/jit/metainterp/resoperation.py b/pypy/jit/metainterp/resoperation.py
--- a/pypy/jit/metainterp/resoperation.py
+++ b/pypy/jit/metainterp/resoperation.py
@@ -369,6 +369,8 @@
     'FINISH/*d',
     '_FINAL_LAST',
 
+    'LABEL/*d',
+
     '_GUARD_FIRST',
     '_GUARD_FOLDABLE_FIRST',
     'GUARD_TRUE/1d',
@@ -379,11 +381,11 @@
     'GUARD_ISNULL/1d',
     'GUARD_NONNULL_CLASS/2d',
     '_GUARD_FOLDABLE_LAST',
-    'GUARD_NO_EXCEPTION/0d',
-    'GUARD_EXCEPTION/1d',
+    'GUARD_NO_EXCEPTION/0d',    # may be called with an exception currently set
+    'GUARD_EXCEPTION/1d',       # may be called with an exception currently set
     'GUARD_NO_OVERFLOW/0d',
     'GUARD_OVERFLOW/0d',
-    'GUARD_NOT_FORCED/0d',
+    'GUARD_NOT_FORCED/0d',      # may be called with an exception currently set
     'GUARD_NOT_INVALIDATED/0d',
     '_GUARD_LAST', # ----- end of guard operations -----
 
diff --git a/pypy/jit/metainterp/test/support.py b/pypy/jit/metainterp/test/support.py
--- a/pypy/jit/metainterp/test/support.py
+++ b/pypy/jit/metainterp/test/support.py
@@ -4,9 +4,9 @@
 from pypy.rpython.ootypesystem import ootype
 from pypy.jit.backend.llgraph import runner
 from pypy.jit.metainterp.warmspot import ll_meta_interp, get_stats
+from pypy.jit.metainterp.warmstate import unspecialize_value
 from pypy.jit.metainterp.optimizeopt import ALL_OPTS_DICT
 from pypy.jit.metainterp import pyjitpl, history
-from pypy.jit.metainterp.warmstate import set_future_value
 from pypy.jit.codewriter.policy import JitPolicy
 from pypy.jit.codewriter import codewriter, longlong
 from pypy.rlib.rfloat import isnan
@@ -16,15 +16,16 @@
     from pypy.jit.codewriter import support
 
     class FakeJitCell(object):
-        __compiled_merge_points = []
-        def get_compiled_merge_points(self):
-            return self.__compiled_merge_points[:]
-        def set_compiled_merge_points(self, lst):
-            self.__compiled_merge_points = lst
+        __product_token = None
+        def get_procedure_token(self):
+            return self.__product_token
+        def set_procedure_token(self, token):
+            self.__product_token = token
 
     class FakeWarmRunnerState(object):
-        def attach_unoptimized_bridge_from_interp(self, greenkey, newloop):
-            pass
+        def attach_procedure_to_interp(self, greenkey, procedure_token):
+            cell = self.jit_cell_at_key(greenkey)
+            cell.set_procedure_token(procedure_token)
 
         def helper_func(self, FUNCPTR, func):
             from pypy.rpython.annlowlevel import llhelper
@@ -132,16 +133,14 @@
 def _run_with_machine_code(testself, args):
     metainterp = testself.metainterp
     num_green_args = metainterp.jitdriver_sd.num_green_args
-    loop_tokens = metainterp.get_compiled_merge_points(args[:num_green_args])
-    if len(loop_tokens) != 1:
-        return NotImplemented
+    procedure_token = metainterp.get_procedure_token(args[:num_green_args])
     # a loop was successfully created by _run_with_pyjitpl(); call it
     cpu = metainterp.cpu
+    args1 = []
     for i in range(len(args) - num_green_args):
         x = args[num_green_args + i]
-        typecode = history.getkind(lltype.typeOf(x))
-        set_future_value(cpu, i, x, typecode)
-    faildescr = cpu.execute_token(loop_tokens[0])
+        args1.append(unspecialize_value(x))
+    faildescr = cpu.execute_token(procedure_token, *args1)
     assert faildescr.__class__.__name__.startswith('DoneWithThisFrameDescr')
     if metainterp.jitdriver_sd.result_type == history.INT:
         return cpu.get_latest_value_int(0)
@@ -160,23 +159,31 @@
     def check_simple_loop(self, expected=None, **check):
         get_stats().check_simple_loop(expected=expected, **check)
 
-    def check_loop_count(self, count):
-        """NB. This is a hack; use check_tree_loop_count() or
-        check_enter_count() for the real thing.
-        This counts as 1 every bridge in addition to every loop; and it does
-        not count at all the entry bridges from interpreter, although they
-        are TreeLoops as well."""
+    
+
+    def check_trace_count(self, count): # was check_loop_count
+        # The number of traces compiled
         assert get_stats().compiled_count == count
-    def check_tree_loop_count(self, count):
-        assert len(get_stats().loops) == count
-    def check_loop_count_at_most(self, count):
+    def check_trace_count_at_most(self, count):
         assert get_stats().compiled_count <= count
+
+    def check_jitcell_token_count(self, count): # was check_tree_loop_count
+        assert len(get_stats().jitcell_token_wrefs) == count
+
+    def check_target_token_count(self, count):
+        tokens = get_stats().get_all_jitcell_tokens()
+        n = sum ([len(t.target_tokens) for t in tokens])
+        assert n == count
+
     def check_enter_count(self, count):
         assert get_stats().enter_count == count
     def check_enter_count_at_most(self, count):
         assert get_stats().enter_count <= count
+
     def check_jumps(self, maxcount):
+        return # FIXME
         assert get_stats().exec_jumps <= maxcount
+
     def check_aborted_count(self, count):
         assert get_stats().aborted_count == count
     def check_aborted_count_at_least(self, count):
@@ -219,7 +226,7 @@
         # this can be used after interp_operations
         if expected is not None:
             expected = dict(expected)
-            expected['jump'] = 1
+            expected['finish'] = 1
         self.metainterp.staticdata.stats.check_history(expected, **isns)
 
 
diff --git a/pypy/jit/metainterp/test/test_ajit.py b/pypy/jit/metainterp/test/test_ajit.py
--- a/pypy/jit/metainterp/test/test_ajit.py
+++ b/pypy/jit/metainterp/test/test_ajit.py
@@ -9,7 +9,6 @@
 from pypy.jit.metainterp.test.support import LLJitMixin, OOJitMixin, noConst
 from pypy.jit.metainterp.typesystem import LLTypeHelper, OOTypeHelper
 from pypy.jit.metainterp.warmspot import get_stats
-from pypy.jit.metainterp.warmstate import set_future_value
 from pypy.rlib import rerased
 from pypy.rlib.jit import (JitDriver, we_are_jitted, hint, dont_look_inside,
     loop_invariant, elidable, promote, jit_debug, assert_green,
@@ -66,7 +65,7 @@
         res = self.interp_operations(f, [8, 98])
         assert res == 110
 
-    def test_loop(self):
+    def test_loop_1(self):
         myjitdriver = JitDriver(greens = [], reds = ['x', 'y', 'res'])
         def f(x, y):
             res = 0
@@ -78,19 +77,20 @@
             return res
         res = self.meta_interp(f, [6, 7])
         assert res == 42
-        self.check_loop_count(1)
-        self.check_resops({'jump': 2, 'int_gt': 2, 'int_add': 2, 'guard_true': 2, 'int_sub': 2})
+        self.check_trace_count(1)
+        self.check_resops({'jump': 1, 'int_gt': 2, 'int_add': 2,
+                           'guard_true': 2, 'int_sub': 2})
 
         if self.basic:
             found = 0
-            for op in get_stats().loops[0]._all_operations():
+            for op in get_stats().get_all_loops()[0]._all_operations():
                 if op.getopname() == 'guard_true':
                     liveboxes = op.getfailargs()
                     assert len(liveboxes) == 3
                     for box in liveboxes:
                         assert isinstance(box, history.BoxInt)
                     found += 1
-            assert found == 1
+            assert found == 2
 
     def test_loop_variant_mul1(self):
         myjitdriver = JitDriver(greens = [], reds = ['y', 'res', 'x'])
@@ -106,7 +106,7 @@
             return res
         res = self.meta_interp(f, [6, 7])
         assert res == 1323
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         self.check_simple_loop(int_mul=1)
 
     def test_loop_variant_mul_ovf(self):
@@ -123,7 +123,7 @@
             return res
         res = self.meta_interp(f, [6, 7])
         assert res == 1323
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         self.check_simple_loop(int_mul_ovf=1)
 
     def test_loop_invariant_mul1(self):
@@ -138,9 +138,9 @@
             return res
         res = self.meta_interp(f, [6, 7])
         assert res == 252
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         self.check_simple_loop(int_mul=0)
-        self.check_resops({'jump': 2, 'int_gt': 2, 'int_add': 2,
+        self.check_resops({'jump': 1, 'int_gt': 2, 'int_add': 2,
                            'int_mul': 1, 'guard_true': 2, 'int_sub': 2})
 
 
@@ -157,67 +157,63 @@
             return res
         res = self.meta_interp(f, [6, 7])
         assert res == 308
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         self.check_simple_loop(int_mul_ovf=0)
-        self.check_resops({'jump': 2, 'int_lshift': 2, 'int_gt': 2,
+        self.check_resops({'jump': 1, 'int_lshift': 2, 'int_gt': 2,
                            'int_mul_ovf': 1, 'int_add': 4,
                            'guard_true': 2, 'guard_no_overflow': 1,
                            'int_sub': 2})
 
     def test_loop_invariant_mul_bridge1(self):
-        myjitdriver = JitDriver(greens = [], reds = ['y', 'res', 'x'])
-        def f(x, y):
+        myjitdriver = JitDriver(greens = [], reds = ['y', 'res', 'x', 'n'])
+        def f(x, y, n):
             res = 0
             while y > 0:
-                myjitdriver.can_enter_jit(x=x, y=y, res=res)
-                myjitdriver.jit_merge_point(x=x, y=y, res=res)
+                myjitdriver.can_enter_jit(x=x, y=y, n=n, res=res)
+                myjitdriver.jit_merge_point(x=x, y=y, n=n, res=res)
                 res += x * x
-                if y<16:
+                if y<n:
                     x += 1
                 y -= 1
             return res
-        res = self.meta_interp(f, [6, 32])
+        res = self.meta_interp(f, [6, 32, 16])
         assert res == 3427
-        self.check_loop_count(3)
+        self.check_trace_count(3)
 
     def test_loop_invariant_mul_bridge_maintaining1(self):
-        myjitdriver = JitDriver(greens = [], reds = ['y', 'res', 'x'])
-        def f(x, y):
+        myjitdriver = JitDriver(greens = [], reds = ['y', 'res', 'x', 'n'])
+        def f(x, y, n):
             res = 0
             while y > 0:
-                myjitdriver.can_enter_jit(x=x, y=y, res=res)
-                myjitdriver.jit_merge_point(x=x, y=y, res=res)
+                myjitdriver.can_enter_jit(x=x, y=y, res=res, n=n)
+                myjitdriver.jit_merge_point(x=x, y=y, res=res, n=n)
                 res += x * x
-                if y<16:
+                if y<n:
                     res += 1
                 y -= 1
             return res
-        res = self.meta_interp(f, [6, 32])
+        res = self.meta_interp(f, [6, 32, 16])
         assert res == 1167
-        self.check_loop_count(3)
-        self.check_resops({'int_lt': 3, 'int_gt': 2, 'int_add': 5,
-                           'guard_true': 3, 'int_sub': 4, 'jump': 4,
-                           'int_mul': 2, 'guard_false': 2})
+        self.check_trace_count(3)
+        self.check_resops(int_mul=3)
 
     def test_loop_invariant_mul_bridge_maintaining2(self):
-        myjitdriver = JitDriver(greens = [], reds = ['y', 'res', 'x'])
-        def f(x, y):
+        myjitdriver = JitDriver(greens = [], reds = ['y', 'res', 'x', 'n'])
+        def f(x, y, n):
             res = 0
             while y > 0:
-                myjitdriver.can_enter_jit(x=x, y=y, res=res)
-                myjitdriver.jit_merge_point(x=x, y=y, res=res)
+                myjitdriver.can_enter_jit(x=x, y=y, res=res, n=n)
+                myjitdriver.jit_merge_point(x=x, y=y, res=res, n=n)
                 z = x * x
                 res += z
-                if y<16:
+                if y<n:
                     res += z
                 y -= 1
             return res
-        res = self.meta_interp(f, [6, 32])
+        res = self.meta_interp(f, [6, 32, 16])
         assert res == 1692
-        self.check_loop_count(3)
-        self.check_resops({'int_lt': 3, 'int_gt': 2, 'int_add': 5,
-                           'guard_true': 3, 'int_sub': 4, 'jump': 4,
-                           'int_mul': 2, 'guard_false': 2})
+        self.check_trace_count(3)
+        self.check_resops(int_mul=3)
 
     def test_loop_invariant_mul_bridge_maintaining3(self):
         myjitdriver = JitDriver(greens = [], reds = ['y', 'res', 'x', 'm'])
@@ -234,9 +230,9 @@
             return res
         res = self.meta_interp(f, [6, 32, 16])
         assert res == 1692
-        self.check_loop_count(3)
+        self.check_trace_count(3)
         self.check_resops({'int_lt': 2, 'int_gt': 4, 'guard_false': 2,
-                           'guard_true': 4, 'int_sub': 4, 'jump': 4,
+                           'guard_true': 4, 'int_sub': 4, 'jump': 3,
                            'int_mul': 3, 'int_add': 4})
 
     def test_loop_invariant_intbox(self):
@@ -257,8 +253,8 @@
             return res
         res = self.meta_interp(f, [6, 7])
         assert res == 252
-        self.check_loop_count(1)
-        self.check_resops({'jump': 2, 'int_gt': 2, 'int_add': 2,
+        self.check_trace_count(1)
+        self.check_resops({'jump': 1, 'int_gt': 2, 'int_add': 2,
                            'getfield_gc_pure': 1, 'int_mul': 1,
                            'guard_true': 2, 'int_sub': 2})
 
@@ -289,9 +285,7 @@
         assert res == f(6, 15)
         gc.collect()
 
-        #assert not [wr for wr in wr_loops if wr()]
-        for loop in [wr for wr in wr_loops if wr()]:
-            assert loop().name == 'short preamble'
+        assert not [wr for wr in wr_loops if wr()]
 
     def test_string(self):
         def f(n):
@@ -562,11 +556,11 @@
         #
         res = self.meta_interp(f, [10, 84])
         assert res == -6
-        self.check_loop_count(0)
+        self.check_trace_count(0)
         #
         res = self.meta_interp(f, [3, 19])
         assert res == -2
-        self.check_loop_count(1)
+        self.check_trace_count(1)
 
     def test_can_never_inline(self):
         def can_never_inline(x):
@@ -861,8 +855,8 @@
             return res
         res = self.meta_interp(f, [6, 7])
         assert res == 42.0
-        self.check_loop_count(1)
-        self.check_resops({'jump': 2, 'float_gt': 2, 'float_add': 2,
+        self.check_trace_count(1)
+        self.check_resops({'jump': 1, 'float_gt': 2, 'float_add': 2,
                            'float_sub': 2, 'guard_true': 2})
 
     def test_print(self):
@@ -877,7 +871,7 @@
         res = self.meta_interp(f, [7])
         assert res == 0
 
-    def test_bridge_from_interpreter(self):
+    def test_bridge_from_interpreter_1(self):
         mydriver = JitDriver(reds = ['n'], greens = [])
 
         def f(n):
@@ -887,7 +881,9 @@
                 n -= 1
 
         self.meta_interp(f, [20], repeat=7)
-        self.check_tree_loop_count(2)      # the loop and the entry path
+        # the loop and the entry path as a single trace
+        self.check_jitcell_token_count(1)
+        
         # we get:
         #    ENTER             - compile the new loop and the entry bridge
         #    ENTER             - compile the leaving path
@@ -1261,11 +1257,11 @@
 
         res = self.meta_interp(f, [10, 3, 1])
         assert res == 9 + 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1 + 0
-        self.check_tree_loop_count(2)
+        self.check_jitcell_token_count(1)
 
         res = self.meta_interp(f, [10, 13, 0])
         assert res == 9 + 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1 + 0
-        self.check_tree_loop_count(0)
+        self.check_jitcell_token_count(0)
 
     def test_dont_look_inside(self):
         @dont_look_inside
@@ -1346,7 +1342,7 @@
             return res
         res = self.meta_interp(f, [6, 7])
         assert res == 42
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         self.check_resops(call=2)
 
     def test_merge_guardclass_guardvalue(self):
@@ -1641,7 +1637,7 @@
                 promote(a)
                 x -= 1
         self.meta_interp(f, [50])
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         # this checks that the logic triggered by make_a_counter_per_value()
         # works and prevents generating tons of bridges
 
@@ -1736,10 +1732,10 @@
             return a1.val + b1.val
         res = self.meta_interp(g, [6, 7])
         assert res == 6*8 + 6**8
-        self.check_loop_count(5)
+        self.check_trace_count(4)
         self.check_resops({'guard_class': 2, 'int_gt': 4,
                            'getfield_gc': 4, 'guard_true': 4,
-                           'int_sub': 4, 'jump': 4, 'int_mul': 2,
+                           'int_sub': 4, 'jump': 2, 'int_mul': 2,
                            'int_add': 2})
 
     def test_multiple_specialied_versions_array(self):
@@ -1766,7 +1762,7 @@
                                             array=array)
                 res = res.binop(x)
                 res.val += array[idx] + array[1]
-                if y < 7:
+                if y < 10:
                     idx = 2
                 y -= 1
             return res
@@ -1778,10 +1774,10 @@
             assert a1.val == a2.val
             assert b1.val == b2.val
             return a1.val + b1.val
-        res = self.meta_interp(g, [6, 14])
-        assert res == g(6, 14)
-        self.check_loop_count(9)
-        self.check_resops(getarrayitem_gc=8)
+        res = self.meta_interp(g, [6, 20])
+        assert res == g(6, 20)
+        self.check_trace_count(8)
+        self.check_resops(getarrayitem_gc=10)
 
     def test_multiple_specialied_versions_bridge(self):
         myjitdriver = JitDriver(greens = [], reds = ['y', 'x', 'z', 'res'])
@@ -1968,7 +1964,7 @@
             return a1.val + b1.val
         res = self.meta_interp(g, [3, 23])
         assert res == 7068153
-        self.check_loop_count(7)
+        self.check_trace_count(6)
         self.check_resops(guard_true=6, guard_class=2, int_mul=3,
                           int_add=3, guard_false=3)
 
@@ -2054,7 +2050,7 @@
             return n
         res = self.meta_interp(f, [sys.maxint-10])
         assert res == 11
-        self.check_tree_loop_count(2)
+        self.check_jitcell_token_count(1)
 
     def test_wrap_around_mul(self):
         myjitdriver = JitDriver(greens = [], reds = ['x', 'n'])
@@ -2070,7 +2066,7 @@
             return n
         res = self.meta_interp(f, [sys.maxint>>10])
         assert res == 11
-        self.check_tree_loop_count(2)
+        self.check_jitcell_token_count(1)
 
     def test_wrap_around_sub(self):
         myjitdriver = JitDriver(greens = [], reds = ['x', 'n'])
@@ -2086,7 +2082,7 @@
             return n
         res = self.meta_interp(f, [10-sys.maxint])
         assert res == 12
-        self.check_tree_loop_count(2)
+        self.check_jitcell_token_count(1)
 
     def test_caching_setfield(self):
         myjitdriver = JitDriver(greens = [], reds = ['sa', 'i', 'n', 'a', 'node'])
@@ -2606,10 +2602,12 @@
                 i += 1
             return sa
         assert self.meta_interp(f, [20, 2]) == f(20, 2)
-        self.check_tree_loop_count(4)
+        self.check_jitcell_token_count(1)
+        self.check_target_token_count(4)
         assert self.meta_interp(f, [20, 3]) == f(20, 3)
-        self.check_tree_loop_count(5)
-
+        self.check_jitcell_token_count(1)
+        self.check_target_token_count(5)
+        
     def test_max_retrace_guards(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'i', 'sa', 'a'])
 
@@ -2625,10 +2623,11 @@
                 i += 1
             return sa
         assert self.meta_interp(f, [20, 1]) == f(20, 1)
-        self.check_tree_loop_count(2)
+        self.check_jitcell_token_count(1)
+        self.check_target_token_count(2)
         assert self.meta_interp(f, [20, 10]) == f(20, 10)
-        self.check_tree_loop_count(5)
-
+        self.check_jitcell_token_count(1)
+        self.check_target_token_count(5)
 
     def test_retrace_limit_with_extra_guards(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'i', 'sa', 'a',
@@ -2648,11 +2647,13 @@
                 i += 1
             return sa
         assert self.meta_interp(f, [20, 2]) == f(20, 2)
-        self.check_tree_loop_count(4)
+        self.check_jitcell_token_count(1)
+        self.check_target_token_count(4)
         assert self.meta_interp(f, [20, 3]) == f(20, 3)
-        self.check_tree_loop_count(5)
-
-    def test_retrace_ending_up_retrazing_another_loop(self):
+        self.check_jitcell_token_count(1)
+        self.check_target_token_count(5)
+
+    def test_retrace_ending_up_retracing_another_loop(self):
 
         myjitdriver = JitDriver(greens = ['pc'], reds = ['n', 'i', 'sa'])
         bytecode = "0+sI0+SI"
@@ -2694,11 +2695,9 @@
         # The attempts of retracing first loop will end up retracing the
         # second and thus fail 5 times, saturating the retrace_count. Instead a
         # bridge back to the preamble of the first loop is produced. A guard in
-        # this bridge is later traced resulting in a retrace of the second loop.
-        # Thus we end up with:
-        #   1 preamble and 1 specialized version of first loop
-        #   1 preamble and 2 specialized version of second loop
-        self.check_tree_loop_count(2 + 3)
+        # this bridge is later traced resulting in a failed attempt of retracing
+        # the second loop.
+        self.check_trace_count(8)
 
         # FIXME: Add a gloabl retrace counter and test that we are not trying more than 5 times.
 
@@ -2709,9 +2708,12 @@
 
         res = self.meta_interp(g, [10])
         assert res == g(10)
-        # 1 preamble and 6 speciealized versions of each loop
-        self.check_tree_loop_count(2*(1 + 6))
-
+
+        self.check_jitcell_token_count(2)
+        for cell in get_stats().get_all_jitcell_tokens():
+            # Initial trace with two labels and 5 retraces
+            assert len(cell.target_tokens) <= 7
+            
     def test_nested_retrace(self):
 
         myjitdriver = JitDriver(greens = ['pc'], reds = ['n', 'a', 'i', 'j', 'sa'])
@@ -2748,22 +2750,33 @@
 
         res = self.meta_interp(f, [10, 7])
         assert res == f(10, 7)
-        self.check_tree_loop_count(4)
+        self.check_jitcell_token_count(2)
+        for cell in get_stats().get_all_jitcell_tokens():
+            assert len(cell.target_tokens) == 2
 
         def g(n):
             return f(n, 2) + f(n, 3)
 
         res = self.meta_interp(g, [10])
         assert res == g(10)
-        self.check_tree_loop_count(6)
-
+        self.check_jitcell_token_count(2)
+        for cell in get_stats().get_all_jitcell_tokens():
+            assert len(cell.target_tokens) <= 3
 
         def g(n):
             return f(n, 2) + f(n, 3) + f(n, 4) + f(n, 5) + f(n, 6) + f(n, 7)
 
         res = self.meta_interp(g, [10])
         assert res == g(10)
-        self.check_tree_loop_count(8)
+        # 2 loops and one function
+        self.check_jitcell_token_count(3)
+        cnt = 0
+        for cell in get_stats().get_all_jitcell_tokens():
+            if cell.target_tokens is None:
+                cnt += 1
+            else:
+                assert len(cell.target_tokens) <= 4
+        assert cnt == 1
 
     def test_frame_finished_during_retrace(self):
         class Base(object):
@@ -2846,66 +2859,6 @@
         assert res == -2
         self.check_resops(setarrayitem_gc=2, getarrayitem_gc=1)
 
-    def test_retrace_ending_up_retracing_another_loop(self):
-
-        myjitdriver = JitDriver(greens = ['pc'], reds = ['n', 'i', 'sa'])
-        bytecode = "0+sI0+SI"
-        def f(n):
-            set_param(None, 'threshold', 3)
-            set_param(None, 'trace_eagerness', 1)
-            set_param(None, 'retrace_limit', 5)
-            set_param(None, 'function_threshold', -1)
-            pc = sa = i = 0
-            while pc < len(bytecode):
-                myjitdriver.jit_merge_point(pc=pc, n=n, sa=sa, i=i)
-                n = hint(n, promote=True)
-                op = bytecode[pc]
-                if op == '0':
-                    i = 0
-                elif op == '+':
-                    i += 1
-                elif op == 's':
-                    sa += i
-                elif op == 'S':
-                    sa += 2
-                elif op == 'I':
-                    if i < n:
-                        pc -= 2
-                        myjitdriver.can_enter_jit(pc=pc, n=n, sa=sa, i=i)
-                        continue
-                pc += 1
-            return sa
-
-        def g(n1, n2):
-            for i in range(10):
-                f(n1)
-            for i in range(10):
-                f(n2)
-
-        nn = [10, 3]
-        assert self.meta_interp(g, nn) == g(*nn)
-
-        # The attempts of retracing first loop will end up retracing the
-        # second and thus fail 5 times, saturating the retrace_count. Instead a
-        # bridge back to the preamble of the first loop is produced. A guard in
-        # this bridge is later traced resulting in a retrace of the second loop.
-        # Thus we end up with:
-        #   1 preamble and 1 specialized version of first loop
-        #   1 preamble and 2 specialized version of second loop
-        self.check_tree_loop_count(2 + 3)
-
-        # FIXME: Add a gloabl retrace counter and test that we are not trying more than 5 times.
-
-        def g(n):
-            for i in range(n):
-                for j in range(10):
-                    f(n-i)
-
-        res = self.meta_interp(g, [10])
-        assert res == g(10)
-        # 1 preamble and 6 speciealized versions of each loop
-        self.check_tree_loop_count(2*(1 + 6))
-
     def test_continue_tracing_with_boxes_in_start_snapshot_replaced_by_optimizer(self):
         myjitdriver = JitDriver(greens = [], reds = ['sa', 'n', 'a', 'b'])
         def f(n):
@@ -3153,7 +3106,7 @@
             return sa
         res = self.meta_interp(f, [32])
         assert res == f(32)
-        self.check_tree_loop_count(3)
+        self.check_trace_count(2)
 
     def test_two_loopinvariant_arrays2(self):
         from pypy.rpython.lltypesystem import lltype, llmemory, rffi
@@ -3176,7 +3129,7 @@
             return sa
         res = self.meta_interp(f, [32])
         assert res == f(32)
-        self.check_tree_loop_count(3)
+        self.check_trace_count(2)
 
     def test_two_loopinvariant_arrays3(self):
         from pypy.rpython.lltypesystem import lltype, llmemory, rffi
@@ -3200,7 +3153,7 @@
             return sa
         res = self.meta_interp(f, [32])
         assert res == f(32)
-        self.check_tree_loop_count(2)
+        self.check_trace_count(3)
 
     def test_two_loopinvariant_arrays_boxed(self):
         class A(object):
@@ -3371,7 +3324,7 @@
         res = self.meta_interp(main, [10])
         assert res == main(10)
         self.check_resops({'int_gt': 2, 'strlen': 2, 'guard_true': 2,
-                           'int_sub': 2, 'jump': 2, 'call': 2,
+                           'int_sub': 2, 'jump': 1, 'call': 2,
                            'guard_no_exception': 2, 'int_add': 4})
 
     def test_look_inside_iff_const_getarrayitem_gc_pure(self):
@@ -3508,7 +3461,7 @@
 
         res = self.meta_interp(f, [10])
         assert res == 0
-        self.check_resops({'jump': 2, 'guard_true': 2, 'int_gt': 2,
+        self.check_resops({'jump': 1, 'guard_true': 2, 'int_gt': 2,
                            'int_sub': 2})
 
     def test_virtual_opaque_ptr(self):
@@ -3528,7 +3481,7 @@
             return n
         res = self.meta_interp(f, [10])
         assert res == 0
-        self.check_resops({'jump': 2, 'guard_true': 2, 'int_gt': 2,
+        self.check_resops({'jump': 1, 'guard_true': 2, 'int_gt': 2,
                            'int_sub': 2})
 
 
@@ -3551,7 +3504,7 @@
         res = self.meta_interp(f, [10])
         assert res == 0
         self.check_resops({'int_gt': 2, 'getfield_gc': 1, 'int_eq': 1,
-                           'guard_true': 2, 'int_sub': 2, 'jump': 2,
+                           'guard_true': 2, 'int_sub': 2, 'jump': 1,
                            'guard_false': 1})
 
 
@@ -3799,6 +3752,31 @@
         x = self.interp_operations(f, [1000, 1], translationoptions=topt)
         assert x == 999
 
+    def test_retracing_bridge_from_interpreter_to_finnish(self):
+        myjitdriver = JitDriver(greens = [], reds = ['n', 'i', 'sa'])
+        def f(n):
+            sa = i = 0
+            while i < n:
+                myjitdriver.jit_merge_point(n=n, i=i, sa=sa)
+                n = hint(n, promote=True)
+                sa += 2*n
+                i += 1
+            return sa
+        def g(n):
+            return f(n) + f(n) + f(n) + f(n) + f(10*n) + f(11*n)
+        res = self.meta_interp(g, [1], repeat=3)
+        assert res == g(1)
+        #self.check_jitcell_token_count(1)
+        self.check_jitcell_token_count(2)
+        # XXX A bridge from the interpreter to a finish is first
+        # constructed for n=1. It is later replaced with a trace for
+        # the case n=10 which is extended with a retrace for n=11 and
+        # finally a new bridge to finish is again traced and created
+        # for the case n=1. We were not able to reuse the original n=1
+        # bridge as a preamble since it does not start with a
+        # label. The alternative would be to have all such bridges
+        # start with labels. I don't know which is better...
+
     def test_ll_arraycopy(self):
         from pypy.rlib import rgc
         A = lltype.GcArray(lltype.Char)
diff --git a/pypy/jit/metainterp/test/test_compile.py b/pypy/jit/metainterp/test/test_compile.py
--- a/pypy/jit/metainterp/test/test_compile.py
+++ b/pypy/jit/metainterp/test/test_compile.py
@@ -1,7 +1,7 @@
 from pypy.config.pypyoption import get_pypy_config
-from pypy.jit.metainterp.history import LoopToken, ConstInt, History, Stats
+from pypy.jit.metainterp.history import TargetToken, ConstInt, History, Stats
 from pypy.jit.metainterp.history import BoxInt, INT
-from pypy.jit.metainterp.compile import insert_loop_token, compile_new_loop
+from pypy.jit.metainterp.compile import compile_loop
 from pypy.jit.metainterp.compile import ResumeGuardDescr
 from pypy.jit.metainterp.compile import ResumeGuardCountersInt
 from pypy.jit.metainterp.compile import compile_tmp_callback
@@ -10,23 +10,6 @@
 from pypy.jit.tool.oparser import parse
 from pypy.jit.metainterp.optimizeopt import ALL_OPTS_DICT
 
-def test_insert_loop_token():
-    # XXX this test is a bit useless now that there are no specnodes
-    lst = []
-    #
-    tok1 = LoopToken()
-    insert_loop_token(lst, tok1)
-    assert lst == [tok1]
-    #
-    tok2 = LoopToken()
-    insert_loop_token(lst, tok2)
-    assert lst == [tok1, tok2]
-    #
-    tok3 = LoopToken()
-    insert_loop_token(lst, tok3)
-    assert lst == [tok1, tok2, tok3]
-
-
 class FakeCPU(object):
     ts = typesystem.llhelper
     def __init__(self):
@@ -72,8 +55,9 @@
         warmstate = FakeState()
         on_compile = staticmethod(lambda *args: None)
         on_compile_bridge = staticmethod(lambda *args: None)
+        virtualizable_info = None
 
-def test_compile_new_loop():
+def test_compile_loop():
     cpu = FakeCPU()
     staticdata = FakeMetaInterpStaticData()
     staticdata.cpu = cpu
@@ -93,34 +77,26 @@
     metainterp.staticdata = staticdata
     metainterp.cpu = cpu
     metainterp.history = History()
-    metainterp.history.operations = loop.operations[:]
+    metainterp.history.operations = loop.operations[:-1]
     metainterp.history.inputargs = loop.inputargs[:]
     cpu._all_size_descrs_with_vtable = (
         LLtypeMixin.cpu._all_size_descrs_with_vtable)
     #
-    loop_tokens = []
-    loop_token = compile_new_loop(metainterp, loop_tokens, [], 0, None)
-    assert loop_tokens == [loop_token]
-    assert loop_token.number == 1
+    greenkey = 'faked'
+    target_token = compile_loop(metainterp, greenkey, 0,
+                                loop.inputargs,
+                                loop.operations[-1].getarglist(),
+                                None)
+    jitcell_token = target_token.targeting_jitcell_token
+    assert jitcell_token == target_token.original_jitcell_token
+    assert jitcell_token.target_tokens == [target_token]
+    assert jitcell_token.number == 1
     assert staticdata.globaldata.loopnumbering == 2
     #
     assert len(cpu.seen) == 1
-    assert cpu.seen[0][2] == loop_token
+    assert cpu.seen[0][2] == jitcell_token
     #
     del cpu.seen[:]
-    metainterp = FakeMetaInterp()
-    metainterp.staticdata = staticdata
-    metainterp.cpu = cpu
-    metainterp.history = History()
-    metainterp.history.operations = loop.operations[:]
-    metainterp.history.inputargs = loop.inputargs[:]
-    #
-    loop_token_2 = compile_new_loop(metainterp, loop_tokens, [], 0, None)
-    assert loop_token_2 is loop_token
-    assert loop_tokens == [loop_token]
-    assert len(cpu.seen) == 0
-    assert staticdata.globaldata.loopnumbering == 2
-
 
 def test_resume_guard_counters():
     rgc = ResumeGuardCountersInt()
@@ -196,23 +172,17 @@
         result_type = INT
     #
     loop_token = compile_tmp_callback(cpu, FakeJitDriverSD(),
-                                      [ConstInt(12), ConstInt(34)],
-                                      [BoxInt(56), ConstInt(78), BoxInt(90)])
+                                      [ConstInt(12), ConstInt(34)], "ii")
     #
     raiseme = None
-    cpu.set_future_value_int(0, -156)
-    cpu.set_future_value_int(1, -178)
-    cpu.set_future_value_int(2, -190)     # passed in, but dropped
-    fail_descr = cpu.execute_token(loop_token)
+    # only two arguments must be passed in
+    fail_descr = cpu.execute_token(loop_token, -156, -178)
     assert fail_descr is FakeJitDriverSD().portal_finishtoken
     #
     EXC = lltype.GcStruct('EXC')
     llexc = lltype.malloc(EXC)
     raiseme = LLException("exception class", llexc)
-    cpu.set_future_value_int(0, -156)
-    cpu.set_future_value_int(1, -178)
-    cpu.set_future_value_int(2, -190)
-    fail_descr = cpu.execute_token(loop_token)
+    fail_descr = cpu.execute_token(loop_token, -156, -178)
     assert isinstance(fail_descr, compile.PropagateExceptionDescr)
     got = cpu.grab_exc_value()
     assert lltype.cast_opaque_ptr(lltype.Ptr(EXC), got) == llexc
@@ -221,10 +191,7 @@
         class ExitFrameWithExceptionRef(Exception):
             pass
     FakeMetaInterpSD.cpu = cpu
-    cpu.set_future_value_int(0, -156)
-    cpu.set_future_value_int(1, -178)
-    cpu.set_future_value_int(2, -190)
-    fail_descr = cpu.execute_token(loop_token)
+    fail_descr = cpu.execute_token(loop_token, -156, -178)
     try:
         fail_descr.handle_fail(FakeMetaInterpSD(), None)
     except FakeMetaInterpSD.ExitFrameWithExceptionRef, e:
diff --git a/pypy/jit/metainterp/test/test_del.py b/pypy/jit/metainterp/test/test_del.py
--- a/pypy/jit/metainterp/test/test_del.py
+++ b/pypy/jit/metainterp/test/test_del.py
@@ -1,5 +1,7 @@
 import py
-from pypy.rlib.jit import JitDriver
+from pypy.rlib.jit import JitDriver, dont_look_inside
+from pypy.rlib.objectmodel import keepalive_until_here
+from pypy.rlib import rgc
 from pypy.jit.metainterp.test.support import LLJitMixin, OOJitMixin
 
 
@@ -25,7 +27,7 @@
                            'int_sub': 2,
                            'int_gt': 2,
                            'guard_true': 2,
-                           'jump': 2})
+                           'jump': 1})
 
     def test_class_of_allocated(self):
         myjitdriver = JitDriver(greens = [], reds = ['n', 'x'])
@@ -80,6 +82,47 @@
         assert res == 1
         self.check_resops(call=1)   # for the case B(), but not for the case A()
 
+    def test_keepalive(self):
+        py.test.skip("XXX fails")   # hum, I think the test itself is broken
+        #
+        mydriver = JitDriver(reds = ['n', 'states'], greens = [])
+        class State:
+            num = 1
+        class X:
+            def __init__(self, state):
+                self.state = state
+            def __del__(self):
+                self.state.num += 1
+        @dont_look_inside
+        def do_stuff():
+            pass
+        def f(n):
+            states = []
+            while n > 0:
+                mydriver.jit_merge_point(n=n, states=states)
+                state = State()
+                states.append(state)
+                x = X(state)
+                do_stuff()
+                state.num *= 1000
+                do_stuff()
+                keepalive_until_here(x)
+                n -= 1
+            return states
+        def main(n):
+            states = f(n)
+            rgc.collect()
+            rgc.collect()
+            err = 1001
+            for state in states:
+                if state.num != 1001:
+                    err = state.num
+                    print 'ERROR:', err
+            return err
+        assert main(20) == 1001
+        res = self.meta_interp(main, [20])
+        assert res == 1001
+
 
 class TestLLtype(DelTests, LLJitMixin):
     def test_signal_action(self):
diff --git a/pypy/jit/metainterp/test/test_dict.py b/pypy/jit/metainterp/test/test_dict.py
--- a/pypy/jit/metainterp/test/test_dict.py
+++ b/pypy/jit/metainterp/test/test_dict.py
@@ -154,7 +154,7 @@
         res = self.meta_interp(f, [100], listops=True)
         assert res == f(50)
         self.check_resops({'new_array': 2, 'getfield_gc': 2,
-                           'guard_true': 2, 'jump': 2,
+                           'guard_true': 2, 'jump': 1,
                            'new_with_vtable': 2, 'getinteriorfield_gc': 2,
                            'setfield_gc': 6, 'int_gt': 2, 'int_sub': 2,
                            'call': 10, 'int_and': 2,
diff --git a/pypy/jit/metainterp/test/test_exception.py b/pypy/jit/metainterp/test/test_exception.py
--- a/pypy/jit/metainterp/test/test_exception.py
+++ b/pypy/jit/metainterp/test/test_exception.py
@@ -35,7 +35,7 @@
             return n
         res = self.meta_interp(f, [10])
         assert res == 0
-        self.check_resops({'jump': 2, 'guard_true': 2,
+        self.check_resops({'jump': 1, 'guard_true': 2,
                            'int_gt': 2, 'int_sub': 2})
 
     def test_bridge_from_guard_exception(self):
@@ -512,7 +512,7 @@
 
         res = self.meta_interp(main, [41], repeat=7)
         assert res == -1
-        self.check_tree_loop_count(2)      # the loop and the entry path
+        self.check_target_token_count(2)      # the loop and the entry path
         # we get:
         #    ENTER    - compile the new loop and the entry bridge
         #    ENTER    - compile the leaving path (raising MyError)
diff --git a/pypy/jit/metainterp/test/test_fficall.py b/pypy/jit/metainterp/test/test_fficall.py
--- a/pypy/jit/metainterp/test/test_fficall.py
+++ b/pypy/jit/metainterp/test/test_fficall.py
@@ -1,3 +1,4 @@
+from __future__ import with_statement
 import py
 
 from pypy.jit.metainterp.test.support import LLJitMixin
@@ -76,14 +77,14 @@
                 int_add=2,
                 int_lt=2,
                 guard_true=2,
-                jump=2)
+                jump=1)
         else:
             self.check_resops(
                 call_release_gil=0,   # no CALL_RELEASE_GIL
                 int_add=2,
                 int_lt=2,
                 guard_true=2,
-                jump=2)
+                jump=1)
         return res
 
     def test_byval_result(self):
@@ -144,9 +145,32 @@
                     return result_point[0].x * result_point[0].y
 
         assert self.meta_interp(main, [10]) == main(10) == 9000
-        self.check_resops({'jump': 2, 'int_lt': 2, 'setinteriorfield_raw': 4,
+        self.check_resops({'jump': 1, 'int_lt': 2, 'setinteriorfield_raw': 4,
                            'getinteriorfield_raw': 8, 'int_add': 6, 'guard_true': 2})
 
+    def test_array_getitem_uint8(self):
+        myjitdriver = JitDriver(
+            greens = [],
+            reds = ["n", "i", "s", "data"],
+        )
+        def f(data, n):
+            i = s = 0
+            while i < n:
+                myjitdriver.jit_merge_point(n=n, i=i, s=s, data=data)
+                s += rffi.cast(lltype.Signed, array_getitem(types.uchar, 1, data, 0, 0))
+                i += 1
+            return s
+
+        def main(n):
+            with lltype.scoped_alloc(rffi.CArray(rffi.UCHAR), 1) as data:
+                data[0] = rffi.cast(rffi.UCHAR, 200)
+                return f(data, n)
+
+        assert self.meta_interp(main, [10]) == 2000
+        self.check_resops({'jump': 1, 'int_lt': 2, 'getinteriorfield_raw': 2,
+                           'guard_true': 2, 'int_add': 4})
+
+
 class TestFfiCall(FfiCallTests, LLJitMixin):
     supports_all = False
 
diff --git a/pypy/jit/metainterp/test/test_greenfield.py b/pypy/jit/metainterp/test/test_greenfield.py
--- a/pypy/jit/metainterp/test/test_greenfield.py
+++ b/pypy/jit/metainterp/test/test_greenfield.py
@@ -24,7 +24,7 @@
         #
         res = self.meta_interp(g, [7])
         assert res == -2
-        self.check_loop_count(2)
+        self.check_trace_count(2)
         self.check_resops(guard_value=0)
 
     def test_green_field_2(self):
@@ -49,7 +49,7 @@
         #
         res = self.meta_interp(g, [7])
         assert res == -22
-        self.check_loop_count(6)
+        self.check_trace_count(6)
         self.check_resops(guard_value=0)
 
 
diff --git a/pypy/jit/metainterp/test/test_jitdriver.py b/pypy/jit/metainterp/test/test_jitdriver.py
--- a/pypy/jit/metainterp/test/test_jitdriver.py
+++ b/pypy/jit/metainterp/test/test_jitdriver.py
@@ -28,10 +28,10 @@
                 i += 1
 
         self.meta_interp(loop, [1, 4])
-        assert sorted(called.keys()) == [(4, 1, "entry bridge"), (4, 1, "loop")]
+        assert sorted(called.keys()) == [(4, 1, "loop")]
         self.meta_interp(loop, [2, 4])
-        assert sorted(called.keys()) == [(4, 1, "entry bridge"), (4, 1, "loop"),
-                                         (4, 2, "entry bridge"), (4, 2, "loop")]
+        assert sorted(called.keys()) == [(4, 1, "loop"),
+                                         (4, 2, "loop")]
 
     def test_on_compile_bridge(self):
         called = {}
@@ -55,8 +55,7 @@
                 i += 1
 
         self.meta_interp(loop, [1, 10])
-        assert sorted(called.keys()) == ['bridge', (10, 1, "entry bridge"),
-                                         (10, 1, "loop")]
+        assert sorted(called.keys()) == ['bridge', (10, 1, "loop")]
 
 
 class TestLLtypeSingle(JitDriverTests, LLJitMixin):
@@ -92,8 +91,9 @@
         # the following numbers are not really expectations of the test
         # itself, but just the numbers that we got after looking carefully
         # at the generated machine code
-        self.check_loop_count(5)
-        self.check_tree_loop_count(4)    # 2 x loop, 2 x enter bridge
+        self.check_trace_count(5)
+        self.check_jitcell_token_count(2)    # 2 x loop including enter bridge
+        self.check_target_token_count(4)    # 2 x loop, 2 x enter bridge
         self.check_enter_count(5)
 
     def test_inline(self):
@@ -125,7 +125,7 @@
         # we expect no loop at all for 'loop1': it should always be inlined
         # we do however get several version of 'loop2', all of which contains
         # at least one int_add, while there are no int_add's in 'loop1'
-        self.check_tree_loop_count(5)
+        self.check_jitcell_token_count(1)
         for loop in get_stats().loops:
             assert loop.summary()['int_add'] >= 1
 
diff --git a/pypy/jit/metainterp/test/test_jitprof.py b/pypy/jit/metainterp/test/test_jitprof.py
--- a/pypy/jit/metainterp/test/test_jitprof.py
+++ b/pypy/jit/metainterp/test/test_jitprof.py
@@ -10,7 +10,7 @@
         self.counter = 123456
         Profiler.start(self)
         self.events = []
-        self.times = [0, 0, 0, 0]
+        self.times = [0, 0]
     
     def timer(self):
         self.counter += 1
@@ -24,12 +24,6 @@
         Profiler._end(self, event)
         self.events.append(~event)
 
-    def start_running(self):   self._start(RUNNING)
-    def end_running(self):     self._end(RUNNING)
-
-    def start_blackhole(self): self._start(BLACKHOLE)
-    def end_blackhole(self):   self._end(BLACKHOLE)
-
 class ProfilerMixin(LLJitMixin):
     def meta_interp(self, *args, **kwds):
         kwds = kwds.copy()
@@ -55,17 +49,11 @@
             TRACING,
             BACKEND,
             ~ BACKEND,
-            BACKEND,
-            ~ BACKEND,
             ~ TRACING,
-            RUNNING,
-            ~ RUNNING,
-            BLACKHOLE,
-            ~ BLACKHOLE
             ]
         assert profiler.events == expected
-        assert profiler.times == [3, 2, 1, 1]
-        assert profiler.counters == [1, 2, 1, 1, 3, 3, 1, 13, 2, 0, 0, 0, 0,
+        assert profiler.times == [2, 1]
+        assert profiler.counters == [1, 1, 3, 3, 1, 15, 2, 0, 0, 0, 0,
                                      0, 0, 0, 0, 0]
 
     def test_simple_loop_with_call(self):
diff --git a/pypy/jit/metainterp/test/test_list.py b/pypy/jit/metainterp/test/test_list.py
--- a/pypy/jit/metainterp/test/test_list.py
+++ b/pypy/jit/metainterp/test/test_list.py
@@ -225,7 +225,7 @@
             return s
         res = self.meta_interp(f, [15], listops=True)
         assert res == f(15)
-        self.check_resops({'jump': 2, 'int_gt': 2, 'int_add': 2,
+        self.check_resops({'jump': 1, 'int_gt': 2, 'int_add': 2,
                            'guard_true': 2, 'int_sub': 2})
 
 class TestOOtype(ListTests, OOJitMixin):
diff --git a/pypy/jit/metainterp/test/test_logger.py b/pypy/jit/metainterp/test/test_logger.py
--- a/pypy/jit/metainterp/test/test_logger.py
+++ b/pypy/jit/metainterp/test/test_logger.py
@@ -5,7 +5,7 @@
 from pypy.jit.metainterp.typesystem import llhelper
 from StringIO import StringIO
 from pypy.jit.metainterp.optimizeopt.util import equaloplists
-from pypy.jit.metainterp.history import AbstractDescr, LoopToken, BasicFailDescr
+from pypy.jit.metainterp.history import AbstractDescr, JitCellToken, BasicFailDescr
 from pypy.jit.backend.model import AbstractCPU
 
 
@@ -131,7 +131,7 @@
         equaloplists(loop.operations, oloop.operations)
 
     def test_jump(self):
-        namespace = {'target': LoopToken()}
+        namespace = {'target': JitCellToken()}
         namespace['target'].number = 3
         inp = '''
         [i0]
diff --git a/pypy/jit/metainterp/test/test_loop.py b/pypy/jit/metainterp/test/test_loop.py
--- a/pypy/jit/metainterp/test/test_loop.py
+++ b/pypy/jit/metainterp/test/test_loop.py
@@ -36,7 +36,7 @@
             return res * 2
         res = self.meta_interp(f, [6, 7])
         assert res == 84
-        self.check_loop_count(1)
+        self.check_trace_count(1)
 
     def test_loop_with_delayed_setfield(self):
         myjitdriver = JitDriver(greens = [], reds = ['x', 'y', 'res', 'a'])
@@ -58,7 +58,7 @@
             return res * 2
         res = self.meta_interp(f, [6, 13])
         assert res == f(6, 13)
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         if self.enable_opts:
             self.check_resops(setfield_gc=2, getfield_gc=0)
 
@@ -90,9 +90,9 @@
         res = self.meta_interp(f, [6, 33], policy=StopAtXPolicy(l))
         assert res == f(6, 33)
         if self.enable_opts:
-            self.check_loop_count(3)
+            self.check_trace_count(2)
         else:
-            self.check_loop_count(2)
+            self.check_trace_count(2)
 
     def test_alternating_loops(self):
         myjitdriver = JitDriver(greens = [], reds = ['pattern'])
@@ -108,9 +108,9 @@
             return 42
         self.meta_interp(f, [0xF0F0F0])
         if self.enable_opts:
-            self.check_loop_count(3)
+            self.check_trace_count(3)
         else:
-            self.check_loop_count(2)
+            self.check_trace_count(2)
 
     def test_interp_simple(self):
         myjitdriver = JitDriver(greens = ['i'], reds = ['x', 'y'])
@@ -135,7 +135,7 @@
             return x
         res = self.meta_interp(f, [100, 30])
         assert res == 42
-        self.check_loop_count(0)
+        self.check_trace_count(0)
 
     def test_green_prevents_loop(self):
         myjitdriver = JitDriver(greens = ['i'], reds = ['x', 'y'])
@@ -154,7 +154,7 @@
             return x
         res = self.meta_interp(f, [100, 5])
         assert res == f(100, 5)
-        self.check_loop_count(0)
+        self.check_trace_count(0)
 
     def test_interp_single_loop(self):
         myjitdriver = JitDriver(greens = ['i'], reds = ['x', 'y'])
@@ -179,7 +179,7 @@
             return x
         res = self.meta_interp(f, [5, 8])
         assert res == 42
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         # the 'int_eq' and following 'guard' should be constant-folded
         if 'unroll' in self.enable_opts:
             self.check_resops(int_eq=0, guard_true=2, guard_false=0)
@@ -194,7 +194,10 @@
                     assert isinstance(liveboxes[0], history.BoxInt)
                     assert isinstance(liveboxes[1], history.BoxInt)
                     found += 1
-            assert found == 1
+            if 'unroll' in self.enable_opts:
+                assert found == 2
+            else:
+                assert found == 1
 
     def test_interp_many_paths(self):
         myjitdriver = JitDriver(greens = ['i'], reds = ['x', 'node'])
@@ -229,7 +232,7 @@
         expected = f(node1)
         res = self.meta_interp(f, [node1])
         assert res == expected
-        self.check_loop_count_at_most(19)
+        self.check_trace_count_at_most(19)
 
     def test_interp_many_paths_2(self):
         myjitdriver = JitDriver(greens = ['i'], reds = ['x', 'node'])
@@ -268,7 +271,7 @@
         expected = f(node1)
         res = self.meta_interp(f, [node1])
         assert res == expected
-        self.check_loop_count_at_most(19)
+        self.check_trace_count_at_most(19)
 
     def test_nested_loops(self):
         myjitdriver = JitDriver(greens = ['i'], reds = ['x', 'y'])
@@ -601,11 +604,11 @@
         assert res == expected
 
         if self.enable_opts:
-            self.check_loop_count(2)
-            self.check_tree_loop_count(2)   # 1 loop, 1 bridge from interp
+            self.check_trace_count(2)
+            self.check_jitcell_token_count(1)   # 1 loop with bridge from interp
         else:
-            self.check_loop_count(2)
-            self.check_tree_loop_count(1)   # 1 loop, callable from the interp
+            self.check_trace_count(2)
+            self.check_jitcell_token_count(1)   # 1 loop, callable from the interp
 
     def test_example(self):
         myjitdriver = JitDriver(greens = ['i'],
@@ -646,10 +649,10 @@
 
         res = self.meta_interp(main_interpreter_loop, [1])
         assert res == 102
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         if 'unroll' in self.enable_opts:
             self.check_resops({'int_add' : 6, 'int_gt' : 2,
-                               'guard_false' : 2, 'jump' : 2})
+                               'guard_false' : 2, 'jump' : 1})
         else:
             self.check_resops({'int_add' : 3, 'int_gt' : 1,
                                'guard_false' : 1, 'jump' : 1})
@@ -691,7 +694,7 @@
 
         res = self.meta_interp(main_interpreter_loop, [1])
         assert res == main_interpreter_loop(1)
-        self.check_loop_count(1)
+        self.check_trace_count(1)
         # These loops do different numbers of ops based on which optimizer we
         # are testing with.
         self.check_resops(self.automatic_promotion_result)
@@ -753,7 +756,7 @@
         res = self.meta_interp(interpret, [1])
         assert res == interpret(1)
         # XXX it's unsure how many loops should be there
-        self.check_loop_count(3)
+        self.check_trace_count(3)
 
     def test_path_with_operations_not_from_start(self):
         jitdriver = JitDriver(greens = ['k'], reds = ['n', 'z'])
diff --git a/pypy/jit/metainterp/test/test_loop_unroll.py b/pypy/jit/metainterp/test/test_loop_unroll.py
--- a/pypy/jit/metainterp/test/test_loop_unroll.py
+++ b/pypy/jit/metainterp/test/test_loop_unroll.py
@@ -8,7 +8,7 @@
     enable_opts = ALL_OPTS_NAMES
     
     automatic_promotion_result = {
-        'int_gt': 2, 'guard_false': 2, 'jump': 2, 'int_add': 6,
+        'int_gt': 2, 'guard_false': 2, 'jump': 1, 'int_add': 6,
         'guard_value': 1        
     }
 
diff --git a/pypy/jit/metainterp/test/test_memmgr.py b/pypy/jit/metainterp/test/test_memmgr.py
--- a/pypy/jit/metainterp/test/test_memmgr.py
+++ b/pypy/jit/metainterp/test/test_memmgr.py
@@ -14,7 +14,9 @@
 from pypy.jit.metainterp.memmgr import MemoryManager
 from pypy.jit.metainterp.test.support import LLJitMixin
 from pypy.rlib.jit import JitDriver, dont_look_inside
-
+from pypy.jit.metainterp.warmspot import get_stats
+from pypy.jit.metainterp.warmstate import JitCell
+from pypy.rlib import rgc
 
 class FakeLoopToken:
     generation = 0
@@ -81,6 +83,20 @@
     # See comments in TestMemoryManager.  To get temporarily the normal
     # behavior just rename this class to TestIntegration.
 
+    # We need an extra rgc.collect in get_procedure_token() for some of
+    # these tests to pass. But we don't want it there always since that will
+    # make all other tests take forever.
+    def setup_class(cls):
+        original_get_procedure_token = JitCell.get_procedure_token
+        def get_procedure_token(self):
+            rgc.collect();
+            return original_get_procedure_token(self)
+        JitCell.get_procedure_token = get_procedure_token
+        cls.original_get_procedure_token = original_get_procedure_token
+
+    def teardown_class(cls):
+        JitCell.get_procedure_token = cls.original_get_procedure_token
+
     def test_loop_kept_alive(self):
         myjitdriver = JitDriver(greens=[], reds=['n'])
         def g():
@@ -99,7 +115,7 @@
         assert res == 42
 
         # we should see only the loop and the entry bridge
-        self.check_tree_loop_count(2)
+        self.check_target_token_count(2)
 
     def test_target_loop_kept_alive_or_not(self):
         myjitdriver = JitDriver(greens=['m'], reds=['n'])
@@ -114,6 +130,8 @@
             # Depending on loop_longevity, either:
             # A. create the loop and the entry bridge for 'g(5)'
             # B. create 8 loops (and throw them away at each iteration)
+            #    Actually, it's 4 loops and 4 exit bridges thrown away
+            #    every second iteration
             for i in range(8):
                 g(5)
             # create another loop and another entry bridge for 'g(7)',
@@ -132,14 +150,15 @@
         # case A
         res = self.meta_interp(f, [], loop_longevity=3)
         assert res == 42
-        # we should see only the loop and the entry bridge for g(5) and g(7)
-        self.check_tree_loop_count(4)
+        # we should see only the loop with preamble and the exit bridge
+        # for g(5) and g(7)
+        self.check_enter_count(4)
 
         # case B, with a lower longevity
         res = self.meta_interp(f, [], loop_longevity=1)
         assert res == 42
         # we should see a loop for each call to g()
-        self.check_tree_loop_count(8 + 20*2*2)
+        self.check_enter_count(8 + 20*2)
 
     def test_throw_away_old_loops(self):
         myjitdriver = JitDriver(greens=['m'], reds=['n'])
@@ -152,9 +171,9 @@
             return 21
         def f():
             for i in range(10):
-                g(1)   # g(1) gets a loop and an entry bridge, stays alive
-                g(2)   # (and an exit bridge, which does not count in
-                g(1)   # check_tree_loop_count)
+                g(1)   # g(1) gets a loop with an entry bridge
+                g(2)   # and an exit bridge, stays alive
+                g(1)   
                 g(3)
                 g(1)
                 g(4)   # g(2), g(3), g(4), g(5) are thrown away every iteration
@@ -164,7 +183,7 @@
 
         res = self.meta_interp(f, [], loop_longevity=3)
         assert res == 42
-        self.check_tree_loop_count(2 + 10*4*2)
+        self.check_enter_count(2 + 10*4)
 
     def test_call_assembler_keep_alive(self):
         myjitdriver1 = JitDriver(greens=['m'], reds=['n'])
@@ -187,7 +206,7 @@
             return 21
         def f(u):
             for i in range(8):
-                h(u, 32)  # make a loop and an entry bridge for h(u)
+                h(u, 32)  # make a loop and an exit bridge for h(u)
             g(u, 8)       # make a loop for g(u) with a call_assembler
             g(u, 0); g(u+1, 0)     # \
             g(u, 0); g(u+2, 0)     #  \  make more loops for g(u+1) to g(u+4),
@@ -198,7 +217,12 @@
 
         res = self.meta_interp(f, [1], loop_longevity=4, inline=True)
         assert res == 42
-        self.check_tree_loop_count(12)
+        self.check_jitcell_token_count(6)
+        tokens = [t() for t in get_stats().jitcell_token_wrefs]
+        # Some loops have been freed
+        assert None in tokens
+        # Loop with number 0, h(), has not been freed
+        assert 0 in [t.number for t in tokens if t]
 
 # ____________________________________________________________
 
@@ -217,10 +241,17 @@
 if __name__ == '__main__':
     # occurs in the subprocess
     for test in [_TestMemoryManager(), _TestIntegration()]:
-        for name in dir(test):
-            if name.startswith('test_'):
-                print
-                print '-'*79
-                print '----- Now running test', name, '-----'
-                print
-                getattr(test, name)()
+        if hasattr(test, 'setup_class'):
+            test.setup_class()
+        try:
+            for name in dir(test):
+                if name.startswith('test_'):
+                    print
+                    print '-'*79
+                    print '----- Now running test', name, '-----'
+                    print
+                    getattr(test, name)()
+        finally:
+            if hasattr(test, 'teardown_class'):
+                test.teardown_class()
+            
diff --git a/pypy/jit/metainterp/test/test_quasiimmut.py b/pypy/jit/metainterp/test/test_quasiimmut.py
--- a/pypy/jit/metainterp/test/test_quasiimmut.py
+++ b/pypy/jit/metainterp/test/test_quasiimmut.py
@@ -294,7 +294,8 @@
             return total
 
         res = self.meta_interp(main, [])
-        self.check_tree_loop_count(6)
+        self.check_trace_count(6)
+        self.check_jitcell_token_count(3)
         assert res == main()
 
     def test_change_during_running(self):
@@ -305,7 +306,7 @@
                 self.a = a
         @dont_look_inside
         def residual_call(foo, x):
-            if x == 5:
+            if x == 10:
                 foo.a += 1


More information about the pypy-commit mailing list