[pypy-commit] pypy vmprof2: Backed out changeset a69d4a5a9638

fijal noreply at buildbot.pypy.org
Fri Apr 17 10:03:03 CEST 2015


Author: Maciej Fijalkowski <fijall at gmail.com>
Branch: vmprof2
Changeset: r76813:4b972671f5e2
Date: 2015-04-17 10:02 +0200
http://bitbucket.org/pypy/pypy/changeset/4b972671f5e2/

Log:	Backed out changeset a69d4a5a9638

diff too long, truncating to 2000 out of 2680 lines

diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -420,3 +420,10 @@
 the terms of the GPL license version 2 or any later version.  Thus the
 gdbm module, provided in the file lib_pypy/gdbm.py, is redistributed
 under the terms of the GPL license as well.
+
+License for 'pypy/module/_vmprof/src'
+--------------------------------------
+
+The code is based on gperftools. You may see a copy of the License for it at
+
+    https://code.google.com/p/gperftools/source/browse/COPYING
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -38,6 +38,9 @@
     "_csv", "cppyy", "_pypyjson"
 ])
 
+if sys.platform.startswith('linux') and sys.maxint > 2147483647:
+    working_modules.add('_vmprof')
+
 translation_modules = default_modules.copy()
 translation_modules.update([
     "fcntl", "time", "select", "signal", "_rawffi", "zlib", "struct", "_md5",
@@ -99,6 +102,7 @@
     "_hashlib"  : ["pypy.module._ssl.interp_ssl"],
     "_minimal_curses": ["pypy.module._minimal_curses.fficurses"],
     "_continuation": ["rpython.rlib.rstacklet"],
+    "_vmprof" : ["pypy.module._vmprof.interp_vmprof"],
     }
 
 def get_module_validator(modname):
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -9,6 +9,7 @@
 from rpython.rlib.signature import signature
 from rpython.rlib.rarithmetic import r_uint, SHRT_MIN, SHRT_MAX, \
     INT_MIN, INT_MAX, UINT_MAX, USHRT_MAX
+from rpython.rlib.rweaklist import RWeakListMixin
 
 from pypy.interpreter.executioncontext import (ExecutionContext, ActionFlag,
     UserDelAction)
@@ -366,6 +367,10 @@
 
 # ____________________________________________________________
 
+class CodeObjWeakList(RWeakListMixin):
+    def __init__(self):
+        self.initialize()
+
 class ObjSpace(object):
     """Base class for the interpreter-level implementations of object spaces.
     http://pypy.readthedocs.org/en/latest/objspace.html"""
@@ -389,6 +394,7 @@
         self.check_signal_action = None   # changed by the signal module
         self.user_del_action = UserDelAction(self)
         self._code_of_sys_exc_info = None
+        self.all_code_objs = CodeObjWeakList()
 
         # can be overridden to a subclass
         self.initialize()
@@ -666,6 +672,16 @@
             assert ec is not None
             return ec
 
+    def register_code_object(self, pycode):
+        callback = self.getexecutioncontext().register_code_callback
+        if callback is not None:
+            callback(self, pycode)
+        self.all_code_objs.add_handle(pycode)
+
+    def set_code_callback(self, callback):
+        ec = self.getexecutioncontext()
+        ec.register_code_callback = callback
+        
     def _freeze_(self):
         return True
 
diff --git a/pypy/interpreter/executioncontext.py b/pypy/interpreter/executioncontext.py
--- a/pypy/interpreter/executioncontext.py
+++ b/pypy/interpreter/executioncontext.py
@@ -33,6 +33,11 @@
         self.profilefunc = None
         self.w_profilefuncarg = None
         self.thread_disappeared = False   # might be set to True after os.fork()
+        self.register_code_callback = None
+        if sys.maxint == 2147483647:
+            self._code_unique_id = 0 # XXX this is wrong, it won't work on 32bit
+        else:
+            self._code_unique_id = 0x7000000000000000
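+        # ids are handed out in steps of 2 (see pycode.py), so on 64-bit
+        # consecutive code objects get 0x7000000000000000, +2, +4, ...,
+        # keeping the lowest bit free as a marker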
 
     @staticmethod
     def _mark_thread_disappeared(space):
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -14,9 +14,10 @@
     CO_OPTIMIZED, CO_NEWLOCALS, CO_VARARGS, CO_VARKEYWORDS, CO_NESTED,
     CO_GENERATOR, CO_KILL_DOCSTRING, CO_YIELD_INSIDE_TRY)
 from pypy.tool.stdlib_opcode import opcodedesc, HAVE_ARGUMENT
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, r_longlong
 from rpython.rlib.objectmodel import compute_hash
 from rpython.rlib import jit
+from rpython.rlib.debug import debug_start, debug_stop, debug_print
 
 
 class BytecodeCorruption(Exception):
@@ -54,8 +55,9 @@
     "CPython-style code objects."
     _immutable_ = True
     _immutable_fields_ = ["co_consts_w[*]", "co_names_w[*]", "co_varnames[*]",
-                          "co_freevars[*]", "co_cellvars[*]", "_args_as_cellvars[*]"]
-
+                          "co_freevars[*]", "co_cellvars[*]",
+                          "_args_as_cellvars[*]"]
+    
     def __init__(self, space,  argcount, nlocals, stacksize, flags,
                      code, consts, names, varnames, filename,
                      name, firstlineno, lnotab, freevars, cellvars,
@@ -83,6 +85,7 @@
         self.magic = magic
         self._signature = cpython_code_signature(self)
         self._initialize()
+        space.register_code_object(self)
 
     def _initialize(self):
         if self.co_cellvars:
@@ -124,6 +127,15 @@
             from pypy.objspace.std.mapdict import init_mapdict_cache
             init_mapdict_cache(self)
 
+        ec = self.space.getexecutioncontext()
+        self._unique_id = ec._code_unique_id
+        # bump by 2, so that the lowest bit stays free for marking stuff
+        ec._code_unique_id += 2
+
+    def _get_full_name(self):
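+        # illustrative result: "py:foo:42:/tmp/example.py"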
+        return "py:%s:%d:%s" % (self.co_name, self.co_firstlineno,
+                                self.co_filename)
+
     def _cleanup_(self):
         if (self.magic == cpython_magic and
             '__pypy__' not in sys.builtin_module_names):
diff --git a/pypy/module/_vmprof/__init__.py b/pypy/module/_vmprof/__init__.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/__init__.py
@@ -0,0 +1,18 @@
+from pypy.interpreter.mixedmodule import MixedModule
+
+class Module(MixedModule):
+    """
+    Write me :)
+    """
+    appleveldefs = {
+    }
+
+    interpleveldefs = {
+        'enable': 'interp_vmprof.enable',
+        'disable': 'interp_vmprof.disable',
+    }
+
+    def setup_after_space_initialization(self):
+        # force the __extend__ hacks to occur early
+        from pypy.module._vmprof.interp_vmprof import VMProf
+        self.vmprof = VMProf()
diff --git a/pypy/module/_vmprof/interp_vmprof.py b/pypy/module/_vmprof/interp_vmprof.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/interp_vmprof.py
@@ -0,0 +1,223 @@
+import py, os, sys
+from rpython.rtyper.lltypesystem import lltype, rffi, llmemory
+from rpython.translator.tool.cbuild import ExternalCompilationInfo
+from rpython.rtyper.annlowlevel import cast_instance_to_gcref, cast_base_ptr_to_instance
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.rlib import jit, rposix, entrypoint
+from rpython.rtyper.tool import rffi_platform as platform
+from rpython.rlib.rstring import StringBuilder
+from pypy.interpreter.baseobjspace import W_Root
+from pypy.interpreter.error import oefmt, wrap_oserror, OperationError
+from pypy.interpreter.gateway import unwrap_spec
+from pypy.interpreter.pyframe import PyFrame
+
+ROOT = py.path.local(__file__).join('..')
+SRC = ROOT.join('src')
+
+# by default, we statically link vmprof.c into pypy; however, if you set
+# DYNAMIC_VMPROF to True, it will be dynamically linked against libvmprof.so,
+# which is expected to live inside pypy/module/_vmprof/src: this is very
+# useful during development. Note that you have to build libvmprof manually,
+# by running make inside the src dir.
+DYNAMIC_VMPROF = False
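+#
+# an illustrative development workflow (hypothetical paths/commands):
+#     cd pypy/module/_vmprof/src && make      # builds libvmprof.so
+# then flip DYNAMIC_VMPROF to True above and rebuild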
+
+eci_kwds = dict(
+    include_dirs = [SRC],
+    includes = ['vmprof.h', 'trampoline.h'],
+    separate_module_files = [SRC.join('trampoline.asmgcc.s')],
+    libraries = ['unwind'],
+    
+    post_include_bits=["""
+        void* pypy_vmprof_get_virtual_ip(void*);
+        void pypy_vmprof_init(void);
+    """],
+    
+    separate_module_sources=["""
+        void pypy_vmprof_init(void) {
+            vmprof_set_mainloop(pypy_execute_frame_trampoline, 0,
+                                pypy_vmprof_get_virtual_ip);
+        }
+    """],
+    )
+
+
+if DYNAMIC_VMPROF:
+    eci_kwds['libraries'] += ['vmprof']
+    eci_kwds['link_extra'] = ['-Wl,-rpath,%s' % SRC, '-L%s' % SRC]
+else:
+    eci_kwds['separate_module_files'] += [SRC.join('vmprof.c')]
+
+eci = ExternalCompilationInfo(**eci_kwds)
+
+check_eci = eci.merge(ExternalCompilationInfo(separate_module_files=[
+    SRC.join('fake_pypy_api.c')]))
+
+platform.verify_eci(check_eci)
+
+pypy_execute_frame_trampoline = rffi.llexternal(
+    "pypy_execute_frame_trampoline",
+    [llmemory.GCREF, llmemory.GCREF, llmemory.GCREF],
+    llmemory.GCREF,
+    compilation_info=eci,
+    _nowrapper=True, sandboxsafe=True,
+    random_effects_on_gcobjs=True)
+
+pypy_vmprof_init = rffi.llexternal("pypy_vmprof_init", [], lltype.Void,
+                                   compilation_info=eci)
+vmprof_enable = rffi.llexternal("vmprof_enable",
+                                [rffi.INT, rffi.LONG, rffi.INT,
+                                 rffi.CCHARP, rffi.INT],
+                                rffi.INT, compilation_info=eci,
+                                save_err=rffi.RFFI_SAVE_ERRNO)
+vmprof_disable = rffi.llexternal("vmprof_disable", [], rffi.INT,
+                                 compilation_info=eci,
+                                save_err=rffi.RFFI_SAVE_ERRNO)
+
+vmprof_register_virtual_function = rffi.llexternal(
+    "vmprof_register_virtual_function",
+    [rffi.CCHARP, rffi.VOIDP, rffi.VOIDP], lltype.Void,
+    compilation_info=eci, _nowrapper=True)
+
+original_execute_frame = PyFrame.execute_frame.im_func
+original_execute_frame.c_name = 'pypy_pyframe_execute_frame'
+original_execute_frame._dont_inline_ = True
+
+class __extend__(PyFrame):
+    def execute_frame(frame, w_inputvalue=None, operr=None):
+        # go through the asm trampoline ONLY if we are translated but not being JITted.
+        #
+        # If we are not translated, we obviously don't want to go through the
+        # trampoline because there is no C function it can call.
+        #
+        # If we are being JITted, we want to skip the trampoline, else the JIT
+        # cannot see through it.
+        if we_are_translated() and not jit.we_are_jitted():
+            # if we are translated, call the trampoline
+            gc_frame = cast_instance_to_gcref(frame)
+            gc_inputvalue = cast_instance_to_gcref(w_inputvalue)
+            gc_operr = cast_instance_to_gcref(operr)
+            gc_result = pypy_execute_frame_trampoline(gc_frame, gc_inputvalue, gc_operr)
+            return cast_base_ptr_to_instance(W_Root, gc_result)
+        else:
+            return original_execute_frame(frame, w_inputvalue, operr)
+
+
+@entrypoint.entrypoint_lowlevel('main', [llmemory.GCREF],
+                                'pypy_vmprof_get_virtual_ip', True)
+def get_virtual_ip(gc_frame):
+    frame = cast_base_ptr_to_instance(PyFrame, gc_frame)
+    if jit._get_virtualizable_token(frame):
+        return rffi.cast(rffi.VOIDP, 0)
+    virtual_ip = do_get_virtual_ip(frame)
+    return rffi.cast(rffi.VOIDP, virtual_ip)
+
+def do_get_virtual_ip(frame):
+    return frame.pycode._unique_id
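+    # the "virtual ip" is just the code object's _unique_id; on 64-bit these
+    # start at 0x7000000000000000 (see executioncontext.py), which presumably
+    # keeps them apart from real machine-code addresses in the samples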
+
+def write_long_to_string_builder(l, b):
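+    # appends l to the builder as a little-endian machine word (4 or 8 bytes,
+    # depending on sys.maxint); e.g. l=0x0102 yields '\x02\x01\x00\x00'
+    # on 32-bit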
+    if sys.maxint == 2147483647:
+        b.append(chr(l & 0xff))
+        b.append(chr((l >> 8) & 0xff))
+        b.append(chr((l >> 16) & 0xff))
+        b.append(chr((l >> 24) & 0xff))
+    else:
+        b.append(chr(l & 0xff))
+        b.append(chr((l >> 8) & 0xff))
+        b.append(chr((l >> 16) & 0xff))
+        b.append(chr((l >> 24) & 0xff))
+        b.append(chr((l >> 32) & 0xff))
+        b.append(chr((l >> 40) & 0xff))
+        b.append(chr((l >> 48) & 0xff))
+        b.append(chr((l >> 56) & 0xff))
+
+class VMProf(object):
+    def __init__(self):
+        self.is_enabled = False
+        self.ever_enabled = False
+        self.mapping_so_far = [] # stored mapping in between runs
+        self.fileno = -1
+
+    def enable(self, space, fileno, period):
+        if self.is_enabled:
+            raise oefmt(space.w_ValueError, "_vmprof already enabled")
+        self.fileno = fileno
+        self.is_enabled = True
+        self.write_header(fileno, period)
+        if not self.ever_enabled:
+            if we_are_translated():
+                pypy_vmprof_init()
+            self.ever_enabled = True
+        for weakcode in space.all_code_objs.get_all_handles():
+            code = weakcode()
+            if code:
+                self.register_code(space, code)
+        space.set_code_callback(vmprof_register_code)
+        if we_are_translated():
+            # does not work untranslated
+            res = vmprof_enable(fileno, period, 0,
+                                lltype.nullptr(rffi.CCHARP.TO), 0)
+        else:
+            res = 0
+        if res == -1:
+            raise wrap_oserror(space, OSError(rposix.get_saved_errno(),
+                                              "_vmprof.enable"))
+
+    def write_header(self, fileno, period):
+        if period == -1:
+            period_usec = 1000000 / 100 #  100hz
+        else:
+            period_usec = period
+        b = StringBuilder()
+        write_long_to_string_builder(0, b)
+        write_long_to_string_builder(3, b)
+        write_long_to_string_builder(0, b)
+        write_long_to_string_builder(period_usec, b)
+        write_long_to_string_builder(0, b)
+        os.write(fileno, b.build())
+
+    def register_code(self, space, code):
+        if self.fileno == -1:
+            raise OperationError(space.w_RuntimeError,
+                                 space.wrap("vmprof not running"))
+        name = code._get_full_name()
+        b = StringBuilder()
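+        # code record layout: MARKER_VIRTUAL_IP ('\x02'), then the unique id
+        # and the name length as machine words, then the name itself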
+        b.append('\x02')
+        write_long_to_string_builder(code._unique_id, b)
+        write_long_to_string_builder(len(name), b)
+        b.append(name)
+        os.write(self.fileno, b.build())
+
+    def disable(self, space):
+        if not self.is_enabled:
+            raise oefmt(space.w_ValueError, "_vmprof not enabled")
+        self.is_enabled = False
+        self.fileno = -1
+        if we_are_translated():
+           # does not work untranslated
+            res = vmprof_disable()
+        else:
+            res = 0
+        space.set_code_callback(None)
+        if res == -1:
+            raise wrap_oserror(space, OSError(rposix.get_saved_errno(),
+                                              "_vmprof.disable"))
+
+def vmprof_register_code(space, code):
+    from pypy.module._vmprof import Module
+    mod_vmprof = space.getbuiltinmodule('_vmprof')
+    assert isinstance(mod_vmprof, Module)
+    mod_vmprof.vmprof.register_code(space, code)
+        
+@unwrap_spec(fileno=int, period=int)
+def enable(space, fileno, period=-1):
+    from pypy.module._vmprof import Module
+    mod_vmprof = space.getbuiltinmodule('_vmprof')
+    assert isinstance(mod_vmprof, Module)
+    mod_vmprof.vmprof.enable(space, fileno, period)
+
+def disable(space):
+    from pypy.module._vmprof import Module
+    mod_vmprof = space.getbuiltinmodule('_vmprof')
+    assert isinstance(mod_vmprof, Module)
+    mod_vmprof.vmprof.disable(space)
+
diff --git a/pypy/module/_vmprof/src/config.h b/pypy/module/_vmprof/src/config.h
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/config.h
@@ -0,0 +1,2 @@
+#define HAVE_SYS_UCONTEXT_H
+#define PC_FROM_UCONTEXT uc_mcontext.gregs[REG_RIP]
diff --git a/pypy/module/_vmprof/src/fake_pypy_api.c b/pypy/module/_vmprof/src/fake_pypy_api.c
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/fake_pypy_api.c
@@ -0,0 +1,29 @@
+
+long pypy_jit_start_addr(void)
+{
+	return 3;
+}
+
+long pypy_jit_end_addr(void)
+{
+	return 3;
+}
+
+long pypy_jit_stack_depth_at_loc(long x)
+{
+	return 0;
+}
+
+long pypy_find_codemap_at_addr(long x)
+{
+	return 0;
+}
+
+long pypy_yield_codemap_at_addr(long x, long y, long *a)
+{
+	return 0;
+}
+
+void pypy_pyframe_execute_frame(void)
+{
+}
diff --git a/pypy/module/_vmprof/src/get_custom_offset.c b/pypy/module/_vmprof/src/get_custom_offset.c
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/get_custom_offset.c
@@ -0,0 +1,46 @@
+
+long pypy_jit_start_addr();
+long pypy_jit_end_addr();
+long pypy_jit_stack_depth_at_loc(long);
+long pypy_find_codemap_at_addr(long);
+long pypy_yield_codemap_at_addr(long, long, long*);
+
+void vmprof_set_tramp_range(void* start, void* end)
+{
+}
+
+static ptrdiff_t vmprof_unw_get_custom_offset(void* ip, unw_cursor_t *cp) {
+	intptr_t ip_l = (intptr_t)ip;
+
+	if (ip_l < pypy_jit_start_addr() || ip_l > pypy_jit_end_addr()) {
+		return -1;
+	}
+	return (ptrdiff_t)pypy_jit_stack_depth_at_loc(ip_l);
+}
+
+static long vmprof_write_header_for_jit_addr(void **result, long n,
+											 void *ip, int max_depth)
+{
+	long codemap_pos;
+	long current_pos = 0;
+	intptr_t id;
+	intptr_t addr = (intptr_t)ip;
+
+	if (addr < pypy_jit_start_addr() || addr > pypy_jit_end_addr()) {
+		return n;
+	}
+	codemap_pos = pypy_find_codemap_at_addr(addr);
+	if (codemap_pos == -1) {
+		return n;
+	}
+	while (1) {
+		id = pypy_yield_codemap_at_addr(codemap_pos, addr, &current_pos);
+		if (id == 0) {
+			return n;
+		}
+		result[n++] = (void*)id;
+		if (n >= max_depth) {
+			return n;
+		}
+	}
+}
diff --git a/pypy/module/_vmprof/src/getpc.h b/pypy/module/_vmprof/src/getpc.h
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/getpc.h
@@ -0,0 +1,187 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// This is an internal header file used by profiler.cc.  It defines
+// the single (inline) function GetPC.  GetPC is used in a signal
+// handler to figure out the instruction that was being executed when
+// the signal-handler was triggered.
+//
+// To get this, we use the ucontext_t argument to the signal-handler
+// callback, which holds the full context of what was going on when
+// the signal triggered.  How to get from a ucontext_t to a Program
+// Counter is OS-dependent.
+
+#ifndef BASE_GETPC_H_
+#define BASE_GETPC_H_
+
+#include "config.h"
+
+// On many linux systems, we may need _GNU_SOURCE to get access to
+// the defined constants that define the register we want to see (eg
+// REG_EIP).  Note this #define must come first!
+#define _GNU_SOURCE 1
+// If #define _GNU_SOURCE causes problems, this might work instead.
+// It will cause problems for FreeBSD though!, because it turns off
+// the needed __BSD_VISIBLE.
+//#define _XOPEN_SOURCE 500
+
+#include <string.h>         // for memcmp
+#if defined(HAVE_SYS_UCONTEXT_H)
+#include <sys/ucontext.h>
+#elif defined(HAVE_UCONTEXT_H)
+#include <ucontext.h>       // for ucontext_t (and also mcontext_t)
+#elif defined(HAVE_CYGWIN_SIGNAL_H)
+#include <cygwin/signal.h>
+typedef ucontext ucontext_t;
+#endif
+
+
+// Take the example where function Foo() calls function Bar().  For
+// many architectures, Bar() is responsible for setting up and tearing
+// down its own stack frame.  In that case, it's possible for the
+// interrupt to happen when execution is in Bar(), but the stack frame
+// is not properly set up (either before it's done being set up, or
+// after it's been torn down but before Bar() returns).  In those
+// cases, the stack trace cannot see the caller function anymore.
+//
+// GetPC can try to identify this situation, on architectures where it
+// might occur, and unwind the current function call in that case to
+// avoid false edges in the profile graph (that is, edges that appear
+// to show a call skipping over a function).  To do this, we hard-code
+// in the asm instructions we might see when setting up or tearing
+// down a stack frame.
+//
+// This is difficult to get right: the instructions depend on the
+// processor, the compiler ABI, and even the optimization level.  This
+// is a best effort patch -- if we fail to detect such a situation, or
+// mess up the PC, nothing happens; the returned PC is not used for
+// any further processing.
+struct CallUnrollInfo {
+  // Offset from (e)ip register where this instruction sequence
+  // should be matched. Interpreted as bytes. Offset 0 is the next
+  // instruction to execute. Be extra careful with negative offsets in
+  // architectures of variable instruction length (like x86) - it is
+  // not that easy as taking an offset to step one instruction back!
+  int pc_offset;
+  // The actual instruction bytes. Feel free to make it larger if you
+  // need a longer sequence.
+  unsigned char ins[16];
+  // How many bytes to match from ins array?
+  int ins_size;
+  // The offset from the stack pointer (e)sp where to look for the
+  // call return address. Interpreted as bytes.
+  int return_sp_offset;
+};
+
+
+// The dereferences needed to get the PC from a struct ucontext were
+// determined at configure time, and stored in the macro
+// PC_FROM_UCONTEXT in config.h.  The only thing we need to do here,
+// then, is to do the magic call-unrolling for systems that support it.
+
+// -- Special case 1: linux x86, for which we have CallUnrollInfo
+#if defined(__linux) && defined(__i386) && defined(__GNUC__)
+static const CallUnrollInfo callunrollinfo[] = {
+  // Entry to a function:  push %ebp;  mov  %esp,%ebp
+  // Top-of-stack contains the caller IP.
+  { 0,
+    {0x55, 0x89, 0xe5}, 3,
+    0
+  },
+  // Entry to a function, second instruction:  push %ebp;  mov  %esp,%ebp
+  // Top-of-stack contains the old frame, caller IP is +4.
+  { -1,
+    {0x55, 0x89, 0xe5}, 3,
+    4
+  },
+  // Return from a function: RET.
+  // Top-of-stack contains the caller IP.
+  { 0,
+    {0xc3}, 1,
+    0
+  }
+};
+
+inline void* GetPC(ucontext_t *signal_ucontext) {
+  // See comment above struct CallUnrollInfo.  Only try instruction
+  // flow matching if both eip and esp looks reasonable.
+  const int eip = signal_ucontext->uc_mcontext.gregs[REG_EIP];
+  const int esp = signal_ucontext->uc_mcontext.gregs[REG_ESP];
+  if ((eip & 0xffff0000) != 0 && (~eip & 0xffff0000) != 0 &&
+      (esp & 0xffff0000) != 0) {
+    char* eip_char = reinterpret_cast<char*>(eip);
+    for (int i = 0; i < sizeof(callunrollinfo)/sizeof(*callunrollinfo); ++i) {
+      if (!memcmp(eip_char + callunrollinfo[i].pc_offset,
+                  callunrollinfo[i].ins, callunrollinfo[i].ins_size)) {
+        // We have a match.
+        void **retaddr = (void**)(esp + callunrollinfo[i].return_sp_offset);
+        return *retaddr;
+      }
+    }
+  }
+  return (void*)eip;
+}
+
+// Special case #2: Windows, which has to do something totally different.
+#elif defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__MINGW32__)
+// If this is ever implemented, probably the way to do it is to have
+// profiler.cc use a high-precision timer via timeSetEvent:
+//    http://msdn2.microsoft.com/en-us/library/ms712713.aspx
+// We'd use it in mode TIME_CALLBACK_FUNCTION/TIME_PERIODIC.
+// The callback function would be something like prof_handler, but
+// alas the arguments are different: no ucontext_t!  I don't know
+// how we'd get the PC (using StackWalk64?)
+//    http://msdn2.microsoft.com/en-us/library/ms680650.aspx
+
+#include "base/logging.h"   // for RAW_LOG
+#ifndef HAVE_CYGWIN_SIGNAL_H
+typedef int ucontext_t;
+#endif
+
+inline void* GetPC(ucontext_t *signal_ucontext) {
+  RAW_LOG(ERROR, "GetPC is not yet implemented on Windows\n");
+  return NULL;
+}
+
+// Normal cases.  If this doesn't compile, it's probably because
+// PC_FROM_UCONTEXT is the empty string.  You need to figure out
+// the right value for your system, and add it to the list in
+// configure.ac (or set it manually in your config.h).
+#else
+inline void* GetPC(ucontext_t *signal_ucontext) {
+  return (void*)signal_ucontext->PC_FROM_UCONTEXT;   // defined in config.h
+}
+
+#endif
+
+#endif  // BASE_GETPC_H_
diff --git a/pypy/module/_vmprof/src/trampoline.asmgcc.s b/pypy/module/_vmprof/src/trampoline.asmgcc.s
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/trampoline.asmgcc.s
@@ -0,0 +1,17 @@
+// NOTE: you need to use TABs, not spaces!
+        
+	.text
+	.p2align 4,,-1
+	.globl	pypy_execute_frame_trampoline
+	.type	pypy_execute_frame_trampoline, @function
+pypy_execute_frame_trampoline:
+	.cfi_startproc
+	pushq	%rdi
+	.cfi_def_cfa_offset 16
+	call pypy_pyframe_execute_frame@PLT
+	/* GCROOT 0(%rsp) */
+	popq	%rdi
+	.cfi_def_cfa_offset 8
+	ret
+	.cfi_endproc
+	.size	pypy_execute_frame_trampoline, .-pypy_execute_frame_trampoline
diff --git a/pypy/module/_vmprof/src/trampoline.h b/pypy/module/_vmprof/src/trampoline.h
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/trampoline.h
@@ -0,0 +1,1 @@
+void* pypy_execute_frame_trampoline(void*, void*, void*);
diff --git a/pypy/module/_vmprof/src/vmprof.c b/pypy/module/_vmprof/src/vmprof.c
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/vmprof.c
@@ -0,0 +1,340 @@
+/* VMPROF
+ *
+ * statistical sampling profiler specifically designed to profile programs
+ * which run on a Virtual Machine and/or bytecode interpreter, such as Python,
+ * etc.
+ *
+ * The logic to dump the C stack traces is partly stolen from the code in gperftools.
+ * The file "getpc.h" has been entirely copied from gperftools.
+ *
+ * Tested only on gcc, linux, x86_64.
+ *
+ * Copyright (C) 2014-2015
+ *   Antonio Cuni - anto.cuni@gmail.com
+ *   Maciej Fijalkowski - fijall@gmail.com
+ *
+ */
+
+
+#include "getpc.h"      // should be first to get the _GNU_SOURCE dfn
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#define UNW_LOCAL_ONLY
+#include <libunwind.h>
+
+#include "vmprof.h"
+
+#define _unused(x) ((void)x)
+
+#define MAX_FUNC_NAME 128
+#define MAX_STACK_DEPTH 64
+
+static FILE* profile_file = NULL;
+void* vmprof_mainloop_func;
+static ptrdiff_t mainloop_sp_offset;
+static vmprof_get_virtual_ip_t mainloop_get_virtual_ip;
+
+
+/* *************************************************************
+ * functions to write a profile file compatible with gperftools
+ * *************************************************************
+ */
+
+#define MARKER_STACKTRACE '\x01'
+#define MARKER_VIRTUAL_IP '\x02'
+#define MARKER_TRAILER '\x03'
+
+static void prof_word(FILE* f, long x) {
+    fwrite(&x, sizeof(x), 1, f);
+}
+
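+/* the five header words follow the gperftools CPU-profile convention:
+   count 0, header slots 3, format version 0, sampling period in usec,
+   and a trailing 0 as padding */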
+static void prof_header(FILE* f, long period_usec) {
+    prof_word(f, 0);
+    prof_word(f, 3);
+    prof_word(f, 0);
+    prof_word(f, period_usec);
+    prof_word(f, 0);
+}
+
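+/* one stacktrace record: MARKER_STACKTRACE, then the sample count and the
+   depth as words, followed by `depth' ip values */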
+static void prof_write_stacktrace(FILE* f, void** stack, int depth, int count) {
+    int i;
+	char marker = MARKER_STACKTRACE;
+
+	fwrite(&marker, 1, 1, f);
+    prof_word(f, count);
+    prof_word(f, depth);
+    for(i=0; i<depth; i++)
+        prof_word(f, (long)stack[i]);
+}
+
+
+/* ******************************************************
+ * libunwind workaround to process JIT frames correctly
+ * ******************************************************
+ */
+
+#include "get_custom_offset.c"
+
+typedef struct {
+    void* _unused1;
+    void* _unused2;
+    void* sp;
+    void* ip;
+    void* _unused3[sizeof(unw_cursor_t)/sizeof(void*) - 4];
+} vmprof_hacked_unw_cursor_t;
+
+static int vmprof_unw_step(unw_cursor_t *cp) {
+	void* ip;
+    void* sp;
+    ptrdiff_t sp_offset;
+    unw_get_reg (cp, UNW_REG_IP, (unw_word_t*)&ip);
+    unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp);
+    sp_offset = vmprof_unw_get_custom_offset(ip, cp);
+
+    if (sp_offset == -1) {
+        // it means that the ip is NOT in JITted code, so we can use the
+        // standard unw_step
+        return unw_step(cp);
+    }
+    else {
+        // this is a horrible hack to manually walk the stack frame, by
+        // setting the IP and SP in the cursor
+        vmprof_hacked_unw_cursor_t *cp2 = (vmprof_hacked_unw_cursor_t*)cp;
+        void* bp = (void*)sp + sp_offset;
+        cp2->sp = bp;
+        bp -= sizeof(void*);
+        cp2->ip = ((void**)bp)[0];
+        // the return address is at the top of the stack, one WORD down
+        return 1;
+    }
+}
+
+
+/* *************************************************************
+ * functions to dump the stack trace
+ * *************************************************************
+ */
+
+// stolen from pprof:
+// Sometimes, we can try to get a stack trace from within a stack
+// trace, because libunwind can call mmap (maybe indirectly via an
+// internal mmap based memory allocator), and that mmap gets trapped
+// and causes a stack-trace request.  If we were to try to honor that
+// recursive request, we'd end up with infinite recursion or deadlock.
+// Luckily, it's safe to ignore those subsequent traces.  In such
+// cases, we return 0 to indicate the situation.
+//static __thread int recursive;
+static int recursive; // XXX antocuni: removed __thread
+
+int get_stack_trace(void** result, int max_depth, ucontext_t *ucontext) {
+    void *ip;
+    int n = 0;
+    unw_cursor_t cursor;
+    unw_context_t uc = *ucontext;
+    if (recursive) {
+        return 0;
+    }
+    ++recursive;
+
+    int ret = unw_init_local(&cursor, &uc);
+    assert(ret >= 0);
+    _unused(ret);
+
+    while (n < max_depth) {
+        if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
+            break;
+        }
+
+        unw_proc_info_t pip;
+        unw_get_proc_info(&cursor, &pip);
+
+        /* char funcname[4096]; */
+        /* unw_word_t offset; */
+        /* unw_get_proc_name(&cursor, funcname, 4096, &offset); */
+        /* printf("%s+%#lx <%p>\n", funcname, offset, ip); */
+
+        /* if n==0, it means that the signal handler interrupted us while we
+           were in the trampoline, so we are not executing (yet) the real main
+           loop function; just skip it */
+        if (vmprof_mainloop_func && 
+            (void*)pip.start_ip == (void*)vmprof_mainloop_func &&
+            n > 0) {
+          // found main loop stack frame
+          void* sp;
+          unw_get_reg(&cursor, UNW_REG_SP, (unw_word_t *) &sp);
+          void *arg_addr = (char*)sp + mainloop_sp_offset;
+          void **arg_ptr = (void**)arg_addr;
+          // fprintf(stderr, "stacktrace mainloop: rsp %p   &f2 %p   offset %ld\n", 
+          //         sp, arg_addr, mainloop_sp_offset);
+          ip = mainloop_get_virtual_ip(*arg_ptr);
+        }
+
+        result[n++] = ip;
+		n = vmprof_write_header_for_jit_addr(result, n, ip, max_depth);
+        if (vmprof_unw_step(&cursor) <= 0) {
+            break;
+        }
+    }
+    --recursive;
+    return n;
+}
+
+
+static int __attribute__((noinline)) frame_forcer(int rv) {
+    return rv;
+}
+
+static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext) {
+    void* stack[MAX_STACK_DEPTH];
+    stack[0] = GetPC((ucontext_t*)ucontext);
+    int depth = frame_forcer(get_stack_trace(stack+1, MAX_STACK_DEPTH-1, ucontext));
+    depth++;  // to account for the PC value in stack[0]
+    prof_write_stacktrace(profile_file, stack, depth, 1);
+}
+
+/* *************************************************************
+ * functions to enable/disable the profiler
+ * *************************************************************
+ */
+
+static int open_profile(int fd, long period_usec, int write_header, char *s,
+						int slen) {
+	if ((fd = dup(fd)) == -1) {
+		return -1;
+	}
+    profile_file = fdopen(fd, "wb");
+	if (!profile_file) {
+		return -1;
+	}
+	if (write_header)
+		prof_header(profile_file, period_usec);
+	if (s)
+		fwrite(s, slen, 1, profile_file);
+	return 0;
+}
+
+static int close_profile(void) {
+	// XXX all of this can happily fail
+    FILE* src;
+    char buf[BUFSIZ];
+    size_t size;
+	char marker = MARKER_TRAILER;
+	fwrite(&marker, 1, 1, profile_file);
+
+    // copy /proc/PID/maps to the end of the profile file
+    sprintf(buf, "/proc/%d/maps", getpid());
+    src = fopen(buf, "r");    
+    while ((size = fread(buf, 1, BUFSIZ, src))) {
+        fwrite(buf, 1, size, profile_file);
+    }
+    fclose(src);
+    fclose(profile_file);
+	return 0;
+}
+
+
+static int install_sigprof_handler(void) {
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_sigaction = sigprof_handler;
+    sa.sa_flags = SA_RESTART | SA_SIGINFO;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+		sigaction(SIGPROF, &sa, NULL) == -1) {
+		return -1;
+	}
+	return 0;
+}
+
+static int remove_sigprof_handler(void) {
+    //sighandler_t res = signal(SIGPROF, SIG_DFL);
+	//if (res == SIG_ERR) {
+	//	return -1;
+	//}
+	return 0;
+};
+
+static int install_sigprof_timer(long period_usec) {
+    static struct itimerval timer;
+    timer.it_interval.tv_sec = 0;
+    timer.it_interval.tv_usec = period_usec;
+    timer.it_value = timer.it_interval;
+    if (setitimer(ITIMER_PROF, &timer, NULL) != 0) {
+		return -1;
+    }
+	return 0;
+}
+
+static int remove_sigprof_timer(void) {
+    static struct itimerval timer;
+    timer.it_interval.tv_sec = 0;
+    timer.it_interval.tv_usec = 0;
+    timer.it_value = timer.it_interval;
+    if (setitimer(ITIMER_PROF, &timer, NULL) != 0) {
+		return -1;
+    }
+	return 0;
+}
+
+/* *************************************************************
+ * public API
+ * *************************************************************
+ */
+
+void vmprof_set_mainloop(void* func, ptrdiff_t sp_offset, 
+                         vmprof_get_virtual_ip_t get_virtual_ip) {
+    mainloop_sp_offset = sp_offset;
+    mainloop_get_virtual_ip = get_virtual_ip;
+    vmprof_mainloop_func = func;
+}
+
+int vmprof_enable(int fd, long period_usec, int write_header, char *s,
+				  int slen)
+{
+    if (period_usec == -1)
+        period_usec = 1000000 / 100; /* 100hz */
+    if (open_profile(fd, period_usec, write_header, s, slen) == -1) {
+		return -1;
+	}
+    if (install_sigprof_handler() == -1) {
+		return -1;
+	}
+    if (install_sigprof_timer(period_usec) == -1) {
+		return -1;
+	}
+	return 0;
+}
+
+int vmprof_disable(void) {
+    if (remove_sigprof_timer() == -1) {
+		return -1;
+	}
+    if (remove_sigprof_handler() == -1) {
+		return -1;
+	}
+    if (close_profile() == -1) {
+		return -1;
+	}
+	return 0;
+}
+
+void vmprof_register_virtual_function(const char* name, void* start, void* end) {
+    // for now *end is simply ignored
+	char buf[1024];
+	int lgt = strlen(name) + 2 * sizeof(long) + 1;
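+	/* buf layout: MARKER_VIRTUAL_IP, then the start address and the name
+	   length as machine words, then the (possibly truncated) name */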
+
+	if (lgt > 1024) {
+		lgt = 1024;
+	}
+	buf[0] = MARKER_VIRTUAL_IP;
+	((void **)(((void*)buf) + 1))[0] = start;
+	((long *)(((void*)buf) + 1 + sizeof(long)))[0] = lgt - 2 * sizeof(long) - 1;
+	strncpy(buf + 2 * sizeof(long) + 1, name, 1024 - 2 * sizeof(long) - 1);
+	fwrite(buf, lgt, 1, profile_file);
+}
diff --git a/pypy/module/_vmprof/src/vmprof.h b/pypy/module/_vmprof/src/vmprof.h
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/vmprof.h
@@ -0,0 +1,22 @@
+#ifndef VMPROF_VMPROF_H_
+#define VMPROF_VMPROF_H_
+
+#include <stddef.h>
+
+typedef void* (*vmprof_get_virtual_ip_t)(void*);
+
+extern void* vmprof_mainloop_func;
+void vmprof_set_mainloop(void* func, ptrdiff_t sp_offset, 
+                         vmprof_get_virtual_ip_t get_virtual_ip);
+
+void vmprof_register_virtual_function(const char* name, void* start, void* end);
+
+
+int vmprof_enable(int fd, long period_usec, int write_header, char* vips,
+				  int vips_len);
+int vmprof_disable(void);
+
+// XXX: this should be part of _vmprof (the CPython extension), not vmprof (the library)
+void vmprof_set_tramp_range(void* start, void* end);
+
+#endif
diff --git a/pypy/module/_vmprof/test/__init__.py b/pypy/module/_vmprof/test/__init__.py
new file mode 100644
diff --git a/pypy/module/_vmprof/test/test__vmprof.py b/pypy/module/_vmprof/test/test__vmprof.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/test/test__vmprof.py
@@ -0,0 +1,55 @@
+
+import tempfile
+from pypy.tool.pytest.objspace import gettestobjspace
+
+class AppTestVMProf(object):
+    def setup_class(cls):
+        cls.space = gettestobjspace(usemodules=['_vmprof', 'struct'])
+        cls.tmpfile = tempfile.NamedTemporaryFile()
+        cls.w_tmpfileno = cls.space.wrap(cls.tmpfile.fileno())
+        cls.w_tmpfilename = cls.space.wrap(cls.tmpfile.name)
+        cls.tmpfile2 = tempfile.NamedTemporaryFile()
+        cls.w_tmpfileno2 = cls.space.wrap(cls.tmpfile2.fileno())
+        cls.w_tmpfilename2 = cls.space.wrap(cls.tmpfile2.name)
+
+    def test_import_vmprof(self):
+        import struct, sys
+
+        WORD = struct.calcsize('l')
+        
+        def count(s):
+            i = 0
+            count = 0
+            i += 5 * WORD # header
+            while i < len(s):
+                assert s[i] == '\x02'
+                i += 1
+                _, size = struct.unpack("ll", s[i:i + 2 * WORD])
+                count += 1
+                i += 2 * WORD + size
+            return count
+        
+        import _vmprof
+        _vmprof.enable(self.tmpfileno)
+        _vmprof.disable()
+        s = open(self.tmpfilename).read()
+        no_of_codes = count(s)
+        assert no_of_codes > 10
+        d = {}
+
+        exec """def foo():
+            pass
+        """ in d
+
+        _vmprof.enable(self.tmpfileno2)
+
+        exec """def foo2():
+            pass
+        """ in d
+
+        _vmprof.disable()
+        s = open(self.tmpfilename2).read()
+        no_of_codes2 = count(s)
+        assert "py:foo:" in s
+        assert "py:foo2:" in s
+        assert no_of_codes2 >= no_of_codes + 2 # some extra codes from tests
diff --git a/pypy/module/pypyjit/interp_jit.py b/pypy/module/pypyjit/interp_jit.py
--- a/pypy/module/pypyjit/interp_jit.py
+++ b/pypy/module/pypyjit/interp_jit.py
@@ -35,6 +35,9 @@
     name = opcode_method_names[ord(bytecode.co_code[next_instr])]
     return '%s #%d %s' % (bytecode.get_repr(), next_instr, name)
 
+def get_unique_id(next_instr, is_being_profiled, bytecode):
+    return bytecode._unique_id
+
 
 def should_unroll_one_iteration(next_instr, is_being_profiled, bytecode):
     return (bytecode.co_flags & CO_GENERATOR) != 0
@@ -45,6 +48,7 @@
     virtualizables = ['frame']
 
 pypyjitdriver = PyPyJitDriver(get_printable_location = get_printable_location,
+                              get_unique_id = get_unique_id,
                               should_unroll_one_iteration =
                               should_unroll_one_iteration,
                               name='pypyjit')
diff --git a/rpython/jit/backend/arm/assembler.py b/rpython/jit/backend/arm/assembler.py
--- a/rpython/jit/backend/arm/assembler.py
+++ b/rpython/jit/backend/arm/assembler.py
@@ -57,6 +57,7 @@
         BaseAssembler.setup_once(self)
 
     def setup(self, looptoken):
+        BaseAssembler.setup(self, looptoken)
         assert self.memcpy_addr != 0, 'setup_once() not called?'
         if we_are_translated():
             self.debug = False
@@ -71,7 +72,6 @@
         self.mc.datablockwrapper = self.datablockwrapper
         self.target_tokens_currently_compiling = {}
         self.frame_depth_to_patch = []
-        self._finish_gcmap = lltype.nullptr(jitframe.GCMAP)
 
     def teardown(self):
         self.current_clt = None
@@ -659,6 +659,7 @@
             assert len(set(inputargs)) == len(inputargs)
 
         self.setup(original_loop_token)
+        self.codemap.inherit_code_from_position(faildescr.adr_jump_offset)
         descr_number = compute_unique_id(faildescr)
         if log:
             operations = self._inject_debugging_code(faildescr, operations,
@@ -879,8 +880,12 @@
         self.datablockwrapper.done()      # finish using cpu.asmmemmgr
         self.datablockwrapper = None
         allblocks = self.get_asmmemmgr_blocks(looptoken)
-        return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
+        size = self.mc.get_relative_pos() 
+        res = self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                    self.cpu.gc_ll_descr.gcrootmap)
+        self.cpu.asmmemmgr.register_codemap(
+            self.codemap.get_final_bytecode(res, size))
+        return res
 
     def update_frame_depth(self, frame_depth):
         baseofs = self.cpu.get_baseofs_of_frame_field()
diff --git a/rpython/jit/backend/arm/regalloc.py b/rpython/jit/backend/arm/regalloc.py
--- a/rpython/jit/backend/arm/regalloc.py
+++ b/rpython/jit/backend/arm/regalloc.py
@@ -392,6 +392,9 @@
         else:
             self.rm._sync_var(v)
 
+    def prepare_op_debug_merge_point(self, op, fcond):
+        self.assembler.codemap.debug_merge_point(op)
+
     def _prepare_op_int_add(self, op, fcond):
         boxes = op.getarglist()
         a0, a1 = boxes
diff --git a/rpython/jit/backend/llsupport/asmmemmgr.py b/rpython/jit/backend/llsupport/asmmemmgr.py
--- a/rpython/jit/backend/llsupport/asmmemmgr.py
+++ b/rpython/jit/backend/llsupport/asmmemmgr.py
@@ -4,7 +4,10 @@
 from rpython.rlib import rmmap
 from rpython.rlib.debug import debug_start, debug_print, debug_stop
 from rpython.rlib.debug import have_debug_prints
-from rpython.rtyper.lltypesystem import lltype, llmemory, rffi
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.rbisect import bisect, bisect_tuple
+
+_memmngr = None # global reference so we can use @entrypoint :/
 
 
 class AsmMemoryManager(object):
@@ -24,6 +27,12 @@
         self.free_blocks = {}      # map {start: stop}
         self.free_blocks_end = {}  # map {stop: start}
         self.blocks_by_size = [[] for i in range(self.num_indices)]
+        # two lists of jit addresses (sorted) and the corresponding stack
+        # depths
+        self.jit_addr_map = []
+        self.jit_frame_depth_map = []
+        self.jit_codemap = []
+        # see codemap.py
 
     def malloc(self, minsize, maxsize):
         """Allocate executable memory, between minsize and maxsize bytes,
@@ -45,6 +54,13 @@
         if r_uint is not None:
             self.total_mallocs -= r_uint(stop - start)
         self._add_free_block(start, stop)
+        # fix up jit_addr_map
+        jit_adr_start = bisect(self.jit_addr_map, start)
+        jit_adr_stop = bisect(self.jit_addr_map, stop)
+        self.jit_addr_map = (self.jit_addr_map[:jit_adr_start] +
+                             self.jit_addr_map[jit_adr_stop:])
+        self.jit_frame_depth_map = (self.jit_frame_depth_map[:jit_adr_start] +
+                                    self.jit_frame_depth_map[jit_adr_stop:])
 
     def open_malloc(self, minsize):
         """Allocate at least minsize bytes.  Returns (start, stop)."""
@@ -151,6 +167,35 @@
         del self.free_blocks_end[stop]
         return (start, stop)
 
+    def register_frame_depth_map(self, rawstart, frame_positions,
+                                 frame_assignments):
+        if not frame_positions:
+            return
+        if not self.jit_addr_map or rawstart > self.jit_addr_map[-1]:
+            start = len(self.jit_addr_map)
+            self.jit_addr_map += [0] * len(frame_positions)
+            self.jit_frame_depth_map += [0] * len(frame_positions)
+        else:
+            start = bisect(self.jit_addr_map, rawstart)
+            self.jit_addr_map = (self.jit_addr_map[:start] +
+                                 [0] * len(frame_positions) +
+                                 self.jit_addr_map[start:])
+            self.jit_frame_depth_map = (self.jit_frame_depth_map[:start] +
+                                 [0] * len(frame_positions) +
+                                 self.jit_frame_depth_map[start:])
+        for i, pos in enumerate(frame_positions):
+            self.jit_addr_map[i + start] = pos + rawstart
+            self.jit_frame_depth_map[i + start] = frame_assignments[i]
+
+    def register_codemap(self, codemap):
+        start = codemap[0]
+        pos = bisect_tuple(self.jit_codemap, start)
+        if pos == len(self.jit_codemap): # common case
+            self.jit_codemap.append(codemap)
+        else:
+            self.jit_codemap = (self.jit_codemap[:pos] + [codemap] +
+                                self.jit_codemap[pos:])
+
     def _delete(self):
         "NOT_RPYTHON"
         if self._allocated:
@@ -212,6 +257,9 @@
 
     gcroot_markers = None
 
+    frame_positions = None
+    frame_assignments = None
+
     def __init__(self, translated=None):
         if translated is None:
             translated = we_are_translated()
@@ -316,6 +364,10 @@
             assert gcrootmap is not None
             for pos, mark in self.gcroot_markers:
                 gcrootmap.register_asm_addr(rawstart + pos, mark)
+        asmmemmgr.register_frame_depth_map(rawstart, self.frame_positions,
+                                           self.frame_assignments)
+        self.frame_positions = None
+        self.frame_assignments = None
         return rawstart
 
     def _become_a_plain_block_builder(self):
diff --git a/rpython/jit/backend/llsupport/assembler.py b/rpython/jit/backend/llsupport/assembler.py
--- a/rpython/jit/backend/llsupport/assembler.py
+++ b/rpython/jit/backend/llsupport/assembler.py
@@ -1,6 +1,7 @@
 from rpython.jit.backend.llsupport import jitframe
 from rpython.jit.backend.llsupport.memcpy import memcpy_fn, memset_fn
 from rpython.jit.backend.llsupport.symbolic import WORD
+from rpython.jit.backend.llsupport.codemap import CodemapBuilder
 from rpython.jit.metainterp.history import (INT, REF, FLOAT, JitCellToken,
     ConstInt, BoxInt, AbstractFailDescr)
 from rpython.jit.metainterp.resoperation import ResOperation, rop
@@ -128,6 +129,10 @@
                                               track_allocation=False)
         self.gcmap_for_finish[0] = r_uint(1)
 
+    def setup(self, looptoken):
+        self.codemap = CodemapBuilder()
+        self._finish_gcmap = lltype.nullptr(jitframe.GCMAP)
+
     def set_debug(self, v):
         r = self._debug
         self._debug = v
@@ -194,6 +199,9 @@
         guardtok.faildescr.rd_locs = positions
         return fail_descr, target
 
+    def debug_merge_point(self, op):
+        self.codemap.debug_merge_point(op, self.mc.get_relative_pos())
+
     def call_assembler(self, op, guard_op, argloc, vloc, result_loc, tmploc):
         self._store_force_index(guard_op)
         descr = op.getdescr()
@@ -276,6 +284,9 @@
         # YYY very minor leak -- we need the counters to stay alive
         # forever, just because we want to report them at the end
         # of the process
+
+        # XXX the numbers here are ALMOST unique, but not quite, use a counter
+        #     or something
         struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                                track_allocation=False)
         struct.i = 0
diff --git a/rpython/jit/backend/llsupport/codemap.py b/rpython/jit/backend/llsupport/codemap.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/llsupport/codemap.py
@@ -0,0 +1,142 @@
+
+""" Bytecode for storage in asmmemmgr.jit_codemap. Format is as follows:
+
+ a list of tuples of shape (addr, machine code size, bytecode info),
+ where the bytecode info is a string made up of:
+    8 bytes unique_id, 4 bytes start_addr (relative), 4 bytes size (relative),
+    2 bytes telling how many items to skip to reach the next entry on the
+    same level
+    [so far this is represented by a plain list of integers, for simplicity]
+
+"""
+
+from rpython.rlib import rgc
+from rpython.rlib.entrypoint import jit_entrypoint
+from rpython.jit.backend.llsupport import asmmemmgr
+from rpython.rlib.rbisect import bisect, bisect_tuple
+from rpython.rtyper.lltypesystem import lltype, rffi
+
+@jit_entrypoint([lltype.Signed], lltype.Signed,
+                c_name='pypy_jit_stack_depth_at_loc')
+@rgc.no_collect
+def stack_depth_at_loc(loc):
+    _memmngr = asmmemmgr._memmngr
+
+    pos = bisect(_memmngr.jit_addr_map, loc)
+    if pos == 0 or pos == len(_memmngr.jit_addr_map):
+        return -1
+    return _memmngr.jit_frame_depth_map[pos-1]
+
+@jit_entrypoint([], lltype.Signed, c_name='pypy_jit_start_addr')
+def jit_start_addr():
+    _memmngr = asmmemmgr._memmngr
+
+    return _memmngr.jit_addr_map[0]
+
+@jit_entrypoint([], lltype.Signed, c_name='pypy_jit_end_addr')
+def jit_end_addr():
+    _memmngr = asmmemmgr._memmngr
+
+    return _memmngr.jit_addr_map[-1]
+
+@jit_entrypoint([lltype.Signed], lltype.Signed,
+                c_name='pypy_find_codemap_at_addr')
+def find_codemap_at_addr(addr):
+    _memmngr = asmmemmgr._memmngr
+
+    res = bisect_tuple(_memmngr.jit_codemap, addr) - 1
+    if res < 0:
+        return -1
+    return res
+
+@jit_entrypoint([lltype.Signed, lltype.Signed,
+                 rffi.CArrayPtr(lltype.Signed)], lltype.Signed,
+                 c_name='pypy_yield_codemap_at_addr')
+def yield_bytecode_at_addr(codemap_no, addr, current_pos_addr):
+    """ will return consecutive unique_ids from codemap, starting from position
+    `pos` until addr
+    """
+    _memmngr = asmmemmgr._memmngr
+
+    codemap = _memmngr.jit_codemap[codemap_no]
+    current_pos = current_pos_addr[0]
+    start_addr = codemap[0]
+    rel_addr = addr - start_addr
+    while True:
+        if current_pos >= len(codemap[2]):
+            return 0
+        next_start = codemap[2][current_pos + 1]
+        if next_start > rel_addr:
+            return 0
+        next_stop = codemap[2][current_pos + 2]
+        if next_stop > rel_addr:
+            current_pos_addr[0] = current_pos + 4
+            return codemap[2][current_pos]
+        # we need to skip potentially more than one
+        current_pos = codemap[2][current_pos + 3]
+
+def unpack_traceback(addr):
+    codemap_pos = find_codemap_at_addr(addr)
+    assert codemap_pos >= 0
+    storage = lltype.malloc(rffi.CArray(lltype.Signed), 1, flavor='raw')
+    storage[0] = 0
+    res = []
+    while True:
+        item = yield_bytecode_at_addr(codemap_pos, addr, storage)
+        if item == 0:
+            break
+        res.append(item)
+    lltype.free(storage, flavor='raw')
+    return res
+
+
+class CodemapBuilder(object):
+    def __init__(self):
+        self.l = []
+        self.patch_position = []
+        self.last_call_depth = -1
+
+    def debug_merge_point(self, op, pos):
+        call_depth = op.getarg(1).getint()
+        if call_depth != self.last_call_depth:
+            unique_id = op.getarg(3).getint()
+            if unique_id == 0: # uninteresting case
+                return
+            assert unique_id & 1 == 0
+            if call_depth > self.last_call_depth:
+                self.l.append(unique_id)
+                self.l.append(pos) # <- this is a relative pos
+                self.patch_position.append(len(self.l))
+                self.l.append(0) # marker
+                self.l.append(0) # second marker
+            else:
+                for i in range(self.last_call_depth - call_depth):
+                    to_patch = self.patch_position.pop()
+                    self.l[to_patch] = pos
+                    self.l[to_patch + 1] = len(self.l)
+            self.last_call_depth = call_depth
+
+    def inherit_code_from_position(self, pos):
+        lst = unpack_traceback(pos)
+        self.last_call_depth = len(lst) - 1
+        for item in lst:
+            self.l.append(item)
+            self.l.append(0)
+            self.patch_position.append(len(self.l))
+            self.l.append(0) # marker
+            self.l.append(0) # second marker
+
+    def get_final_bytecode(self, addr, size):
+        while self.patch_position:
+            pos = self.patch_position.pop()
+            self.l[pos] = size
+            self.l[pos + 1] = len(self.l)
+        # at the end there should be no zeros
+        for i in range(len(self.l) / 4):
+            item = self.l[i * 4] # unique_id
+            assert item > 0 # no zeros here
+            item = self.l[i * 4 + 2] # end in asm
+            assert item > 0
+            item = self.l[i * 4 + 3] # end in l
+            assert item > 0
+        return (addr, size, self.l) # XXX compact self.l
+
diff --git a/rpython/jit/backend/llsupport/llmodel.py b/rpython/jit/backend/llsupport/llmodel.py
--- a/rpython/jit/backend/llsupport/llmodel.py
+++ b/rpython/jit/backend/llsupport/llmodel.py
@@ -16,7 +16,7 @@
     FieldDescr, ArrayDescr, CallDescr, InteriorFieldDescr,
     FLAG_POINTER, FLAG_FLOAT)
 from rpython.jit.backend.llsupport.memcpy import memset_fn
-from rpython.jit.backend.llsupport.asmmemmgr import AsmMemoryManager
+from rpython.jit.backend.llsupport import asmmemmgr
 from rpython.rlib.unroll import unrolling_iterable
 
 
@@ -48,7 +48,8 @@
             self._setup_exception_handling_translated()
         else:
             self._setup_exception_handling_untranslated()
-        self.asmmemmgr = AsmMemoryManager()
+        self.asmmemmgr = asmmemmgr.AsmMemoryManager()
+        asmmemmgr._memmngr = self.asmmemmgr
         self._setup_frame_realloc(translate_support_code)
         ad = self.gc_ll_descr.getframedescrs(self).arraydescr
         self.signedarraydescr = ad
diff --git a/rpython/jit/backend/llsupport/rewrite.py b/rpython/jit/backend/llsupport/rewrite.py
--- a/rpython/jit/backend/llsupport/rewrite.py
+++ b/rpython/jit/backend/llsupport/rewrite.py
@@ -17,8 +17,6 @@
 class GcRewriterAssembler(object):
     """ This class performs the following rewrites on the list of operations:
 
-     - Remove the DEBUG_MERGE_POINTs.
-
      - Turn all NEW_xxx to either a CALL_MALLOC_GC, or a CALL_MALLOC_NURSERY
        followed by SETFIELDs in order to initialize their GC fields.  The
        two advantages of CALL_MALLOC_NURSERY is that it inlines the common
@@ -60,8 +58,6 @@
         #
         for i in range(len(operations)):
             op = operations[i]
-            if op.getopnum() == rop.DEBUG_MERGE_POINT:
-                continue
             # ---------- turn NEWxxx into CALL_MALLOC_xxx ----------
             if op.is_malloc():
                 self.handle_malloc_operation(op)
diff --git a/rpython/jit/backend/llsupport/test/test_asmmemmgr.py b/rpython/jit/backend/llsupport/test/test_asmmemmgr.py
--- a/rpython/jit/backend/llsupport/test/test_asmmemmgr.py
+++ b/rpython/jit/backend/llsupport/test/test_asmmemmgr.py
@@ -2,6 +2,7 @@
 from rpython.jit.backend.llsupport.asmmemmgr import AsmMemoryManager
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.jit.backend.llsupport.asmmemmgr import BlockBuilderMixin
+from rpython.jit.backend.llsupport import asmmemmgr
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.rlib import debug
 
@@ -157,6 +158,7 @@
         class FakeGcRootMap:
             def register_asm_addr(self, retaddr, mark):
                 puts.append((retaddr, mark))
+
         #
         mc = BlockBuilderMixin()
         mc.writechar('X')
@@ -266,3 +268,16 @@
     md.done()
     assert allblocks == [(1597, 1697), (1797, 1835)]
     assert ops == [('free', 1835, 1897)]
+
+def test_find_jit_frame_depth():
+    mgr = AsmMemoryManager()
+    mgr.register_frame_depth_map(11, [0, 5, 10], [1, 2, 3])
+    mgr.register_frame_depth_map(30, [0, 5, 10], [4, 5, 6])
+    mgr.register_frame_depth_map(0, [0, 5, 10], [7, 8, 9])
+    asmmemmgr._memmngr = mgr
+    assert asmmemmgr.stack_depth_at_loc(13) == 1
+    assert asmmemmgr.stack_depth_at_loc(-3) == -1
+    assert asmmemmgr.stack_depth_at_loc(41) == -1
+    assert asmmemmgr.stack_depth_at_loc(5) == 8
+    assert asmmemmgr.stack_depth_at_loc(17) == 2
+    assert asmmemmgr.stack_depth_at_loc(38) == 5
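To make the expected values concrete: stack_depth_at_loc(13) finds the block
registered at rawstart 11, computes the relative offset 2, and takes the depth
attached to the largest registered position not exceeding it (position 0, depth 1).
A self-contained sketch of that lookup, under the assumption that a block extends
to its last registered position (the real implementation presumably knows the
true block size):

    import bisect

    # 'blocks' mirrors the register_frame_depth_map() calls above:
    # (rawstart, positions, depths) tuples, kept sorted by rawstart.
    def frame_depth_at(blocks, loc):
        starts = [b[0] for b in blocks]
        i = bisect.bisect_right(starts, loc) - 1
        if i < 0:
            return -1                  # below the first block
        rawstart, positions, depths = blocks[i]
        ofs = loc - rawstart
        if ofs > positions[-1]:
            return -1                  # past the block's registered range
        return depths[bisect.bisect_right(positions, ofs) - 1]

    blocks = [(0, [0, 5, 10], [7, 8, 9]),
              (11, [0, 5, 10], [1, 2, 3]),
              (30, [0, 5, 10], [4, 5, 6])]
    assert frame_depth_at(blocks, 13) == 1
    assert frame_depth_at(blocks, 5) == 8
    assert frame_depth_at(blocks, 38) == 5
    assert frame_depth_at(blocks, -3) == frame_depth_at(blocks, 41) == -1
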
diff --git a/rpython/jit/backend/llsupport/test/ztranslation_test.py b/rpython/jit/backend/llsupport/test/ztranslation_test.py
--- a/rpython/jit/backend/llsupport/test/ztranslation_test.py
+++ b/rpython/jit/backend/llsupport/test/ztranslation_test.py
@@ -2,7 +2,7 @@
 from rpython.tool.udir import udir
 from rpython.rlib.jit import JitDriver, unroll_parameters, set_param
 from rpython.rlib.jit import PARAMETERS, dont_look_inside
-from rpython.rlib.jit import promote
+from rpython.rlib.jit import promote, _get_virtualizable_token
 from rpython.rlib import jit_hooks, rposix
 from rpython.rlib.objectmodel import keepalive_until_here
 from rpython.rlib.rthread import ThreadLocalReference, ThreadLocalField
@@ -28,12 +28,15 @@
         # - floats neg and abs
         # - llexternal with macro=True
 
-        class Frame(object):
+        class BasicFrame(object):
             _virtualizable_ = ['i']
 
             def __init__(self, i):
                 self.i = i
 
+        class Frame(BasicFrame):
+            pass
+
         eci = ExternalCompilationInfo(post_include_bits=['''
 #define pypy_my_fabs(x)  fabs(x)
 '''])
@@ -59,6 +62,7 @@
             while frame.i > 3:
                 jitdriver.can_enter_jit(frame=frame, total=total, j=j)
                 jitdriver.jit_merge_point(frame=frame, total=total, j=j)
+                _get_virtualizable_token(frame)
                 total += frame.i
                 if frame.i >= 20:
                     frame.i -= 2
diff --git a/rpython/jit/backend/x86/arch.py b/rpython/jit/backend/x86/arch.py
--- a/rpython/jit/backend/x86/arch.py
+++ b/rpython/jit/backend/x86/arch.py
@@ -51,3 +51,6 @@
     THREADLOCAL_OFS = (FRAME_FIXED_SIZE - 1) * WORD
 
 assert PASS_ON_MY_FRAME >= 12       # asmgcc needs at least JIT_USE_WORDS + 3
+
+# return address, followed by FRAME_FIXED_SIZE words
+DEFAULT_FRAME_BYTES = (1 + FRAME_FIXED_SIZE) * WORD
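DEFAULT_FRAME_BYTES is the frame size every freshly generated piece of machine
code starts from: the return address pushed by CALL plus the FRAME_FIXED_SIZE
words the prologue reserves. For illustration only, with assumed values (the
real constants live earlier in arch.py):

    WORD = 8                    # x86-64; assumption for the example
    FRAME_FIXED_SIZE = 19       # hypothetical value
    DEFAULT_FRAME_BYTES = (1 + FRAME_FIXED_SIZE) * WORD   # -> 160 bytes

This is the value regalloc.py below asserts mc._frame_size equals at every
operation boundary, and the value the assembler forces wherever control flow
re-enters straight-line code.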
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -18,7 +18,8 @@
 from rpython.jit.backend.llsupport.regalloc import (get_scale, valid_addressing_size)
 from rpython.jit.backend.x86.arch import (FRAME_FIXED_SIZE, WORD, IS_X86_64,
                                        JITFRAME_FIXED_SIZE, IS_X86_32,
-                                       PASS_ON_MY_FRAME, THREADLOCAL_OFS)
+                                       PASS_ON_MY_FRAME, THREADLOCAL_OFS,
+                                       DEFAULT_FRAME_BYTES)
 from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx, esp, ebp, esi,
     xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, r8, r9, r10, r11, edi,
     r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
@@ -66,6 +67,7 @@
             self._build_float_constants()
 
     def setup(self, looptoken):
+        BaseAssembler.setup(self, looptoken)
         assert self.memcpy_addr != 0, "setup_once() not called?"
         self.current_clt = looptoken.compiled_loop_token
         self.pending_guard_tokens = []
@@ -80,7 +82,6 @@
                                                         allblocks)
         self.target_tokens_currently_compiling = {}
         self.frame_depth_to_patch = []
-        self._finish_gcmap = lltype.nullptr(jitframe.GCMAP)
 
     def teardown(self):
         self.pending_guard_tokens = None
@@ -267,6 +268,10 @@
         # the correct "ret" arg
         offset = mc.get_relative_pos() - jz_location
         mc.overwrite32(jz_location-4, offset)
+        # From now on this function is basically "merged" with
+        # its caller and so contains DEFAULT_FRAME_BYTES bytes
+        # plus my own return address, which we'll ignore next
+        mc.force_frame_size(DEFAULT_FRAME_BYTES + WORD)
         mc.ADD_ri(esp.value, WORD)
         mc.JMP(imm(self.propagate_exception_path))
         #
@@ -278,6 +283,7 @@
             return      # not supported (for tests, or non-translated)
         #
         self.mc = codebuf.MachineCodeBlockWrapper()
+        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
         #
         # read and reset the current exception
 
@@ -331,7 +337,10 @@
         offset = mc.get_relative_pos() - jnz_location
         assert 0 < offset <= 127
         mc.overwrite(jnz_location-1, chr(offset))
-        # adjust the esp to point back to the previous return
+        # From now on this function is basically "merged" with
+        # its caller and so contains DEFAULT_FRAME_BYTES bytes
+        # plus my own return address, which we'll ignore next
+        mc.force_frame_size(DEFAULT_FRAME_BYTES + WORD)
         mc.ADD_ri(esp.value, WORD)
         mc.JMP(imm(self.propagate_exception_path))
         #
@@ -409,6 +418,8 @@
                 mc.LEA_rs(esp.value, 2 * WORD)
             self._pop_all_regs_from_frame(mc, [], withfloats, callee_only=True)
             mc.RET16_i(WORD)
+            # Note that wb_slowpath[0..3] end with a RET16_i, which must be
+            # taken care of in the caller by stack_frame_size_delta(-WORD)
         else:
             if IS_X86_32:
                 mc.MOV_rs(edx.value, 4 * WORD)
@@ -514,6 +525,8 @@
             assert len(set(inputargs)) == len(inputargs)
 
         self.setup(original_loop_token)
+        self.codemap.inherit_code_from_position(faildescr.adr_jump_offset)
+        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
         descr_number = compute_unique_id(faildescr)
         if log:
             operations = self._inject_debugging_code(faildescr, operations,
@@ -674,8 +687,12 @@
         self.datablockwrapper.done()      # finish using cpu.asmmemmgr
         self.datablockwrapper = None
         allblocks = self.get_asmmemmgr_blocks(looptoken)
-        return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
-                                   self.cpu.gc_ll_descr.gcrootmap)
+        size = self.mc.get_relative_pos()
+        res = self.mc.materialize(self.cpu.asmmemmgr, allblocks,
+                                  self.cpu.gc_ll_descr.gcrootmap)
+        self.cpu.asmmemmgr.register_codemap(
+            self.codemap.get_final_bytecode(res, size))
+        return res
 
     def patch_jump_for_descr(self, faildescr, adr_new_target):
         adr_jump_offset = faildescr.adr_jump_offset
@@ -686,6 +703,7 @@
         # place, but clobber the recovery stub with a jump to the real
         # target.
         mc = codebuf.MachineCodeBlockWrapper()
+        mc.force_frame_size(DEFAULT_FRAME_BYTES)
         if rx86.fits_in_32bits(offset):
             mc.writeimm32(offset)
             mc.copy_to_raw_memory(adr_jump_offset)
@@ -1756,6 +1774,7 @@
 
     def generate_propagate_error_64(self):
         assert WORD == 8
+        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
         startpos = self.mc.get_relative_pos()
         self.mc.JMP(imm(self.propagate_exception_path))
         return startpos
@@ -1763,6 +1782,7 @@
     def generate_quick_failure(self, guardtok):
         """ Gather information about failure
         """
+        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
         startpos = self.mc.get_relative_pos()
         fail_descr, target = self.store_info_on_descr(startpos, guardtok)
         self.mc.PUSH(imm(fail_descr))
@@ -1838,6 +1858,9 @@
 
     def _build_failure_recovery(self, exc, withfloats=False):
         mc = codebuf.MachineCodeBlockWrapper()
+        # this is jumped to, from a stack that has DEFAULT_FRAME_BYTES
+        # followed by 2 extra words just pushed
+        mc.force_frame_size(DEFAULT_FRAME_BYTES + 2 * WORD)
         self.mc = mc
 
         self._push_all_regs_to_frame(mc, [], withfloats)
@@ -1909,6 +1932,7 @@
             self.mc.J_il(rx86.Conditions[condition], 0)
         else:
             self.mc.JMP_l(0)
+            self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
         guard_token.pos_jump_offset = self.mc.get_relative_pos() - 4
         self.pending_guard_tokens.append(guard_token)
 
@@ -2020,6 +2044,7 @@
         offset = jmp_location - je_location
         assert 0 < offset <= 127
         self.mc.overwrite(je_location - 1, chr(offset))
+        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
         #
         return jmp_location
 
@@ -2104,6 +2129,8 @@
         if is_frame and align_stack:
             mc.SUB_ri(esp.value, 16 - WORD) # erase the return address
         mc.CALL(imm(self.wb_slowpath[helper_num]))
+        if not is_frame:
+            mc.stack_frame_size_delta(-WORD)
         if is_frame and align_stack:
             mc.ADD_ri(esp.value, 16 - WORD) # erase the return address
 
@@ -2340,6 +2367,7 @@
         offset = self.mc.get_relative_pos() - jmp_adr1
         assert 0 < offset <= 127
         self.mc.overwrite(jmp_adr1-1, chr(offset))
+        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
         # write down the tid, but not if it's the result of the CALL
         self.mc.MOV(mem(eax, 0), imm(arraydescr.tid))
         # while we're at it, this line is not needed if we've done the CALL
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -58,7 +58,7 @@
             self.fnloc_is_immediate = False
             self.fnloc = None
             self.arglocs = arglocs + [fnloc]
-        self.current_esp = 0     # 0 or (usually) negative, counted in bytes
+        self.start_frame_size = self.mc._frame_size
 
     def select_call_release_gil_mode(self):
         AbstractCallBuilder.select_call_release_gil_mode(self)
@@ -70,13 +70,15 @@
     def subtract_esp_aligned(self, count):
         if count > 0:
             align = align_stack_words(count)
-            self.current_esp -= align * WORD
             self.mc.SUB_ri(esp.value, align * WORD)
 
+    def get_current_esp(self):
+        return self.start_frame_size - self.mc._frame_size
+
     def restore_stack_pointer(self, target_esp=0):
-        if self.current_esp != target_esp:
-            self.mc.ADD_ri(esp.value, target_esp - self.current_esp)
-            self.current_esp = target_esp
+        current_esp = self.get_current_esp()
+        if current_esp != target_esp:
+            self.mc.ADD_ri(esp.value, target_esp - current_esp)
 
     def load_result(self):
         """Overridden in CallBuilder32 and CallBuilder64"""
@@ -99,9 +101,10 @@
         # after the rearrangements done just before, ignoring the return
         # value eax, if necessary
         assert not self.is_call_release_gil
-        self.change_extra_stack_depth = (self.current_esp != 0)
+        current_esp = self.get_current_esp()
+        self.change_extra_stack_depth = (current_esp != 0)
         if self.change_extra_stack_depth:
-            self.asm.set_extra_stack_depth(self.mc, -self.current_esp)
+            self.asm.set_extra_stack_depth(self.mc, -current_esp)
         noregs = self.asm.cpu.gc_ll_descr.is_shadow_stack()
         gcmap = self.asm._regalloc.get_gcmap([eax], noregs=noregs)
         self.asm.push_gcmap(self.mc, gcmap, store=True)
@@ -142,7 +145,7 @@
             # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
             # total size of JIT_USE_WORDS.  This structure is found at
             # [ESP+css].
-            css = -self.current_esp + (
+            css = -self.get_current_esp() + (
                 WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS))
             assert css >= 2 * WORD
             # Save ebp
@@ -178,7 +181,7 @@
             else:
                 self.tlofs_reg = r12
             self.mc.MOV_rs(self.tlofs_reg.value,
-                           THREADLOCAL_OFS - self.current_esp)
+                           THREADLOCAL_OFS - self.get_current_esp())
             if self.asm._is_asmgcc():
                 self.mc.AND_ri(self.tlofs_reg.value, ~1)
         return self.tlofs_reg
@@ -451,7 +454,10 @@
         else:
             self.mc.CALL(self.fnloc)
             if self.callconv != FFI_DEFAULT_ABI:
-                self.current_esp += self._fix_stdcall(self.callconv)
+                # in the STDCALL ABI, the CALL above has an effect on
+                # the stack depth.  Adjust 'mc._frame_size'.
+                delta = self._fix_stdcall(self.callconv)
+                self.mc.stack_frame_size_delta(-delta)
 
     def _fix_stdcall(self, callconv):
         from rpython.rlib.clibffi import FFI_STDCALL
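The callbuilder changes above replace the hand-maintained current_esp counter
with a value derived from the code builder's own bookkeeping: since every
ESP-changing instruction now updates mc._frame_size, the call builder only needs
to remember the frame size at its start. A minimal sketch of the invariant (the
class name is illustrative):

    class CallBuilderEspSketch(object):
        def __init__(self, mc):
            self.mc = mc
            self.start_frame_size = mc._frame_size

        def get_current_esp(self):
            # 0 or (usually) negative, counted in bytes: each SUB_ri on
            # ESP grows mc._frame_size, making this value more negative
            return self.start_frame_size - self.mc._frame_size
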
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -23,6 +23,7 @@
                               codebuilder_cls):
     def __init__(self):
         self.init_block_builder()
+        codebuilder_cls.__init__(self)
         # a list of relative positions; for each position p, the bytes
         # at [p-4:p] encode an absolute address that will need to be
         # made relative.  Only works on 32-bit!
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -12,7 +12,7 @@
      valid_addressing_size)
 from rpython.jit.backend.x86 import rx86
 from rpython.jit.backend.x86.arch import (WORD, JITFRAME_FIXED_SIZE, IS_X86_32,
-    IS_X86_64)
+    IS_X86_64, DEFAULT_FRAME_BYTES)
 from rpython.jit.backend.x86.jump import remap_frame_layout_mixed
 from rpython.jit.backend.x86.regloc import (FrameLoc, RegLoc, ConstFloatLoc,
     FloatImmedLoc, ImmedLoc, imm, imm0, imm1, ecx, eax, edx, ebx, esi, edi,
@@ -314,6 +314,7 @@
         while i < len(operations):
             op = operations[i]
             self.assembler.mc.mark_op(op)
+            assert self.assembler.mc._frame_size == DEFAULT_FRAME_BYTES
             self.rm.position = i
             self.xrm.position = i
             if op.has_no_side_effect() and op.result not in self.longevity:
@@ -1323,7 +1324,7 @@
         assembler.closing_jump(self.jump_target_descr)
 
     def consider_debug_merge_point(self, op):
-        pass
+        self.assembler.debug_merge_point(op)
 
     def consider_jit_debug(self, op):
         pass
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -454,6 +454,11 @@
 class AbstractX86CodeBuilder(object):
     """Abstract base class."""
 
+    def __init__(self):
+        self.frame_positions = []
+        self.frame_assignments = []
+        self.force_frame_size(self.WORD)
+
     def writechar(self, char):
         raise NotImplementedError
 
@@ -471,6 +476,23 @@
         self.writechar(chr((imm >> 16) & 0xFF))
         self.writechar(chr((imm >> 24) & 0xFF))
 
+    def force_frame_size(self, frame_size):
+        self.frame_positions.append(self.get_relative_pos())
+        self.frame_assignments.append(frame_size)
+        self._frame_size = frame_size
+
+    def stack_frame_size_delta(self, delta):
+        "Called when we generate an instruction that changes the value of ESP"
+        self._frame_size += delta
+        self.frame_positions.append(self.get_relative_pos())
+        self.frame_assignments.append(self._frame_size)
+        assert self._frame_size >= self.WORD
+
+    def check_stack_size_at_ret(self):
+        assert self._frame_size == self.WORD
+        if not we_are_translated():
+            self._frame_size = None
+
     # ------------------------------ MOV ------------------------------
 
     MOV_ri = insn(register(1), '\xB8', immediate(2))
@@ -481,14 +503,24 @@
     INC_m = insn(rex_w, '\xFF', orbyte(0), mem_reg_plus_const(1))
     INC_j = insn(rex_w, '\xFF', orbyte(0), abs_(1))
 
-    ADD_ri,ADD_rr,ADD_rb,_,_,ADD_rm,ADD_rj,_,_ = common_modes(0)
+    AD1_ri,ADD_rr,ADD_rb,_,_,ADD_rm,ADD_rj,_,_ = common_modes(0)
     OR_ri, OR_rr, OR_rb, _,_,OR_rm, OR_rj, _,_ = common_modes(1)
     AND_ri,AND_rr,AND_rb,_,_,AND_rm,AND_rj,_,_ = common_modes(4)
-    SUB_ri,SUB_rr,SUB_rb,_,_,SUB_rm,SUB_rj,SUB_ji8,SUB_mi8 = common_modes(5)
+    SU1_ri,SUB_rr,SUB_rb,_,_,SUB_rm,SUB_rj,SUB_ji8,SUB_mi8 = common_modes(5)
     SBB_ri,SBB_rr,SBB_rb,_,_,SBB_rm,SBB_rj,_,_ = common_modes(3)
     XOR_ri,XOR_rr,XOR_rb,_,_,XOR_rm,XOR_rj,_,_ = common_modes(6)
     CMP_ri,CMP_rr,CMP_rb,CMP_bi,CMP_br,CMP_rm,CMP_rj,_,_ = common_modes(7)
 
+    def ADD_ri(self, reg, immed):
+        self.AD1_ri(reg, immed)
+        if reg == R.esp:
+            self.stack_frame_size_delta(-immed)
+
+    def SUB_ri(self, reg, immed):
+        self.SU1_ri(reg, immed)
+        if reg == R.esp:
+            self.stack_frame_size_delta(+immed)
+
     CMP_mi8 = insn(rex_w, '\x83', orbyte(7<<3), mem_reg_plus_const(1), immediate(2, 'b'))
     CMP_mi32 = insn(rex_w, '\x81', orbyte(7<<3), mem_reg_plus_const(1), immediate(2))
     CMP_mi = select_8_or_32_bit_immed(CMP_mi8, CMP_mi32)
@@ -538,30 +570,65 @@
     # ------------------------------ Misc stuff ------------------------------
 
     NOP = insn('\x90')
-    RET = insn('\xC3')
-    RET16_i = insn('\xC2', immediate(1, 'h'))
+    RE1 = insn('\xC3')
+    RE116_i = insn('\xC2', immediate(1, 'h'))
 
-    PUSH_r = insn(rex_nw, register(1), '\x50')
-    PUSH_b = insn(rex_nw, '\xFF', orbyte(6<<3), stack_bp(1))
-    PUSH_m = insn(rex_nw, '\xFF', orbyte(6<<3), mem_reg_plus_const(1))
-    PUSH_i8 = insn('\x6A', immediate(1, 'b'))
-    PUSH_i32 = insn('\x68', immediate(1, 'i'))
-    def PUSH_i(mc, immed):
+    def RET(self):
+        self.check_stack_size_at_ret()
+        self.RE1()
+
+    def RET16_i(self, immed):
+        self.check_stack_size_at_ret()
+        self.RE116_i(immed)
+
+    PUS1_r = insn(rex_nw, register(1), '\x50')
+    PUS1_b = insn(rex_nw, '\xFF', orbyte(6<<3), stack_bp(1))
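The renamings in this last hunk follow one pattern: the raw encodings stay
available under internal names (AD1_ri, SU1_ri, RE1, RE116_i, PUS1_*), while the
public ADD_ri, SUB_ri, RET, RET16_i and (presumably, past the truncation point)
PUSH wrappers additionally update the frame-size bookkeeping. The recorded
(frame_positions, frame_assignments) pairs are what ultimately lets a profiler
ask how deep the frame is at a given code offset; a hypothetical consumer:

    import bisect

    # Sketch only: given the parallel lists built by force_frame_size()
    # and stack_frame_size_delta(), return the frame size in effect at
    # code offset 'ofs' (None before the first recorded assignment).
    def frame_size_at(frame_positions, frame_assignments, ofs):
        i = bisect.bisect_right(frame_positions, ofs) - 1
        if i < 0:
            return None
        return frame_assignments[i]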

