[pypy-commit] pypy remove-list-smm-2: hg merge default

Tue May 21 15:10:14 CEST 2013

Author: Manuel Jacob
Branch: remove-list-smm-2
Changeset: r64384:b9d6357b218a
Date: 2013-05-21 14:53 +0200
http://bitbucket.org/pypy/pypy/changeset/b9d6357b218a/

Log:	hg merge default

diff --git a/lib-python/2.7/distutils/sysconfig.py b/lib-python/2.7/distutils/sysconfig.py
--- a/lib-python/2.7/distutils/sysconfig.py
+++ b/lib-python/2.7/distutils/sysconfig.py
@@ -1,30 +1,16 @@
-"""Provide access to Python's configuration information.  The specific
-configuration variables available depend heavily on the platform and
-configuration.  The values may be retrieved using
-get_config_var(name), and the list of variables is available via
-get_config_vars().keys().  Additional convenience functions are also
-available.
-
-Written by:   Fred L. Drake, Jr.
-Email:        <fdrake at acm.org>
-"""
-
-__revision__ = "$Id: sysconfig.py 85358 2010-10-10 09:54:59Z antoine.pitrou $"
-
-import sys
-
 
 # The content of this file is redirected from
 # sysconfig_cpython or sysconfig_pypy.
+# All underscore names are imported too, because
+# people like to use undocumented sysconfig._xxx
+# directly.
 
+import sys
 if '__pypy__' in sys.builtin_module_names:
-    from distutils.sysconfig_pypy import *
-    from distutils.sysconfig_pypy import _config_vars # needed by setuptools
-    from distutils.sysconfig_pypy import _variable_rx # read_setup_file()
+    from distutils import sysconfig_pypy as _sysconfig_module
 else:
-    from distutils.sysconfig_cpython import *
-    from distutils.sysconfig_cpython import _config_vars # needed by setuptools
-    from distutils.sysconfig_cpython import _variable_rx # read_setup_file()
+    from distutils import sysconfig_cpython as _sysconfig_module
+globals().update(_sysconfig_module.__dict__)
 
 _USE_CLANG = None
 
diff --git a/lib-python/2.7/distutils/sysconfig_cpython.py b/lib-python/2.7/distutils/sysconfig_cpython.py
--- a/lib-python/2.7/distutils/sysconfig_cpython.py
+++ b/lib-python/2.7/distutils/sysconfig_cpython.py
@@ -9,7 +9,7 @@
 Email:        <fdrake at acm.org>
 """
 
-__revision__ = "$Id$"
+__revision__ = "$Id: sysconfig.py 85358 2010-10-10 09:54:59Z antoine.pitrou $"
 
 import os
 import re
diff --git a/lib-python/2.7/distutils/sysconfig_pypy.py b/lib-python/2.7/distutils/sysconfig_pypy.py
--- a/lib-python/2.7/distutils/sysconfig_pypy.py
+++ b/lib-python/2.7/distutils/sysconfig_pypy.py
@@ -1,6 +1,15 @@
-"""PyPy's minimal configuration information.
+"""Provide access to Python's configuration information.
+This is actually PyPy's minimal configuration information.
+
+The specific configuration variables available depend heavily on the
+platform and configuration.  The values may be retrieved using
+get_config_var(name), and the list of variables is available via
+get_config_vars().keys().  Additional convenience functions are also
+available.
 """
 
+__revision__ = "$Id: sysconfig.py 85358 2010-10-10 09:54:59Z antoine.pitrou $"
+
 import sys
 import os
 import imp
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -363,6 +363,9 @@
     # ignore names from 'essential_modules', notably 'exceptions', which
     # may not be present in config.objspace.usemodules at all
     modules = [name for name in modules if name not in essential_modules]
+
+    if config.translation.platform == 'arm' and '_continuation' in modules:
+        modules.remove('_continuation')
     config.objspace.usemodules.suggest(**dict.fromkeys(modules, True))
 
 def enable_translationmodules(config):
diff --git a/pypy/doc/conf.py b/pypy/doc/conf.py
--- a/pypy/doc/conf.py
+++ b/pypy/doc/conf.py
@@ -47,7 +47,7 @@
 # The short X.Y version.
 version = '2.0'
 # The full version, including alpha/beta/rc tags.
-release = '2.0.1'
+release = '2.0.2'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/pypy/doc/index.rst b/pypy/doc/index.rst
--- a/pypy/doc/index.rst
+++ b/pypy/doc/index.rst
@@ -40,7 +40,7 @@
 
 * `FAQ`_: some frequently asked questions.
 
-* `Release 2.0.1`_: the latest official release
+* `Release 2.0.2`_: the latest official release
 
 * `PyPy Blog`_: news and status info about PyPy 
 
@@ -110,7 +110,7 @@
 .. _`Getting Started`: getting-started.html
 .. _`Papers`: extradoc.html
 .. _`Videos`: video-index.html
-.. _`Release 2.0.1`: http://pypy.org/download.html
+.. _`Release 2.0.2`: http://pypy.org/download.html
 .. _`speed.pypy.org`: http://speed.pypy.org
 .. _`RPython toolchain`: translation.html
 .. _`potential project ideas`: project-ideas.html
diff --git a/pypy/doc/release-2.0.2.rst b/pypy/doc/release-2.0.2.rst
new file mode 100644
--- /dev/null
+++ b/pypy/doc/release-2.0.2.rst
@@ -0,0 +1,46 @@
+=========================
+PyPy 2.0.2 - Fermi Panini
+=========================
+
+We're pleased to announce PyPy 2.0.2.  This is a stable bugfix release
+over `2.0`_ and `2.0.1`_.  You can download it here:
+
+    http://pypy.org/download.html
+
+It fixes a crash in the JIT when calling external C functions (with
+ctypes/cffi) in a multithreaded context.
+
+.. _2.0: release-2.0.0.html
+.. _2.0.1: release-2.0.1.html
+
+What is PyPy?
+=============
+
+PyPy is a very compliant Python interpreter, almost a drop-in replacement for
+CPython 2.7. It's fast (`pypy 2.0 and cpython 2.7.3`_ performance comparison)
+due to its integrated tracing JIT compiler.
+
+This release supports x86 machines running Linux 32/64, Mac OS X 64 or
+Windows 32.  Support for ARM is progressing but not bug-free yet.
+
+.. _`pypy 2.0 and cpython 2.7.3`: http://speed.pypy.org
+
+Highlights
+==========
+
+This release contains only the fix described above.  A crash (or wrong
+results) used to occur if all these conditions were true:
+
+- your program is multithreaded;
+
+- it runs on a single-core machine or a heavily-loaded multi-core one;
+
+- it uses ctypes or cffi to issue external calls to C functions.
+
+This was fixed in the branch `emit-call-x86`__ (see the example file
+``bug1.py``).
+
+.. __: https://bitbucket.org/pypy/pypy/commits/7c80121abbf4
+
+Cheers,
+arigo et al. for the PyPy team
diff --git a/pypy/module/pypyjit/test_pypy_c/bug1.py b/pypy/module/pypyjit/test_pypy_c/bug1.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/pypyjit/test_pypy_c/bug1.py
@@ -0,0 +1,57 @@
+import cffi, thread, time, sys
+
+
+ffi = cffi.FFI()
+
+ffi.cdef("""
+    long foobar(long a, long b, long c, long d, long e, long f,
+                long a2, long b2, long c2, long d2, long e2, long f2,
+                long a3, long b3, long c3, long d3, long e3, long f3,
+                long a4, long b4, long c4, long d4, long e4, long f4);
+""")
+
+lib = ffi.verify("""
+    long foobar(long a, long b, long c, long d, long e, long f,
+                long a2, long b2, long c2, long d2, long e2, long f2,
+                long a3, long b3, long c3, long d3, long e3, long f3,
+                long a4, long b4, long c4, long d4, long e4, long f4)
+    {
+        return a * 1 + b * 2 + c * 3 + d * 4 + e * 5 + f * 6 +
+               (a2 * 1 + b2 * 2 + c2 * 3 + d2 * 4 + e2 * 5 + f2 * 6) * (-3) +
+               (a3 * 1 + b3 * 2 + c3 * 3 + d3 * 4 + e3 * 5 + f3 * 6) * (-5) +
+               (a4 * 1 + b4 * 2 + c4 * 3 + d4 * 4 + e4 * 5 + f4 * 6) * (-7);
+    }
+""")
+
+
+def runme():
+    for j in range(10):
+        for i in range(10000):
+            args = [i-k for k in range(24)]
+            x = lib.foobar(*args)
+            (a,b,c,d,e,f,a2,b2,c2,d2,e2,f2,
+             a3,b3,c3,d3,e3,f3,a4,b4,c4,d4,e4,f4) = args
+            assert x == (
+                a * 1 + b * 2 + c * 3 + d * 4 + e * 5 + f * 6 +
+                (a2 * 1 + b2 * 2 + c2 * 3 + d2 * 4 + e2 * 5 + f2 * 6) * (-3) +
+                (a3 * 1 + b3 * 2 + c3 * 3 + d3 * 4 + e3 * 5 + f3 * 6) * (-5) +
+                (a4 * 1 + b4 * 2 + c4 * 3 + d4 * 4 + e4 * 5 + f4 * 6) * (-7))
+
+done = []
+
+def submain():
+    try:
+        runme()
+        err = None
+    except:
+        err = sys.exc_info()
+    done.append(err)
+
+for i in range(2):
+    thread.start_new_thread(submain, ())
+while len(done) < 2:
+    time.sleep(0.1)
+
+for err in done:
+    if err is not None:
+        raise err[0], err[1], err[2]
diff --git a/pypy/module/pypyjit/test_pypy_c/test_array.py b/pypy/module/pypyjit/test_pypy_c/test_array.py
--- a/pypy/module/pypyjit/test_pypy_c/test_array.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_array.py
@@ -105,6 +105,7 @@
         assert loop.match("""
             i10 = int_lt(i6, 1000)
             guard_true(i10, descr=...)
+            guard_not_invalidated(descr=...)
             i11 = int_lt(i6, i7)
             guard_true(i11, descr=...)
             f13 = getarrayitem_raw(i8, i6, descr=<ArrayF 8>)
@@ -141,6 +142,7 @@
         assert loop.match("""
             i10 = int_lt(i6, 1000)
             guard_true(i10, descr=...)
+            guard_not_invalidated(descr=...)
             i11 = int_lt(i6, i7)
             guard_true(i11, descr=...)
             i13 = getarrayitem_raw(i8, i6, descr=<Array. 4>)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_bug.py b/pypy/module/pypyjit/test_pypy_c/test_bug.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/pypyjit/test_pypy_c/test_bug.py
@@ -0,0 +1,14 @@
+import os, sys, py, subprocess
+
+localdir = os.path.dirname(os.path.abspath(__file__))
+
+
+def test_bug1():
+    if not sys.platform.startswith('linux'):
+        py.test.skip("linux-only test")
+
+    cmdline = ['taskset', '-c', '0',
+               sys.executable, os.path.join(localdir, 'bug1.py')]
+    popen = subprocess.Popen(cmdline)
+    err = popen.wait()
+    assert err == 0
diff --git a/pypy/module/pypyjit/test_pypy_c/test_call.py b/pypy/module/pypyjit/test_pypy_c/test_call.py
--- a/pypy/module/pypyjit/test_pypy_c/test_call.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_call.py
@@ -487,6 +487,7 @@
         assert loop.match("""
             i2 = int_lt(i0, i1)
             guard_true(i2, descr=...)
+            guard_not_invalidated(descr=...)
             i3 = force_token()
             i4 = int_add(i0, 1)
             --TICK--
@@ -586,7 +587,6 @@
         """, [1000])
         loop, = log.loops_by_id('call')
         assert loop.match_by_id('call', '''
-        guard_not_invalidated(descr=<.*>)
         i1 = force_token()
         ''')
 
diff --git a/pypy/module/pypyjit/test_pypy_c/test_containers.py b/pypy/module/pypyjit/test_pypy_c/test_containers.py
--- a/pypy/module/pypyjit/test_pypy_c/test_containers.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_containers.py
@@ -44,6 +44,7 @@
         # gc_id call is hoisted out of the loop, the id of a value obviously
         # can't change ;)
         assert loop.match_by_id("getitem", """
+            ...
             i26 = call(ConstClass(ll_dict_lookup), p18, p6, i25, descr=...)
             ...
             p33 = getinteriorfield_gc(p31, i26, descr=<InteriorFieldDescr <FieldP dictentry.value .*>>)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_misc.py b/pypy/module/pypyjit/test_pypy_c/test_misc.py
--- a/pypy/module/pypyjit/test_pypy_c/test_misc.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_misc.py
@@ -36,7 +36,7 @@
         assert loop0.match(expected)
         # XXX: The retracing fails to form a loop since j
         # becomes constant 0 after the bridge and constant 1 at the end of the
-        # loop. A bridge back to the peramble is produced instead.        
+        # loop. A bridge back to the preamble is produced instead.
         #assert loop1.match(expected)
 
     def test_factorial(self):
@@ -242,6 +242,7 @@
             i19 = int_add(i12, 1)
             setfield_gc(p9, i19, descr=<FieldS .*W_AbstractSeqIterObject.inst_index .*>)
             guard_nonnull_class(p17, ..., descr=...)
+            guard_not_invalidated(descr=...)
             i21 = getfield_gc(p17, descr=<FieldS .*W_Array.*.inst_len .*>)
             i23 = int_lt(0, i21)
             guard_true(i23, descr=...)
diff --git a/rpython/jit/backend/arm/assembler.py b/rpython/jit/backend/arm/assembler.py
--- a/rpython/jit/backend/arm/assembler.py
+++ b/rpython/jit/backend/arm/assembler.py
@@ -355,54 +355,63 @@
 
     def _push_all_regs_to_jitframe(self, mc, ignored_regs, withfloats,
                                 callee_only=False):
+        # Push general purpose registers
         base_ofs = self.cpu.get_baseofs_of_frame_field()
         if callee_only:
             regs = CoreRegisterManager.save_around_call_regs
         else:
             regs = CoreRegisterManager.all_regs
-        # XXX use STMDB ops here
-        for i, gpr in enumerate(regs):
-            if gpr in ignored_regs:
-                continue
-            self.store_reg(mc, gpr, r.fp, base_ofs + i * WORD)
+        # XXX add special case if ignored_regs are a block at the start of regs
+        if not ignored_regs:  # we want to push a contiguous block of regs
+            assert check_imm_arg(base_ofs)
+            mc.ADD_ri(r.ip.value, r.fp.value, base_ofs)
+            mc.STM(r.ip.value, [reg.value for reg in regs])
+        else:
+            for reg in ignored_regs:
+                assert not reg.is_vfp_reg()  # sanity check
+            # we can have holes in the list of regs
+            for i, gpr in enumerate(regs):
+                if gpr in ignored_regs:
+                    continue
+                self.store_reg(mc, gpr, r.fp, base_ofs + i * WORD)
+
         if withfloats:
-            if callee_only:
-                regs = VFPRegisterManager.save_around_call_regs
-            else:
-                regs = VFPRegisterManager.all_regs
-            for i, vfpr in enumerate(regs):
-                if vfpr in ignored_regs:
-                    continue
-                ofs = len(CoreRegisterManager.all_regs) * WORD
-                ofs += i * DOUBLE_WORD + base_ofs
-                self.store_reg(mc, vfpr, r.fp, ofs)
+            # Push VFP regs
+            regs = VFPRegisterManager.all_regs
+            ofs = len(CoreRegisterManager.all_regs) * WORD
+            assert check_imm_arg(ofs+base_ofs)
+            mc.ADD_ri(r.ip.value, r.fp.value, imm=ofs+base_ofs)
+            mc.VSTM(r.ip.value, [vfpr.value for vfpr in regs])
 
     def _pop_all_regs_from_jitframe(self, mc, ignored_regs, withfloats,
                                  callee_only=False):
-        # Pop all general purpose registers
+        # Pop general purpose registers
         base_ofs = self.cpu.get_baseofs_of_frame_field()
         if callee_only:
             regs = CoreRegisterManager.save_around_call_regs
         else:
             regs = CoreRegisterManager.all_regs
-        # XXX use LDMDB ops here
-        for i, gpr in enumerate(regs):
-            if gpr in ignored_regs:
-                continue
-            ofs = i * WORD + base_ofs
-            self.load_reg(mc, gpr, r.fp, ofs)
+        # XXX add special case if ignored_regs are a block at the start of regs
+        if not ignored_regs:  # we want to pop a contiguous block of regs
+            assert check_imm_arg(base_ofs)
+            mc.ADD_ri(r.ip.value, r.fp.value, base_ofs)
+            mc.LDM(r.ip.value, [reg.value for reg in regs])
+        else:
+            for reg in ignored_regs:
+                assert not reg.is_vfp_reg()  # sanity check
+            # we can have holes in the list of regs
+            for i, gpr in enumerate(regs):
+                if gpr in ignored_regs:
+                    continue
+                ofs = i * WORD + base_ofs
+                self.load_reg(mc, gpr, r.fp, ofs)
         if withfloats:
-            # Pop all XMM regs
-            if callee_only:
-                regs = VFPRegisterManager.save_around_call_regs
-            else:
-                regs = VFPRegisterManager.all_regs
-            for i, vfpr in enumerate(regs):
-                if vfpr in ignored_regs:
-                    continue
-                ofs = len(CoreRegisterManager.all_regs) * WORD
-                ofs += i * DOUBLE_WORD + base_ofs
-                self.load_reg(mc, vfpr, r.fp, ofs)
+            # Pop VFP regs
+            regs = VFPRegisterManager.all_regs
+            ofs = len(CoreRegisterManager.all_regs) * WORD
+            assert check_imm_arg(ofs+base_ofs)
+            mc.ADD_ri(r.ip.value, r.fp.value, imm=ofs+base_ofs)
+            mc.VLDM(r.ip.value, [vfpr.value for vfpr in regs])
 
     def _build_failure_recovery(self, exc, withfloats=False):
         mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
diff --git a/rpython/jit/backend/arm/test/test_push_pop_frame.py b/rpython/jit/backend/arm/test/test_push_pop_frame.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/arm/test/test_push_pop_frame.py
@@ -0,0 +1,177 @@
+import py
+from rpython.jit.backend.arm import conditions as c
+from rpython.jit.backend.arm import registers as r
+from rpython.jit.backend.arm.arch import WORD
+from rpython.jit.backend.arm.test.test_regalloc_mov import BaseMovTest, mi
+
+base_ofs = 23
+class MockCPU(object):
+    def get_baseofs_of_frame_field(self):
+        return base_ofs
+
+
+class TestRegallocPush(BaseMovTest):
+    def setup_method(self, method):
+        BaseMovTest.setup_method(self, method)
+        self.asm.cpu = MockCPU()
+
+    def test_callee_only(self):
+        expected = [
+                mi('ADD_ri', r.ip.value, r.fp.value, base_ofs),
+                mi('STM', r.ip.value, [r.r0.value, r.r1.value,
+                    r.r2.value, r.r3.value]),
+        ]
+        self.asm._push_all_regs_to_jitframe(self.asm.mc, ignored_regs=[],
+                withfloats=False, callee_only=True)
+        self.validate(expected)
+
+    def test_callee_only_with_holes(self):
+        expected = [
+                mi('STR_ri', r.r0.value, r.fp.value, cond=c.AL, imm=base_ofs),
+                mi('STR_ri', r.r2.value, r.fp.value, cond=c.AL, imm=base_ofs + 8),
+        ]
+        self.asm._push_all_regs_to_jitframe(self.asm.mc, ignored_regs=[r.r1, r.r3],
+                withfloats=False, callee_only=True)
+        self.validate(expected)
+
+    def test_callee_only_with_holes_in_front(self):
+        expected = [
+                mi('STR_ri', r.r2.value, r.fp.value, cond=c.AL, imm=base_ofs + 8),
+                mi('STR_ri', r.r3.value, r.fp.value, cond=c.AL, imm=base_ofs + 12),
+        ]
+        self.asm._push_all_regs_to_jitframe(self.asm.mc, ignored_regs=[r.r0, r.r1],
+                withfloats=False, callee_only=True)
+        self.validate(expected)
+
+    def test_callee_only_ignore_more_than_saved(self):
+        expected = [
+                mi('STR_ri', r.r0.value, r.fp.value, cond=c.AL, imm=base_ofs),
+        ]
+        self.asm._push_all_regs_to_jitframe(self.asm.mc,
+                ignored_regs=[r.r1, r.r2, r.r3, r.r4, r.r5],
+                withfloats=False, callee_only=True)
+        self.validate(expected)
+
+    def test_with_floats(self):
+        expected = [
+            mi('STR_ri', r.r0.value, r.fp.value, cond=c.AL, imm=base_ofs),
+            mi('ADD_ri', r.ip.value, r.fp.value, imm=base_ofs + len(r.all_regs) * WORD),
+            mi('VSTM', r.ip.value, [v.value for v in r.all_vfp_regs])
+        ]
+        self.asm._push_all_regs_to_jitframe(self.asm.mc,
+                ignored_regs=[r.r1, r.r2, r.r3],
+                withfloats=True, callee_only=True)
+        self.validate(expected)
+
+    def test_try_ignore_vfp_reg(self):
+        py.test.raises(AssertionError, self.asm._push_all_regs_to_jitframe, self.asm.mc,
+                ignored_regs=[r.d0, r.r2, r.r3], withfloats=True, callee_only=True)
+
+    def test_all_regs(self):
+        expected = [
+                mi('ADD_ri', r.ip.value, r.fp.value, base_ofs),
+                mi('STM', r.ip.value, [reg.value for reg in r.all_regs]),
+        ]
+        self.asm._push_all_regs_to_jitframe(self.asm.mc, ignored_regs=[],
+                withfloats=False, callee_only=False)
+        self.validate(expected)
+
+    def test_all_regs_with_holes(self):
+        ignored = [r.r1, r.r6]
+        expected = [mi('STR_ri', reg.value, r.fp.value, cond=c.AL, imm=base_ofs + reg.value * WORD)
+                                    for reg in r.all_regs if reg not in ignored]
+        self.asm._push_all_regs_to_jitframe(self.asm.mc, ignored_regs=ignored,
+                withfloats=False, callee_only=False)
+        self.validate(expected)
+
+    def test_all_regs_with_holes_in_front(self):
+        ignored = [r.r0, r.r1]
+        expected = [mi('STR_ri', reg.value, r.fp.value, cond=c.AL, imm=base_ofs + reg.value * WORD)
+                                    for reg in r.all_regs if reg not in ignored]
+        self.asm._push_all_regs_to_jitframe(self.asm.mc, ignored_regs=ignored,
+                withfloats=False, callee_only=False)
+        self.validate(expected)
+
+
+
+class TestRegallocPop(BaseMovTest):
+    def setup_method(self, method):
+        BaseMovTest.setup_method(self, method)
+        self.asm.cpu = MockCPU()
+
+    def test_callee_only(self):
+        expected = [
+                mi('ADD_ri', r.ip.value, r.fp.value, base_ofs),
+                mi('LDM', r.ip.value, [r.r0.value, r.r1.value,
+                    r.r2.value, r.r3.value]),
+        ]
+        self.asm._pop_all_regs_from_jitframe(self.asm.mc, ignored_regs=[],
+                withfloats=False, callee_only=True)
+        self.validate(expected)
+
+    def test_callee_only_with_holes(self):
+        expected = [
+                mi('LDR_ri', r.r0.value, r.fp.value, cond=c.AL, imm=base_ofs),
+                mi('LDR_ri', r.r2.value, r.fp.value, cond=c.AL, imm=base_ofs + 8),
+        ]
+        self.asm._pop_all_regs_from_jitframe(self.asm.mc, ignored_regs=[r.r1, r.r3],
+                withfloats=False, callee_only=True)
+        self.validate(expected)
+
+    def test_callee_only_with_holes_in_front(self):
+        expected = [
+                mi('LDR_ri', r.r2.value, r.fp.value, cond=c.AL, imm=base_ofs + 8),
+                mi('LDR_ri', r.r3.value, r.fp.value, cond=c.AL, imm=base_ofs + 12),
+        ]
+        self.asm._pop_all_regs_from_jitframe(self.asm.mc, ignored_regs=[r.r0, r.r1],
+                withfloats=False, callee_only=True)
+        self.validate(expected)
+
+    def test_callee_only_ignore_more_than_saved(self):
+        expected = [
+                mi('LDR_ri', r.r0.value, r.fp.value, cond=c.AL, imm=base_ofs),
+        ]
+        self.asm._pop_all_regs_from_jitframe(self.asm.mc,
+                ignored_regs=[r.r1, r.r2, r.r3, r.r4, r.r5],
+                withfloats=False, callee_only=True)
+        self.validate(expected)
+
+    def test_with_floats(self):
+        expected = [
+            mi('LDR_ri', r.r0.value, r.fp.value, cond=c.AL, imm=base_ofs),
+            mi('ADD_ri', r.ip.value, r.fp.value, imm=base_ofs + len(r.all_regs) * WORD),
+            mi('VLDM', r.ip.value, [v.value for v in r.all_vfp_regs])
+        ]
+        self.asm._pop_all_regs_from_jitframe(self.asm.mc,
+                ignored_regs=[r.r1, r.r2, r.r3],
+                withfloats=True, callee_only=True)
+        self.validate(expected)
+
+    def test_try_ignore_vfp_reg(self):
+        py.test.raises(AssertionError, self.asm._pop_all_regs_from_jitframe, self.asm.mc,
+                ignored_regs=[r.d0, r.r2, r.r3], withfloats=True, callee_only=True)
+
+    def test_all_regs(self):
+        expected = [
+                mi('ADD_ri', r.ip.value, r.fp.value, base_ofs),
+                mi('LDM', r.ip.value, [reg.value for reg in r.all_regs]),
+        ]
+        self.asm._pop_all_regs_from_jitframe(self.asm.mc, ignored_regs=[],
+                withfloats=False, callee_only=False)
+        self.validate(expected)
+
+    def test_all_regs_with_holes(self):
+        ignored = [r.r1, r.r6]
+        expected = [mi('LDR_ri', reg.value, r.fp.value, cond=c.AL, imm=base_ofs + reg.value * WORD)
+                                    for reg in r.all_regs if reg not in ignored]
+        self.asm._pop_all_regs_from_jitframe(self.asm.mc, ignored_regs=ignored,
+                withfloats=False, callee_only=False)
+        self.validate(expected)
+
+    def test_all_regs_with_holes_in_front(self):
+        ignored = [r.r0, r.r1]
+        expected = [mi('LDR_ri', reg.value, r.fp.value, cond=c.AL, imm=base_ofs + reg.value * WORD)
+                                    for reg in r.all_regs if reg not in ignored]
+        self.asm._pop_all_regs_from_jitframe(self.asm.mc, ignored_regs=ignored,
+                withfloats=False, callee_only=False)
+        self.validate(expected)
diff --git a/rpython/jit/backend/llgraph/test/test_llgraph.py b/rpython/jit/backend/llgraph/test/test_llgraph.py
--- a/rpython/jit/backend/llgraph/test/test_llgraph.py
+++ b/rpython/jit/backend/llgraph/test/test_llgraph.py
@@ -15,6 +15,9 @@
     def test_memoryerror(self):
         py.test.skip("does not make much sense on the llgraph backend")
 
+    def test_call_release_gil_variable_function_and_arguments(self):
+        py.test.skip("the arguments seem not correctly casted")
+
 
 def test_cast_adr_to_int_and_back():
     X = lltype.Struct('X', ('foo', lltype.Signed))
diff --git a/rpython/jit/backend/llsupport/assembler.py b/rpython/jit/backend/llsupport/assembler.py
--- a/rpython/jit/backend/llsupport/assembler.py
+++ b/rpython/jit/backend/llsupport/assembler.py
@@ -108,8 +108,7 @@
             self.malloc_slowpath_unicode = None
 
         self._build_stack_check_slowpath()
-        if gc_ll_descr.gcrootmap:
-            self._build_release_gil(gc_ll_descr.gcrootmap)
+        self._build_release_gil(gc_ll_descr.gcrootmap)
         if not self._debug:
             # if self._debug is already set it means that someone called
             # set_debug by hand before initializing the assembler. Leave it
@@ -348,12 +347,19 @@
         if after:
             after()
 
+    @staticmethod
+    def _no_op():
+        pass
+
     _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
     _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                   lltype.Void))
 
     def _build_release_gil(self, gcrootmap):
-        if gcrootmap.is_shadow_stack:
+        if gcrootmap is None:
+            releasegil_func = llhelper(self._NOARG_FUNC, self._no_op)
+            reacqgil_func = llhelper(self._NOARG_FUNC, self._no_op)
+        elif gcrootmap.is_shadow_stack:
             releasegil_func = llhelper(self._NOARG_FUNC,
                                        self._release_gil_shadowstack)
             reacqgil_func = llhelper(self._NOARG_FUNC,
diff --git a/rpython/jit/backend/test/runner_test.py b/rpython/jit/backend/test/runner_test.py
--- a/rpython/jit/backend/test/runner_test.py
+++ b/rpython/jit/backend/test/runner_test.py
@@ -2532,6 +2532,219 @@
         assert rffi.charp2strn(buffer, buflen) == cwd
         lltype.free(buffer, flavor='raw')
 
+    def test_call_release_gil_return_types(self):
+        from rpython.rlib.libffi import types
+        from rpython.rlib.rarithmetic import r_uint, r_longlong, r_ulonglong
+        from rpython.rlib.rarithmetic import r_singlefloat
+        cpu = self.cpu
+
+        for ffitype, result, TP in [
+            (types.ulong,  r_uint(sys.maxint + 10), lltype.Unsigned),
+            (types.slong,  -4321, lltype.Signed),
+            (types.uint8,  200, rffi.UCHAR),
+            (types.sint8,  -42, rffi.SIGNEDCHAR),
+            (types.uint16, 50000, rffi.USHORT),
+            (types.sint16, -20000, rffi.SHORT),
+            (types.uint32, r_uint(3000000000), rffi.UINT),
+            (types.sint32, -2000000000, rffi.INT),
+            (types.uint64, r_ulonglong(9999999999999999999),
+                                                   lltype.UnsignedLongLong),
+            (types.sint64, r_longlong(-999999999999999999),
+                                                   lltype.SignedLongLong),
+            (types.double, 12.3475226, rffi.DOUBLE),
+            (types.float,  r_singlefloat(-592.75), rffi.FLOAT),
+            ]:
+            if sys.maxint < 2**32 and TP in (lltype.SignedLongLong,
+                                             lltype.UnsignedLongLong):
+                if not cpu.supports_longlong:
+                    continue
+            if TP == rffi.DOUBLE:
+                if not cpu.supports_floats:
+                    continue
+            if TP == rffi.FLOAT:
+                if not cpu.supports_singlefloats:
+                    continue
+            #
+            result = rffi.cast(TP, result)
+            #
+            def pseudo_c_function():
+                return result
+            #
+            FPTR = self.Ptr(self.FuncType([], TP))
+            func_ptr = llhelper(FPTR, pseudo_c_function)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+            calldescr = cpu._calldescr_dynamic_for_tests([], ffitype)
+            faildescr = BasicFailDescr(1)
+            kind = types.getkind(ffitype)
+            if kind in 'uis':
+                b3 = BoxInt()
+            elif kind in 'fUI':
+                b3 = BoxFloat()
+            else:
+                assert 0, kind
+            #
+            ops = [
+                ResOperation(rop.CALL_RELEASE_GIL, [funcbox], b3,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ResOperation(rop.FINISH, [b3], None, descr=BasicFinalDescr(0))
+                ]
+            ops[1].setfailargs([])
+            looptoken = JitCellToken()
+            self.cpu.compile_loop([], ops, looptoken)
+
+            deadframe = self.cpu.execute_token(looptoken)
+            fail = self.cpu.get_latest_descr(deadframe)
+            assert fail.identifier == 0
+            if isinstance(b3, BoxInt):
+                r = self.cpu.get_int_value(deadframe, 0)
+                if isinstance(result, r_singlefloat):
+                    assert -sys.maxint-1 <= r <= 0xFFFFFFFF
+                    r, = struct.unpack("f", struct.pack("I", r & 0xFFFFFFFF))
+                    result = float(result)
+                else:
+                    r = rffi.cast(TP, r)
+                assert r == result
+            elif isinstance(b3, BoxFloat):
+                r = self.cpu.get_float_value(deadframe, 0)
+                if isinstance(result, float):
+                    r = longlong.getrealfloat(r)
+                else:
+                    r = rffi.cast(TP, r)
+                assert r == result
+
+    def test_call_release_gil_variable_function_and_arguments(self):
+        from rpython.rlib.libffi import types
+        from rpython.rlib.rarithmetic import r_uint, r_longlong, r_ulonglong
+        from rpython.rlib.rarithmetic import r_singlefloat
+
+        cpu = self.cpu
+        rnd = random.Random(525)
+
+        ALL_TYPES = [
+            (types.ulong,  lltype.Unsigned),
+            (types.slong,  lltype.Signed),
+            (types.uint8,  rffi.UCHAR),
+            (types.sint8,  rffi.SIGNEDCHAR),
+            (types.uint16, rffi.USHORT),
+            (types.sint16, rffi.SHORT),
+            (types.uint32, rffi.UINT),
+            (types.sint32, rffi.INT),
+            ]
+        if sys.maxint < 2**32 and cpu.supports_longlong:
+            ALL_TYPES += [
+                (types.uint64, lltype.UnsignedLongLong),
+                (types.sint64, lltype.SignedLongLong),
+                ] * 2
+        if cpu.supports_floats:
+            ALL_TYPES += [
+                (types.double, rffi.DOUBLE),
+                ] * 4
+        if cpu.supports_singlefloats:
+            ALL_TYPES += [
+                (types.float,  rffi.FLOAT),
+                ] * 4
+
+        for k in range(100):
+            POSSIBLE_TYPES = [rnd.choice(ALL_TYPES)
+                              for i in range(random.randrange(2, 5))]
+            load_factor = rnd.random()
+            keepalive_factor = rnd.random()
+            #
+            def pseudo_c_function(*args):
+                seen.append(list(args))
+            #
+            ffitypes = []
+            ARGTYPES = []
+            for i in range(rnd.randrange(4, 20)):
+                ffitype, TP = rnd.choice(POSSIBLE_TYPES)
+                ffitypes.append(ffitype)
+                ARGTYPES.append(TP)
+            #
+            FPTR = self.Ptr(self.FuncType(ARGTYPES, lltype.Void))
+            func_ptr = llhelper(FPTR, pseudo_c_function)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+            calldescr = cpu._calldescr_dynamic_for_tests(ffitypes, types.void)
+            faildescr = BasicFailDescr(1)
+            #
+            argboxes = [BoxInt()]   # for the function to call
+            codes = ['X']
+            for ffitype in ffitypes:
+                kind = types.getkind(ffitype)
+                codes.append(kind)
+                if kind in 'uis':
+                    b1 = BoxInt()
+                elif kind in 'fUI':
+                    b1 = BoxFloat()
+                else:
+                    assert 0, kind
+                argboxes.append(b1)
+            codes = ''.join(codes)     # useful for pdb
+            print
+            print codes
+            #
+            argvalues = [funcbox.getint()]
+            for TP in ARGTYPES:
+                r = (rnd.random() - 0.5) * 999999999999.9
+                r = rffi.cast(TP, r)
+                argvalues.append(r)
+            #
+            argvalues_normal = argvalues[:1]
+            for ffitype, r in zip(ffitypes, argvalues[1:]):
+                kind = types.getkind(ffitype)
+                if kind in 'ui':
+                    r = rffi.cast(lltype.Signed, r)
+                elif kind in 's':
+                    r, = struct.unpack("i", struct.pack("f", float(r)))
+                elif kind in 'f':
+                    r = longlong.getfloatstorage(r)
+                elif kind in 'UI':   # 32-bit only
+                    r = rffi.cast(lltype.SignedLongLong, r)
+                else:
+                    assert 0
+                argvalues_normal.append(r)
+            #
+            ops = []
+            loadcodes = []
+            insideboxes = []
+            for b1 in argboxes:
+                load = rnd.random() < load_factor
+                loadcodes.append(' ^'[load])
+                if load:
+                    b2 = b1.clonebox()
+                    ops.insert(rnd.randrange(0, len(ops)+1),
+                               ResOperation(rop.SAME_AS, [b1], b2))
+                    b1 = b2
+                insideboxes.append(b1)
+            loadcodes = ''.join(loadcodes)
+            print loadcodes
+            ops += [
+                ResOperation(rop.CALL_RELEASE_GIL, insideboxes, None,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ResOperation(rop.FINISH, [], None, descr=BasicFinalDescr(0))
+                ]
+            ops[-2].setfailargs([])
+            # keep alive a random subset of the insideboxes
+            for b1 in insideboxes:
+                if rnd.random() < keepalive_factor:
+                    ops.insert(-1, ResOperation(rop.SAME_AS, [b1],
+                                                b1.clonebox()))
+            looptoken = JitCellToken()
+            self.cpu.compile_loop(argboxes, ops, looptoken)
+            #
+            seen = []
+            deadframe = self.cpu.execute_token(looptoken, *argvalues_normal)
+            fail = self.cpu.get_latest_descr(deadframe)
+            assert fail.identifier == 0
+            expected = argvalues[1:]
+            [got] = seen
+            different_values = ['%r != %r' % (a, b)
+                                    for a, b in zip(got, expected)
+                                        if a != b]
+            assert got == expected, ', '.join(different_values)
+
+
     def test_guard_not_invalidated(self):
         cpu = self.cpu
         i0 = BoxInt()
diff --git a/rpython/jit/backend/x86/arch.py b/rpython/jit/backend/x86/arch.py
--- a/rpython/jit/backend/x86/arch.py
+++ b/rpython/jit/backend/x86/arch.py
@@ -40,4 +40,4 @@
     PASS_ON_MY_FRAME = 12
     JITFRAME_FIXED_SIZE = 28 # 13 GPR + 15 XMM
 
-assert PASS_ON_MY_FRAME >= 11       # asmgcc needs at least JIT_USE_WORDS + 2
+assert PASS_ON_MY_FRAME >= 12       # asmgcc needs at least JIT_USE_WORDS + 3
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -6,7 +6,7 @@
                                                 DEBUG_COUNTER, debug_bridge)
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
-from rpython.jit.metainterp.history import Const, Box
+from rpython.jit.metainterp.history import Const, Box, VOID
 from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
 from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
 from rpython.rtyper.lltypesystem.lloperation import llop
@@ -25,28 +25,17 @@
     RegLoc, FrameLoc, ConstFloatLoc, ImmedLoc, AddressLoc, imm,
     imm0, imm1, FloatImmedLoc, RawEbpLoc, RawEspLoc)
 from rpython.rlib.objectmodel import we_are_translated
-from rpython.jit.backend.x86 import rx86, codebuf
+from rpython.jit.backend.x86 import rx86, codebuf, callbuilder
 from rpython.jit.metainterp.resoperation import rop
 from rpython.jit.backend.x86 import support
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
 from rpython.rlib import rgc
-from rpython.rlib.clibffi import FFI_DEFAULT_ABI
-from rpython.jit.backend.x86.jump import remap_frame_layout
 from rpython.jit.codewriter.effectinfo import EffectInfo
 from rpython.jit.codewriter import longlong
 from rpython.rlib.rarithmetic import intmask, r_uint
 from rpython.rlib.objectmodel import compute_unique_id
 
 
-# darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
-# better safe than sorry
-CALL_ALIGN = 16 // WORD
-
-
-def align_stack_words(words):
-    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
-
-
 class Assembler386(BaseAssembler):
     _regalloc = None
     _output_loop_log = None
@@ -131,10 +120,10 @@
             mc.MOV_rs(esi.value, WORD*2)
             # push first arg
             mc.MOV_rr(edi.value, ebp.value)
-            align = align_stack_words(1)
+            align = callbuilder.align_stack_words(1)
             mc.SUB_ri(esp.value, (align - 1) * WORD)
         else:
-            align = align_stack_words(3)
+            align = callbuilder.align_stack_words(3)
             mc.MOV_rs(eax.value, WORD * 2)
             mc.SUB_ri(esp.value, (align - 1) * WORD)
             mc.MOV_sr(WORD, eax.value)
@@ -1014,175 +1003,24 @@
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
         return bool(gcrootmap) and not gcrootmap.is_shadow_stack
 
-    def _emit_call(self, x, arglocs, start=0, tmp=eax,
-                   argtypes=None, callconv=FFI_DEFAULT_ABI,
-                   # whether to worry about a CALL that can collect; this
-                   # is always true except in call_release_gil
-                   can_collect=True,
-                   # max number of arguments we can pass on esp; if more,
-                   # we need to decrease esp temporarily
-                   stack_max=PASS_ON_MY_FRAME):
-        #
-        if IS_X86_64:
-            return self._emit_call_64(x, arglocs, start, argtypes,
-                                      can_collect, stack_max)
-        stack_depth = 0
-        n = len(arglocs)
-        for i in range(start, n):
-            loc = arglocs[i]
-            stack_depth += loc.get_width() // WORD
-        if stack_depth > stack_max:
-            align = align_stack_words(stack_depth - stack_max)
-            self.mc.SUB_ri(esp.value, align * WORD)
-            if can_collect:
-                self.set_extra_stack_depth(self.mc, align * WORD)
+    def simple_call(self, fnloc, arglocs, result_loc=eax):
+        if result_loc is xmm0:
+            result_type = FLOAT
+            result_size = 8
+        elif result_loc is None:
+            result_type = VOID
+            result_size = 0
         else:
-            align = 0
-        p = 0
-        for i in range(start, n):
-            loc = arglocs[i]
-            if isinstance(loc, RegLoc):
-                if loc.is_xmm:
-                    self.mc.MOVSD_sx(p, loc.value)
-                else:
-                    self.mc.MOV_sr(p, loc.value)
-            p += loc.get_width()
-        p = 0
-        for i in range(start, n):
-            loc = arglocs[i]
-            if not isinstance(loc, RegLoc):
-                if loc.get_width() == 8:
-                    self.mc.MOVSD(xmm0, loc)
-                    self.mc.MOVSD_sx(p, xmm0.value)
-                else:
-                    self.mc.MOV(tmp, loc)
-                    self.mc.MOV_sr(p, tmp.value)
-            p += loc.get_width()
-        # x is a location
-        if can_collect:
-            # we push *now* the gcmap, describing the status of GC registers
-            # after the rearrangements done just above, ignoring the return
-            # value eax, if necessary
-            noregs = self.cpu.gc_ll_descr.is_shadow_stack()
-            gcmap = self._regalloc.get_gcmap([eax], noregs=noregs)
-            self.push_gcmap(self.mc, gcmap, store=True)
-        #
-        self.mc.CALL(x)
-        if callconv != FFI_DEFAULT_ABI:
-            self._fix_stdcall(callconv, p - align * WORD)
-        elif align:
-            self.mc.ADD_ri(esp.value, align * WORD)
-        #
-        if can_collect:
-            self._reload_frame_if_necessary(self.mc)
-            if align:
-                self.set_extra_stack_depth(self.mc, 0)
-            self.pop_gcmap(self.mc)
+            result_type = INT
+            result_size = WORD
+        cb = callbuilder.CallBuilder(self, fnloc, arglocs,
+                                     result_loc, result_type,
+                                     result_size)
+        cb.emit()
 
-    def _fix_stdcall(self, callconv, p):
-        from rpython.rlib.clibffi import FFI_STDCALL
-        assert callconv == FFI_STDCALL
-        # it's a bit stupid, but we're just going to cancel the fact that
-        # the called function just added 'p' to ESP, by subtracting it again.
-        self.mc.SUB_ri(esp.value, p)
-
-    def _emit_call_64(self, x, arglocs, start, argtypes,
-                      can_collect, stack_max):
-        src_locs = []
-        dst_locs = []
-        xmm_src_locs = []
-        xmm_dst_locs = []
-        singlefloats = None
-
-        # In reverse order for use with pop()
-        unused_gpr = [r9, r8, ecx, edx, esi, edi]
-        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
-
-        on_stack = 0
-        # count the stack depth
-        floats = 0
-        for i in range(start, len(arglocs)):
-            arg = arglocs[i]
-            if arg.is_float() or argtypes and argtypes[i - start] == 'S':
-                floats += 1
-        all_args = len(arglocs) - start
-        stack_depth = (max(all_args - floats - len(unused_gpr), 0) +
-                       max(floats - len(unused_xmm), 0))
-        align = 0
-        if stack_depth > stack_max:
-            align = align_stack_words(stack_depth - stack_max)
-            if can_collect:
-                self.set_extra_stack_depth(self.mc, align * WORD)
-            self.mc.SUB_ri(esp.value, align * WORD)
-        for i in range(start, len(arglocs)):
-            loc = arglocs[i]
-            if loc.is_float():
-                xmm_src_locs.append(loc)
-                if len(unused_xmm) > 0:
-                    xmm_dst_locs.append(unused_xmm.pop())
-                else:
-                    xmm_dst_locs.append(RawEspLoc(on_stack * WORD, FLOAT))
-                    on_stack += 1
-            elif argtypes is not None and argtypes[i-start] == 'S':
-                # Singlefloat argument
-                if singlefloats is None:
-                    singlefloats = []
-                if len(unused_xmm) > 0:
-                    singlefloats.append((loc, unused_xmm.pop()))
-                else:
-                    singlefloats.append((loc, RawEspLoc(on_stack * WORD, INT)))
-                    on_stack += 1
-            else:
-                src_locs.append(loc)
-                if len(unused_gpr) > 0:
-                    dst_locs.append(unused_gpr.pop())
-                else:
-                    dst_locs.append(RawEspLoc(on_stack * WORD, INT))
-                    on_stack += 1
-
-        # Handle register arguments: first remap the xmm arguments
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
-                           X86_64_XMM_SCRATCH_REG)
-        # Load the singlefloat arguments from main regs or stack to xmm regs
-        if singlefloats is not None:
-            for src, dst in singlefloats:
-                if isinstance(dst, RawEspLoc):
-                    # XXX too much special logic
-                    if isinstance(src, RawEbpLoc):
-                        self.mc.MOV32(X86_64_SCRATCH_REG, src)
-                        self.mc.MOV32(dst, X86_64_SCRATCH_REG)
-                    else:
-                        self.mc.MOV32(dst, src)
-                    continue
-                if isinstance(src, ImmedLoc):
-                    self.mc.MOV(X86_64_SCRATCH_REG, src)
-                    src = X86_64_SCRATCH_REG
-                self.mc.MOVD(dst, src)
-        # Finally remap the arguments in the main regs
-        # If x is a register and is in dst_locs, then oups, it needs to
-        # be moved away:
-        if x in dst_locs:
-            src_locs.append(x)
-            dst_locs.append(r10)
-            x = r10
-        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        if can_collect:
-            # we push *now* the gcmap, describing the status of GC registers
-            # after the rearrangements done just above, ignoring the return
-            # value eax, if necessary
-            noregs = self.cpu.gc_ll_descr.is_shadow_stack()
-            gcmap = self._regalloc.get_gcmap([eax], noregs=noregs)
-            self.push_gcmap(self.mc, gcmap, store=True)
-        #
-        self.mc.CALL(x)
-        if align:
-            self.mc.ADD_ri(esp.value, align * WORD)
-        #
-        if can_collect:
-            self._reload_frame_if_necessary(self.mc)
-            if align:
-                self.set_extra_stack_depth(self.mc, 0)
-            self.pop_gcmap(self.mc)
+    def simple_call_no_collect(self, fnloc, arglocs):
+        cb = callbuilder.CallBuilder(self, fnloc, arglocs)
+        cb.emit_no_collect()
 
     def _reload_frame_if_necessary(self, mc, align_stack=False):
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
@@ -1198,10 +1036,6 @@
             self._write_barrier_fastpath(mc, wbdescr, [ebp], array=False,
                                          is_frame=True, align_stack=align_stack)
 
-    def call(self, addr, args, res):
-        self._emit_call(imm(addr), args)
-        assert res is eax
-
     genop_int_neg = _unaryop("NEG")
     genop_int_invert = _unaryop("NOT")
     genop_int_add = _binaryop_or_lea("ADD", True)
@@ -1446,7 +1280,7 @@
     # ----------
 
     def genop_call_malloc_gc(self, op, arglocs, result_loc):
-        self.genop_call(op, arglocs, result_loc)
+        self._genop_call(op, arglocs, result_loc)
         self.propagate_memoryerror_if_eax_is_null()
 
     def propagate_memoryerror_if_eax_is_null(self):
@@ -1993,75 +1827,29 @@
         self.pending_guard_tokens.append(guard_token)
 
     def genop_call(self, op, arglocs, resloc):
-        return self._genop_call(op, arglocs, resloc)
+        self._genop_call(op, arglocs, resloc)
 
     def _genop_call(self, op, arglocs, resloc, is_call_release_gil=False):
         from rpython.jit.backend.llsupport.descr import CallDescr
 
-        sizeloc = arglocs[0]
-        assert isinstance(sizeloc, ImmedLoc)
-        size = sizeloc.value
-        signloc = arglocs[1]
-
-        x = arglocs[2]     # the function address
-        if x is eax:
-            tmp = ecx
-        else:
-            tmp = eax
+        cb = callbuilder.CallBuilder(self, arglocs[2], arglocs[3:], resloc)
 
         descr = op.getdescr()
         assert isinstance(descr, CallDescr)
+        cb.callconv = descr.get_call_conv()
+        cb.argtypes = descr.get_arg_types()
+        cb.restype  = descr.get_result_type()
+        sizeloc = arglocs[0]
+        assert isinstance(sizeloc, ImmedLoc)
+        cb.ressize = sizeloc.value
+        signloc = arglocs[1]
+        assert isinstance(signloc, ImmedLoc)
+        cb.ressign = signloc.value
 
-        stack_max = PASS_ON_MY_FRAME
         if is_call_release_gil:
-            if self._is_asmgcc():
-                from rpython.memory.gctransform import asmgcroot
-                stack_max -= asmgcroot.JIT_USE_WORDS
-            can_collect = False
+            cb.emit_call_release_gil()
         else:
-            can_collect = True
-
-        self._emit_call(x, arglocs, 3, tmp=tmp,
-                        argtypes=descr.get_arg_types(),
-                        callconv=descr.get_call_conv(),
-                        can_collect=can_collect,
-                        stack_max=stack_max)
-
-        if IS_X86_32 and isinstance(resloc, FrameLoc) and resloc.type == FLOAT:
-            # a float or a long long return
-            if descr.get_result_type() == 'L':
-                self.mc.MOV_br(resloc.value, eax.value)      # long long
-                self.mc.MOV_br(resloc.value + 4, edx.value)
-                # XXX should ideally not move the result on the stack,
-                #     but it's a mess to load eax/edx into a xmm register
-                #     and this way is simpler also because the result loc
-                #     can just be always a stack location
-            else:
-                self.mc.FSTPL_b(resloc.value)   # float return
-        elif descr.get_result_type() == 'S':
-            # singlefloat return
-            assert resloc is eax
-            if IS_X86_32:
-                # must convert ST(0) to a 32-bit singlefloat and load it into EAX
-                # mess mess mess
-                self.mc.SUB_ri(esp.value, 4)
-                self.mc.FSTPS_s(0)
-                self.mc.POP_r(eax.value)
-            elif IS_X86_64:
-                # must copy from the lower 32 bits of XMM0 into eax
-                self.mc.MOVD_rx(eax.value, xmm0.value)
-        elif size == WORD:
-            assert resloc is eax or resloc is xmm0    # a full word
-        elif size == 0:
-            pass    # void return
-        else:
-            # use the code in load_from_mem to do the zero- or sign-extension
-            assert resloc is eax
-            if size == 1:
-                srcloc = eax.lowest8bits()
-            else:
-                srcloc = eax
-            self.load_from_mem(eax, srcloc, sizeloc, signloc)
+            cb.emit()
 
     def _store_force_index(self, guard_op):
         faildescr = guard_op.getdescr()
@@ -2077,64 +1865,15 @@
     def genop_guard_call_may_force(self, op, guard_op, guard_token,
                                    arglocs, result_loc):
         self._store_force_index(guard_op)
-        self.genop_call(op, arglocs, result_loc)
+        self._genop_call(op, arglocs, result_loc)
         self._emit_guard_not_forced(guard_token)
 
     def genop_guard_call_release_gil(self, op, guard_op, guard_token,
                                      arglocs, result_loc):
         self._store_force_index(guard_op)
-        # first, close the stack in the sense of the asmgcc GC root tracker
-        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
-        if gcrootmap:
-            # we put the gcmap now into the frame before releasing the GIL,
-            # and pop it below after reacquiring the GIL.  The assumption
-            # is that this gcmap describes correctly the situation at any
-            # point in-between: all values containing GC pointers should
-            # be safely saved out of registers by now, and will not be
-            # manipulated by any of the following CALLs.
-            gcmap = self._regalloc.get_gcmap(noregs=True)
-            self.push_gcmap(self.mc, gcmap, store=True)
-            self.call_release_gil(gcrootmap, arglocs)
-        # do the call
         self._genop_call(op, arglocs, result_loc, is_call_release_gil=True)
-        # then reopen the stack
-        if gcrootmap:
-            self.call_reacquire_gil(gcrootmap, result_loc)
-            self.pop_gcmap(self.mc)     # remove the gcmap saved above
-        # finally, the guard_not_forced
         self._emit_guard_not_forced(guard_token)
 
-    def call_release_gil(self, gcrootmap, save_registers):
-        if gcrootmap.is_shadow_stack:
-            args = []
-        else:
-            from rpython.memory.gctransform import asmgcroot
-            # build a 'css' structure on the stack: 2 words for the linkage,
-            # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
-            # total size of JIT_USE_WORDS.  This structure is found at
-            # [ESP+css].
-            css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
-            assert css >= 2
-            # Save ebp
-            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
-            self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
-            # Save the "return address": we pretend that it's css
-            if IS_X86_32:
-                reg = eax
-            elif IS_X86_64:
-                reg = edi
-            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
-            frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
-            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
-            # Set up jf_extra_stack_depth to pretend that the return address
-            # was at css, and so our stack frame is supposedly shorter by
-            # (css+WORD) bytes
-            self.set_extra_stack_depth(self.mc, -css-WORD)
-            # Call the closestack() function (also releasing the GIL)
-            args = [reg]
-        #
-        self._emit_call(imm(self.releasegil_addr), args, can_collect=False)
-
     def call_reacquire_gil(self, gcrootmap, save_loc):
         # save the previous result (eax/xmm0) into the stack temporarily.
         # XXX like with call_release_gil(), we assume that we don't need
@@ -2186,11 +1925,11 @@
         self.call_assembler(op, guard_op, argloc, vloc, result_loc, eax)
         self._emit_guard_not_forced(guard_token)
 
-    def _call_assembler_emit_call(self, addr, argloc, tmploc):
-        self._emit_call(addr, [argloc], 0, tmp=tmploc)
+    def _call_assembler_emit_call(self, addr, argloc, _):
+        self.simple_call(addr, [argloc])
 
-    def _call_assembler_emit_helper_call(self, addr, arglocs, _):
-         self._emit_call(addr, arglocs, 0, tmp=self._second_tmp_reg)
+    def _call_assembler_emit_helper_call(self, addr, arglocs, result_loc):
+        self.simple_call(addr, arglocs, result_loc)
 
     def _call_assembler_check_descr(self, value, tmploc):
         ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -0,0 +1,577 @@
+from rpython.rlib.clibffi import FFI_DEFAULT_ABI
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.jit.metainterp.history import INT, FLOAT
+from rpython.jit.backend.x86.arch import (WORD, IS_X86_64, IS_X86_32,
+                                          PASS_ON_MY_FRAME)
+from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx, esp, ebp, esi,
+    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, r8, r9, r10, r11, edi,
+    r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
+    RegLoc, RawEspLoc, RawEbpLoc, imm, ImmedLoc)
+from rpython.jit.backend.x86.jump import remap_frame_layout
+
+
+# darwin requires the stack to be 16 bytes aligned on calls.
+# Same for gcc 4.5.0, better safe than sorry
+CALL_ALIGN = 16 // WORD
+
+def align_stack_words(words):
+    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
+
+
+
+class AbstractCallBuilder(object):
+
+    # max number of words we have room for at esp; if we need more for
+    # arguments, we need to decrease esp temporarily
+    stack_max = PASS_ON_MY_FRAME
+
+    # this can be set to guide more complex calls: gives the detailed
+    # type of the arguments
+    argtypes = ""
+    ressign = False
+
+    # this is the calling convention (can be FFI_STDCALL on Windows)
+    callconv = FFI_DEFAULT_ABI
+
+    # is it for the main CALL of a call_release_gil?
+    is_call_release_gil = False
+
+    # set by save_result_value()
+    tmpresloc = None
+
+
+    def __init__(self, assembler, fnloc, arglocs,
+                 resloc=eax, restype=INT, ressize=WORD):
+        # Avoid tons of issues with a non-immediate fnloc by sticking it
+        # as an extra argument if needed
+        self.fnloc_is_immediate = isinstance(fnloc, ImmedLoc)
+        if self.fnloc_is_immediate:
+            self.fnloc = fnloc
+            self.arglocs = arglocs
+        else:
+            self.arglocs = arglocs + [fnloc]
+        self.asm = assembler
+        self.mc = assembler.mc
+        self.resloc = resloc
+        self.restype = restype
+        self.ressize = ressize
+        self.current_esp = 0     # 0 or (usually) negative, counted in bytes
+
+    def emit_no_collect(self):
+        """Emit a call that cannot collect."""
+        self.prepare_arguments()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.load_result()
+
+    def emit(self):
+        """Emit a regular call; not for CALL_RELEASE_GIL."""
+        self.prepare_arguments()
+        self.push_gcmap()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.pop_gcmap()
+        self.load_result()
+
+    def emit_call_release_gil(self):
+        """Emit a CALL_RELEASE_GIL, including calls to releasegil_addr
+        and reacqgil_addr."""
+        self.select_call_release_gil_mode()
+        self.prepare_arguments()
+        self.push_gcmap_for_call_release_gil()
+        self.call_releasegil_addr_and_move_real_arguments()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.move_real_result_and_call_reacqgil_addr()
+        self.pop_gcmap()
+        self.load_result()
+
+    def select_call_release_gil_mode(self):
+        """Overridden in CallBuilder64"""
+        self.is_call_release_gil = True
+        if self.asm._is_asmgcc():
+            from rpython.memory.gctransform import asmgcroot
+            self.stack_max = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS
+            assert self.stack_max >= 3
+
+    def emit_raw_call(self):
+        self.mc.CALL(self.fnloc)
+        if self.callconv != FFI_DEFAULT_ABI:
+            self.current_esp += self._fix_stdcall(self.callconv)
+
+    def subtract_esp_aligned(self, count):
+        if count > 0:
+            align = align_stack_words(count)
+            self.current_esp -= align * WORD
+            self.mc.SUB_ri(esp.value, align * WORD)
+
+    def restore_esp(self, target_esp=0):
+        if self.current_esp != target_esp:
+            self.mc.ADD_ri(esp.value, target_esp - self.current_esp)
+            self.current_esp = target_esp
+
+    def load_result(self):
+        """Overridden in CallBuilder32 and CallBuilder64"""
+        if self.ressize == 0:
+            return      # void result
+        # use the code in load_from_mem to do the zero- or sign-extension
+        srcloc = self.tmpresloc
+        if srcloc is None:
+            if self.restype == FLOAT:
+                srcloc = xmm0
+            else:
+                srcloc = eax
+        if self.ressize >= WORD and self.resloc is srcloc:
+            return      # no need for any MOV
+        if self.ressize == 1 and isinstance(srcloc, RegLoc):
+            srcloc = srcloc.lowest8bits()
+        self.asm.load_from_mem(self.resloc, srcloc,
+                               imm(self.ressize), imm(self.ressign))
+
+    def push_gcmap(self):
+        # we push *now* the gcmap, describing the status of GC registers
+        # after the rearrangements done just before, ignoring the return
+        # value eax, if necessary
+        assert not self.is_call_release_gil
+        self.change_extra_stack_depth = (self.current_esp != 0)
+        if self.change_extra_stack_depth:
+            self.asm.set_extra_stack_depth(self.mc, -self.current_esp)
+        noregs = self.asm.cpu.gc_ll_descr.is_shadow_stack()
+        gcmap = self.asm._regalloc.get_gcmap([eax], noregs=noregs)
+        self.asm.push_gcmap(self.mc, gcmap, store=True)
+
+    def push_gcmap_for_call_release_gil(self):
+        assert self.is_call_release_gil
+        # we put the gcmap now into the frame before releasing the GIL,
+        # and pop it after reacquiring the GIL.  The assumption
+        # is that this gcmap describes correctly the situation at any
+        # point in-between: all values containing GC pointers should
+        # be safely saved out of registers by now, and will not be
+        # manipulated by any of the following CALLs.
+        gcmap = self.asm._regalloc.get_gcmap(noregs=True)
+        self.asm.push_gcmap(self.mc, gcmap, store=True)
+
+    def pop_gcmap(self):
+        self.asm._reload_frame_if_necessary(self.mc)
+        if self.change_extra_stack_depth:
+            self.asm.set_extra_stack_depth(self.mc, 0)
+        self.asm.pop_gcmap(self.mc)
+
+    def call_releasegil_addr_and_move_real_arguments(self):
+        initial_esp = self.current_esp
+        self.save_register_arguments()
+        #
+        if not self.asm._is_asmgcc():
+            # the helper takes no argument
+            self.change_extra_stack_depth = False
+        else:
+            from rpython.memory.gctransform import asmgcroot
+            # build a 'css' structure on the stack: 2 words for the linkage,
+            # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
+            # total size of JIT_USE_WORDS.  This structure is found at
+            # [ESP+css].
+            css = -self.current_esp + (
+                WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS))
+            assert css >= 2 * WORD
+            # Save ebp
+            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
+            self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
+            # Save the "return address": we pretend that it's css
+            if IS_X86_32:
+                reg = eax
+            elif IS_X86_64:
+                reg = edi
+            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
+            frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
+            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
+            # Set up jf_extra_stack_depth to pretend that the return address
+            # was at css, and so our stack frame is supposedly shorter by
+            # (PASS_ON_MY_FRAME-JIT_USE_WORDS+1) words
+            delta = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS + 1
+            self.change_extra_stack_depth = True
+            self.asm.set_extra_stack_depth(self.mc, -delta * WORD)
+            # Call the closestack() function (also releasing the GIL)
+            # with 'reg' as argument
+            if IS_X86_32:
+                self.subtract_esp_aligned(1)
+                self.mc.MOV_sr(0, reg.value)
+            #else:
+            #   on x86_64, reg is edi so that it is already correct
+        #
+        self.mc.CALL(imm(self.asm.releasegil_addr))
+        #
+        if not we_are_translated():        # for testing: we should not access
+            self.mc.ADD(ebp, imm(1))       # ebp any more
+        #
+        self.restore_register_arguments()
+        self.restore_esp(initial_esp)
+
+    def save_register_arguments(self):
+        """Overridden in CallBuilder64"""
+
+    def restore_register_arguments(self):
+        """Overridden in CallBuilder64"""
+
+    def move_real_result_and_call_reacqgil_addr(self):
+        # save the result we just got (in eax/eax+edx/st(0)/xmm0)
+        self.save_result_value()
+        # call the reopenstack() function (also reacquiring the GIL)
+        if not self.asm._is_asmgcc():
+            css = 0     # the helper takes no argument
+        else:
+            from rpython.memory.gctransform import asmgcroot
+            css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
+            if IS_X86_32:
+                reg = eax
+            elif IS_X86_64:
+                reg = edi
+            self.mc.LEA_rs(reg.value, css)
+            if IS_X86_32:
+                self.mc.MOV_sr(0, reg.value)
+        #
+        self.mc.CALL(imm(self.asm.reacqgil_addr))
+        #
+        if not we_are_translated():        # for testing: now we can access
+            self.mc.SUB(ebp, imm(1))       # ebp again
+        #
+        # Now that we reacquired the GIL, we can reload a possibly modified ebp
+        if self.asm._is_asmgcc():
+            # special-case: reload ebp from the css
+            from rpython.memory.gctransform import asmgcroot
+            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
+            self.mc.MOV_rs(ebp.value, index_of_ebp)  # MOV EBP, [css.ebp]
+        #else:
+        #   for shadowstack, done for us by _reload_frame_if_necessary()
+
+    def save_result_value(self):
+        """Overridden in CallBuilder32 and CallBuilder64"""
+        raise NotImplementedError
+
+
+class CallBuilder32(AbstractCallBuilder):
+
+    def prepare_arguments(self):
+        arglocs = self.arglocs
+        stack_depth = 0
+        n = len(arglocs)
+        for i in range(n):
+            loc = arglocs[i]
+            stack_depth += loc.get_width() // WORD
+        self.subtract_esp_aligned(stack_depth - self.stack_max)
+        #
+        p = 0
+        for i in range(n):
+            loc = arglocs[i]
+            if isinstance(loc, RegLoc):
+                if loc.is_xmm:
+                    self.mc.MOVSD_sx(p, loc.value)
+                else:
+                    self.mc.MOV_sr(p, loc.value)
+            p += loc.get_width()
+        p = 0
+        for i in range(n):
+            loc = arglocs[i]
+            if not isinstance(loc, RegLoc):
+                if loc.get_width() == 8:
+                    self.mc.MOVSD(xmm0, loc)
+                    self.mc.MOVSD_sx(p, xmm0.value)
+                elif isinstance(loc, ImmedLoc):
+                    self.mc.MOV_si(p, loc.value)
+                else:
+                    self.mc.MOV(eax, loc)
+                    self.mc.MOV_sr(p, eax.value)
+            p += loc.get_width()
+        self.total_stack_used_by_arguments = p
+        #
+        if not self.fnloc_is_immediate:    # the last "argument" pushed above
+            self.fnloc = RawEspLoc(p - WORD, INT)
+
+
+    def _fix_stdcall(self, callconv):
+        from rpython.rlib.clibffi import FFI_STDCALL
+        assert callconv == FFI_STDCALL
+        return self.total_stack_used_by_arguments
+
+    def load_result(self):
+        resloc = self.resloc
+        if resloc is not None and resloc.is_float():
+            # a float or a long long return
+            if self.tmpresloc is None:
+                if self.restype == 'L':     # long long
+                    # move eax/edx -> xmm0
+                    self.mc.MOVD_xr(resloc.value^1, edx.value)
+                    self.mc.MOVD_xr(resloc.value,   eax.value)
+                    self.mc.PUNPCKLDQ_xx(resloc.value, resloc.value^1)
+                else:
+                    # float: we have to go via the stack
+                    self.mc.FSTPL_s(0)
+                    self.mc.MOVSD_xs(resloc.value, 0)
+            else:
+                self.mc.MOVSD(resloc, self.tmpresloc)
+            #
+        elif self.restype == 'S':
+            # singlefloat return: must convert ST(0) to a 32-bit singlefloat
+            # and load it into self.resloc.  mess mess mess
+            if self.tmpresloc is None:
+                self.mc.FSTPS_s(0)
+                self.mc.MOV_rs(resloc.value, 0)
+            else:
+                self.mc.MOV(resloc, self.tmpresloc)
+        else:
+            AbstractCallBuilder.load_result(self)
+
+    def save_result_value(self):
+        # Temporarily save the result value into [ESP+4].  We use "+4"
+        # in order to leave the word at [ESP+0] free, in case it's needed
+        if self.ressize == 0:      # void return
+            return
+        if self.resloc.is_float():
+            # a float or a long long return
+            self.tmpresloc = RawEspLoc(4, FLOAT)
+            if self.restype == 'L':
+                self.mc.MOV_sr(4, eax.value)      # long long
+                self.mc.MOV_sr(8, edx.value)
+            else:
+                self.mc.FSTPL_s(4)                # float return
+        else:
+            self.tmpresloc = RawEspLoc(4, INT)
+            if self.restype == 'S':
+                self.mc.FSTPS_s(4)
+            else:
+                assert self.restype == INT
+                assert self.ressize <= WORD
+                self.mc.MOV_sr(4, eax.value)
+
+
+class CallBuilder64(AbstractCallBuilder):
+
+    ARGUMENTS_GPR = [edi, esi, edx, ecx, r8, r9]
+    ARGUMENTS_XMM = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
+    DONT_MOVE_GPR = []
+    _ALL_CALLEE_SAVE_GPR = [ebx, r12, r13, r14, r15]
+
+    next_arg_gpr = 0
+    next_arg_xmm = 0
+
+    def _unused_gpr(self, hint):
+        i = self.next_arg_gpr
+        self.next_arg_gpr = i + 1
+        try:
+            res = self.ARGUMENTS_GPR[i]
+        except IndexError:
+            return None
+        if hint in self.DONT_MOVE_GPR:
+            self.ARGUMENTS_GPR[i] = hint
+            res = hint
+        return res
+
+    def _unused_xmm(self):
+        i = self.next_arg_xmm
+        self.next_arg_xmm = i + 1
+        try:
+            return self.ARGUMENTS_XMM[i]
+        except IndexError:
+            return None
+
+    def _permute_to_prefer_unused_registers(self, lst):
+        # permute 'lst' so that it starts with registers that are not
+        # in 'self.already_used', and ends with registers that are.
+        N = len(lst)
+        i = 0
+        while i < N:
+            reg = lst[i]
+            if reg in self.already_used:
+                # move this reg to the end, and decrement N
+                N -= 1
+                assert N >= i
+                lst[N], lst[i] = lst[i], lst[N]
+            else:
+                i += 1
+
+    def select_call_release_gil_mode(self):
+        AbstractCallBuilder.select_call_release_gil_mode(self)
+        # We have to copy the arguments around a bit more in this mode,
+        # but on the other hand we don't need prepare_arguments() moving
+        # them in precisely the final registers.  Here we look around for
+        # unused registers that may be more likely usable.
+        from rpython.jit.backend.x86.regalloc import X86_64_RegisterManager
+        from rpython.jit.backend.x86.regalloc import X86_64_XMMRegisterManager
+        self.already_used = {}
+        for loc in self.arglocs:
+            self.already_used[loc] = None
+        #
+        lst = X86_64_RegisterManager.save_around_call_regs[:]
+        self._permute_to_prefer_unused_registers(lst)
+        # <optimization>
+        extra = []
+        for reg in self.asm._regalloc.rm.free_regs:
+            if (reg not in self.already_used and
+                    reg in self._ALL_CALLEE_SAVE_GPR):
+                extra.append(reg)
+        self.free_callee_save_gprs = extra
+        lst = extra + lst
+        # </optimization>
+        self.ARGUMENTS_GPR = lst[:len(self.ARGUMENTS_GPR)]
+        self.DONT_MOVE_GPR = self._ALL_CALLEE_SAVE_GPR
+        #
+        lst = X86_64_XMMRegisterManager.save_around_call_regs[:]
+        self._permute_to_prefer_unused_registers(lst)
+        self.ARGUMENTS_XMM = lst[:len(self.ARGUMENTS_XMM)]
+
+    def prepare_arguments(self):
+        src_locs = []
+        dst_locs = []
+        xmm_src_locs = []
+        xmm_dst_locs = []
+        singlefloats = None
+
+        arglocs = self.arglocs
+        argtypes = self.argtypes
+
+        on_stack = 0
+        for i in range(len(arglocs)):
+            loc = arglocs[i]
+            if loc.is_float():
+                tgt = self._unused_xmm()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, FLOAT)
+                    on_stack += 1
+                xmm_src_locs.append(loc)
+                xmm_dst_locs.append(tgt)
+            elif i < len(argtypes) and argtypes[i] == 'S':
+                # Singlefloat argument
+                if singlefloats is None:
+                    singlefloats = []
+                tgt = self._unused_xmm()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, INT)
+                    on_stack += 1
+                singlefloats.append((loc, tgt))
+            else:
+                tgt = self._unused_gpr(hint=loc)
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, INT)
+                    on_stack += 1
+                src_locs.append(loc)
+                dst_locs.append(tgt)
+
+        if not self.fnloc_is_immediate:
+            self.fnloc = dst_locs[-1]     # the last "argument" prepared above
+
+        if not we_are_translated():  # assert that we got the right stack depth
+            floats = 0
+            for i in range(len(arglocs)):
+                arg = arglocs[i]
+                if arg.is_float() or (i < len(argtypes) and argtypes[i]=='S'):
+                    floats += 1
+            all_args = len(arglocs)
+            stack_depth = (max(all_args - floats - len(self.ARGUMENTS_GPR), 0)
+                           + max(floats - len(self.ARGUMENTS_XMM), 0))
+            assert stack_depth == on_stack
+
+        self.subtract_esp_aligned(on_stack - self.stack_max)
+
+        # Handle register arguments: first remap the xmm arguments
+        remap_frame_layout(self.asm, xmm_src_locs, xmm_dst_locs,
+                           X86_64_XMM_SCRATCH_REG)
+        # Load the singlefloat arguments from main regs or stack to xmm regs
+        if singlefloats is not None:
+            for src, dst in singlefloats:
+                if isinstance(dst, RawEspLoc):
+                    # XXX too much special logic
+                    if isinstance(src, RawEbpLoc):
+                        self.mc.MOV32(X86_64_SCRATCH_REG, src)
+                        self.mc.MOV32(dst, X86_64_SCRATCH_REG)
+                    else:
+                        self.mc.MOV32(dst, src)
+                    continue
+                if isinstance(src, ImmedLoc):
+                    self.mc.MOV(X86_64_SCRATCH_REG, src)
+                    src = X86_64_SCRATCH_REG
+                self.mc.MOVD(dst, src)
+        # Finally remap the arguments in the main regs
+        remap_frame_layout(self.asm, src_locs, dst_locs, X86_64_SCRATCH_REG)
+
+
+    def _fix_stdcall(self, callconv):
+        assert 0     # should not occur on 64-bit
+
+    def load_result(self):
+        if self.restype == 'S' and self.tmpresloc is None:
+            # singlefloat return: use MOVD to load the target register
+            # from the lower 32 bits of XMM0
+            self.mc.MOVD(self.resloc, xmm0)
+        else:
+            AbstractCallBuilder.load_result(self)
+
+    def save_result_value(self):
+        # Temporarily save the result value into [ESP].
+        if self.ressize == 0:      # void return
+            return
+        #
+        if self.restype == FLOAT:    # and not 'S'
+            self.mc.MOVSD_sx(0, xmm0.value)
+            self.tmpresloc = RawEspLoc(0, FLOAT)
+            return
+        #
+        if len(self.free_callee_save_gprs) == 0:
+            self.tmpresloc = RawEspLoc(0, INT)
+        else:
+            self.tmpresloc = self.free_callee_save_gprs[0]
+        #
+        if self.restype == 'S':
+            # singlefloat return: use MOVD to store the lower 32 bits
+            # of XMM0 into the tmpresloc (register or [ESP])
+            self.mc.MOVD(self.tmpresloc, xmm0)
+        else:
+            assert self.restype == INT
+            self.mc.MOV(self.tmpresloc, eax)
+
+    def save_register_arguments(self):
+        # Save the argument registers, which are given by self.ARGUMENTS_xxx.
+        n_gpr = min(self.next_arg_gpr, len(self.ARGUMENTS_GPR))
+        n_xmm = min(self.next_arg_xmm, len(self.ARGUMENTS_XMM))
+        n_saved_regs = n_gpr + n_xmm
+        for i in range(n_gpr):
+            if self.ARGUMENTS_GPR[i] in self._ALL_CALLEE_SAVE_GPR:
+                n_saved_regs -= 1     # don't need to save it
+        self.subtract_esp_aligned(n_saved_regs)
+        #
+        n = 0
+        for i in range(n_gpr):
+            if self.ARGUMENTS_GPR[i] not in self._ALL_CALLEE_SAVE_GPR:
+                self.mc.MOV_sr(n * WORD, self.ARGUMENTS_GPR[i].value)
+                n += 1
+        for i in range(n_xmm):
+            self.mc.MOVSD_sx(n * WORD, self.ARGUMENTS_XMM[i].value)
+            n += 1
+        assert n == n_saved_regs
+        self.n_saved_regs = n_saved_regs
+
+    def restore_register_arguments(self):
+        # Restore the saved values into the *real* registers used for calls
+        # --- which are not self.ARGUMENTS_xxx!
+        n_gpr = min(self.next_arg_gpr, len(self.ARGUMENTS_GPR))
+        n_xmm = min(self.next_arg_xmm, len(self.ARGUMENTS_XMM))
+        #
+        n = 0
+        for i in range(n_gpr):
+            tgtvalue = CallBuilder64.ARGUMENTS_GPR[i].value
+            if self.ARGUMENTS_GPR[i] not in self._ALL_CALLEE_SAVE_GPR:
+                self.mc.MOV_rs(tgtvalue, n * WORD)
+                n += 1
+            else:
+                self.mc.MOV_rr(tgtvalue, self.ARGUMENTS_GPR[i].value)
+        for i in range(n_xmm):
+            self.mc.MOVSD_xs(CallBuilder64.ARGUMENTS_XMM[i].value, n * WORD)
+            n += 1
+        assert n == self.n_saved_regs
+        #
+        if isinstance(self.fnloc, RegLoc):    # fix this register
+            self.fnloc = CallBuilder64.ARGUMENTS_GPR[n_gpr - 1]
+
+
+if IS_X86_32:
+    CallBuilder = CallBuilder32
+if IS_X86_64:
+    CallBuilder = CallBuilder64
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -79,26 +79,14 @@
         rffi.cast(rffi.CArrayPtr(longlong.FLOATSTORAGE), adr)[1] = y
         return ConstFloatLoc(adr)
 
-    def after_call(self, v):
-        # the result is stored in st0, but we don't have this around,
-        # so genop_call will move it to some frame location immediately
-        # after the call
-        return self.frame_manager.loc(v)
+    def call_result_location(self, v):
+        return xmm0
 
 class X86_64_XMMRegisterManager(X86XMMRegisterManager):
     # xmm15 reserved for scratch use
     all_regs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14]
     save_around_call_regs = all_regs
 
-    def call_result_location(self, v):
-        return xmm0
-
-    def after_call(self, v):
-        # We use RegisterManager's implementation, since X86XMMRegisterManager
-        # places the result on the stack, which we don't need to do when the
-        # calling convention places the result in xmm0
-        return RegisterManager.after_call(self, v)
-
 class X86FrameManager(FrameManager):
     def __init__(self, base_ofs):
         FrameManager.__init__(self)
@@ -799,14 +787,6 @@
         self._consider_call(op, guard_op)
 
     def consider_call_release_gil(self, op, guard_op):
-        # We spill the arguments to the stack, because we need to do 3 calls:
-        # call_release_gil(), the_real_c_function(), and call_reacquire_gil().
-        # The arguments are used on the second call only.  XXX we assume
-        # that the XMM arguments won't be modified by call_release_gil().
-        for i in range(op.numargs()):
-            loc = self.loc(op.getarg(i))
-            if loc in self.rm.save_around_call_regs:
-                self.rm.force_spill_var(op.getarg(i))
         assert guard_op is not None
         self._consider_call(op, guard_op)
 
@@ -1151,9 +1131,8 @@
         # call memcpy()
         self.rm.before_call()
         self.xrm.before_call()
-        self.assembler._emit_call(imm(self.assembler.memcpy_addr),
-                                  [dstaddr_loc, srcaddr_loc, length_loc],
-                                  can_collect=False)
+        self.assembler.simple_call_no_collect(imm(self.assembler.memcpy_addr),
+                                        [dstaddr_loc, srcaddr_loc, length_loc])
         self.rm.possibly_free_var(length_box)
         self.rm.possibly_free_var(dstaddr_box)
         self.rm.possibly_free_var(srcaddr_box)
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -553,6 +553,7 @@
     CALL_l = insn('\xE8', relative(1))
     CALL_r = insn(rex_nw, '\xFF', register(1), chr(0xC0 | (2<<3)))
     CALL_b = insn('\xFF', orbyte(2<<3), stack_bp(1))
+    CALL_s = insn('\xFF', orbyte(2<<3), stack_sp(1))
 
     # XXX: Only here for testing purposes..."as" happens the encode the
     # registers in the opposite order that we would otherwise do in a
@@ -583,6 +584,7 @@
 
     # x87 instructions
     FSTPL_b = insn('\xDD', orbyte(3<<3), stack_bp(1)) # rffi.DOUBLE ('as' wants L??)
+    FSTPL_s = insn('\xDD', orbyte(3<<3), stack_sp(1)) # rffi.DOUBLE ('as' wants L??)
     FSTPS_s = insn('\xD9', orbyte(3<<3), stack_sp(1)) # lltype.SingleFloat
 
     # ------------------------------ Random mess -----------------------