[pypy-svn] r76575 - in pypy/branch/fast-ctypes: . lib_pypy lib_pypy/pypy_test pypy/config pypy/doc pypy/doc/config pypy/interpreter pypy/interpreter/astcompiler pypy/interpreter/test pypy/jit/backend pypy/jit/backend/llsupport pypy/jit/backend/llsupport/test pypy/jit/backend/test pypy/jit/backend/x86 pypy/jit/backend/x86/test pypy/jit/backend/x86/tool pypy/jit/metainterp pypy/jit/metainterp/test pypy/jit/tl pypy/module/_demo pypy/module/_stackless pypy/module/array pypy/module/pypyjit pypy/module/pypyjit/test pypy/module/signal pypy/module/test_lib_pypy pypy/objspace/std pypy/objspace/std/test pypy/rlib pypy/rlib/test pypy/rpython pypy/rpython/lltypesystem pypy/rpython/tool pypy/translator/c pypy/translator/platform

getxsick at codespeak.net
Tue Aug 10 20:07:29 CEST 2010


Author: getxsick
Date: Tue Aug 10 20:07:15 2010
New Revision: 76575

Added:
   pypy/branch/fast-ctypes/lib_pypy/array.py
      - copied unchanged from r76574, pypy/trunk/lib_pypy/array.py
   pypy/branch/fast-ctypes/pypy/doc/config/objspace.usemodules.array.txt
      - copied unchanged from r76574, pypy/trunk/pypy/doc/config/objspace.usemodules.array.txt
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/arch.py
      - copied unchanged from r76574, pypy/trunk/pypy/jit/backend/x86/arch.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/regloc.py
      - copied unchanged from r76574, pypy/trunk/pypy/jit/backend/x86/regloc.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/rx86.py
      - copied unchanged from r76574, pypy/trunk/pypy/jit/backend/x86/rx86.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regloc.py
      - copied unchanged from r76574, pypy/trunk/pypy/jit/backend/x86/test/test_regloc.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_rx86.py
      - copied unchanged from r76574, pypy/trunk/pypy/jit/backend/x86/test/test_rx86.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py
      - copied unchanged from r76574, pypy/trunk/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_rx86_64_auto_encoding.py
      - copied unchanged from r76574, pypy/trunk/pypy/jit/backend/x86/test/test_rx86_64_auto_encoding.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/tool/instruction_encoding.sh
      - copied unchanged from r76574, pypy/trunk/pypy/jit/backend/x86/tool/instruction_encoding.sh
   pypy/branch/fast-ctypes/pypy/module/array/
      - copied from r76574, pypy/trunk/pypy/module/array/
Removed:
   pypy/branch/fast-ctypes/lib_pypy/greenlet.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/ri386.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/ri386setup.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_ri386.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_ri386_auto_encoding.py
   pypy/branch/fast-ctypes/pypy/module/test_lib_pypy/test_array.py
Modified:
   pypy/branch/fast-ctypes/   (props changed)
   pypy/branch/fast-ctypes/lib_pypy/pypy_test/test_coroutine.py
   pypy/branch/fast-ctypes/lib_pypy/stackless.py
   pypy/branch/fast-ctypes/pypy/config/pypyoption.py
   pypy/branch/fast-ctypes/pypy/config/translationoption.py
   pypy/branch/fast-ctypes/pypy/doc/faq.txt
   pypy/branch/fast-ctypes/pypy/interpreter/astcompiler/assemble.py
   pypy/branch/fast-ctypes/pypy/interpreter/astcompiler/codegen.py
   pypy/branch/fast-ctypes/pypy/interpreter/pyopcode.py
   pypy/branch/fast-ctypes/pypy/interpreter/test/test_compiler.py
   pypy/branch/fast-ctypes/pypy/jit/backend/detect_cpu.py
   pypy/branch/fast-ctypes/pypy/jit/backend/llsupport/regalloc.py
   pypy/branch/fast-ctypes/pypy/jit/backend/llsupport/test/test_regalloc.py
   pypy/branch/fast-ctypes/pypy/jit/backend/test/runner_test.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/assembler.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/codebuf.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/jump.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/regalloc.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/runner.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/conftest.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_assembler.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_basic.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_gc_integration.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_jump.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_recompilation.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regalloc.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regalloc2.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_runner.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_symbolic_x86.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_zll_random.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_zrpy_gc.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_ztranslation.py
   pypy/branch/fast-ctypes/pypy/jit/backend/x86/tool/viewcode.py
   pypy/branch/fast-ctypes/pypy/jit/metainterp/optimizeopt.py
   pypy/branch/fast-ctypes/pypy/jit/metainterp/resoperation.py
   pypy/branch/fast-ctypes/pypy/jit/metainterp/test/test_executor.py
   pypy/branch/fast-ctypes/pypy/jit/metainterp/test/test_optimizeopt.py
   pypy/branch/fast-ctypes/pypy/jit/tl/pypyjit.py
   pypy/branch/fast-ctypes/pypy/jit/tl/pypyjit_demo.py
   pypy/branch/fast-ctypes/pypy/module/_demo/demo.py
   pypy/branch/fast-ctypes/pypy/module/_stackless/interp_coroutine.py
   pypy/branch/fast-ctypes/pypy/module/pypyjit/policy.py
   pypy/branch/fast-ctypes/pypy/module/pypyjit/test/test_pypy_c.py
   pypy/branch/fast-ctypes/pypy/module/signal/interp_signal.py
   pypy/branch/fast-ctypes/pypy/objspace/std/callmethod.py
   pypy/branch/fast-ctypes/pypy/objspace/std/itertype.py
   pypy/branch/fast-ctypes/pypy/objspace/std/model.py
   pypy/branch/fast-ctypes/pypy/objspace/std/test/test_callmethod.py
   pypy/branch/fast-ctypes/pypy/rlib/objectmodel.py
   pypy/branch/fast-ctypes/pypy/rlib/rmmap.py
   pypy/branch/fast-ctypes/pypy/rlib/test/test_objectmodel.py
   pypy/branch/fast-ctypes/pypy/rpython/lltypesystem/rstr.py
   pypy/branch/fast-ctypes/pypy/rpython/rstr.py
   pypy/branch/fast-ctypes/pypy/rpython/tool/rfficache.py
   pypy/branch/fast-ctypes/pypy/translator/c/genc.py
   pypy/branch/fast-ctypes/pypy/translator/c/node.py
   pypy/branch/fast-ctypes/pypy/translator/platform/posix.py
Log:
merge from trunk


Modified: pypy/branch/fast-ctypes/lib_pypy/pypy_test/test_coroutine.py
==============================================================================
--- pypy/branch/fast-ctypes/lib_pypy/pypy_test/test_coroutine.py	(original)
+++ pypy/branch/fast-ctypes/lib_pypy/pypy_test/test_coroutine.py	Tue Aug 10 20:07:15 2010
@@ -2,7 +2,7 @@
 from py.test import skip, raises
 
 try:
-    from ..stackless import coroutine
+    from ..stackless import coroutine, CoroutineExit
 except ImportError, e:
     skip('cannot import stackless: %s' % (e,))
 

Modified: pypy/branch/fast-ctypes/lib_pypy/stackless.py
==============================================================================
--- pypy/branch/fast-ctypes/lib_pypy/stackless.py	(original)
+++ pypy/branch/fast-ctypes/lib_pypy/stackless.py	Tue Aug 10 20:07:15 2010
@@ -14,9 +14,13 @@
 import traceback
 import sys
 try:
+    # If _stackless can be imported then TaskletExit and CoroutineExit are 
+    # automatically added to the builtins.
     from _stackless import coroutine, greenlet
 except ImportError: # we are running from CPython
-    from greenlet import greenlet
+    from greenlet import greenlet, GreenletExit
+    TaskletExit = CoroutineExit = GreenletExit
+    del GreenletExit
     try:
         from functools import partial
     except ImportError: # we are not running python 2.5
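
For context, the aliasing above gives the stackless emulation a kill
exception under both interpreters: on PyPy, importing _stackless injects
TaskletExit and CoroutineExit into the builtins as a side effect, while on
CPython they become plain names for greenlet's GreenletExit. A minimal
usage sketch against the CPython greenlet package (the worker function and
variable names are invented for illustration):

    from greenlet import greenlet, GreenletExit
    TaskletExit = CoroutineExit = GreenletExit

    def worker():
        try:
            while True:
                main.switch()
        except CoroutineExit:       # the very same class as GreenletExit
            print 'worker killed cleanly'

    main = greenlet.getcurrent()
    g = greenlet(worker)
    g.switch()                      # run worker until it switches back
    g.throw()                       # resume worker, raising GreenletExit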

Modified: pypy/branch/fast-ctypes/pypy/config/pypyoption.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/config/pypyoption.py	(original)
+++ pypy/branch/fast-ctypes/pypy/config/pypyoption.py	Tue Aug 10 20:07:15 2010
@@ -30,7 +30,7 @@
       "rctime" , "select", "zipimport", "_lsprof",
      "crypt", "signal", "_rawffi", "termios", "zlib",
      "struct", "md5", "sha", "bz2", "_minimal_curses", "cStringIO",
-     "thread", "itertools", "pyexpat", "_ssl", "cpyext"]
+     "thread", "itertools", "pyexpat", "_ssl", "cpyext", "array"]
 ))
 
 working_oo_modules = default_modules.copy()

Modified: pypy/branch/fast-ctypes/pypy/config/translationoption.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/config/translationoption.py	(original)
+++ pypy/branch/fast-ctypes/pypy/config/translationoption.py	Tue Aug 10 20:07:15 2010
@@ -342,6 +342,10 @@
     'jit':  'hybrid      extraopts     jit',
     }
 
+# For now, 64-bit JIT requires boehm
+if IS_64_BITS:
+    OPT_TABLE['jit'] = OPT_TABLE['jit'].replace('hybrid', 'boehm')
+
 def set_opt_level(config, level):
     """Apply optimization suggestions on the 'config'.
     The optimizations depend on the selected level and possibly on the backend.
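
In other words, before set_opt_level() ever consults the table, the 'jit'
entry is rewritten so that 64-bit translations select the Boehm GC in
place of the hybrid GC, which the 64-bit JIT cannot use yet. The rewrite
is a plain string substitution; restated standalone (IS_64_BITS is
hard-coded here for the demonstration, the real flag is derived from
sys.maxint):

    OPT_TABLE = {'jit': 'hybrid      extraopts     jit'}
    IS_64_BITS = True               # assumed for this sketch
    if IS_64_BITS:
        OPT_TABLE['jit'] = OPT_TABLE['jit'].replace('hybrid', 'boehm')
    assert OPT_TABLE['jit'] == 'boehm      extraopts     jit'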

Modified: pypy/branch/fast-ctypes/pypy/doc/faq.txt
==============================================================================
--- pypy/branch/fast-ctypes/pypy/doc/faq.txt	(original)
+++ pypy/branch/fast-ctypes/pypy/doc/faq.txt	Tue Aug 10 20:07:15 2010
@@ -47,8 +47,8 @@
 
 There is also an experimental support for CPython extension modules, so
 they'll run without change (from current observation, rather with little
-change) on trunk. It has not been released yet, although it should be a major
-point of the next pypy release.
+change) on trunk. It has been part of the 1.3 release, but support is still
+in the alpha phase.
 
 .. _`extension modules`: cpython_differences.html#extension-modules
 .. _`cpython_differences`: cpython_differences.html
@@ -373,7 +373,7 @@
 --------------------------------------------
 
 No.  PyPy always runs your code in its own interpreter, which is a
-full and compliant Python 2.4 interpreter.  RPython_ is only the
+full and compliant Python 2.5 interpreter.  RPython_ is only the
 language in which parts of PyPy itself are written and extension
 modules for it.  The answer to whether something needs to be written as
 an extension module, apart from the "gluing to external libraries" reason, will

Modified: pypy/branch/fast-ctypes/pypy/interpreter/astcompiler/assemble.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/interpreter/astcompiler/assemble.py	(original)
+++ pypy/branch/fast-ctypes/pypy/interpreter/astcompiler/assemble.py	Tue Aug 10 20:07:15 2010
@@ -566,22 +566,22 @@
     return (oparg % 256) + 2 * (oparg / 256)
 
 def _compute_CALL_FUNCTION(arg):
-    return _num_args(arg)
+    return -_num_args(arg)
 
 def _compute_CALL_FUNCTION_VAR(arg):
-    return _num_args(arg) - 1
+    return -_num_args(arg) - 1
 
 def _compute_CALL_FUNCTION_KW(arg):
-    return _num_args(arg) - 1
+    return -_num_args(arg) - 1
 
 def _compute_CALL_FUNCTION_VAR_KW(arg):
-    return _num_args(arg) - 2
+    return -_num_args(arg) - 2
 
 def _compute_CALL_LIKELY_BUILTIN(arg):
     return -(arg & 0xFF) + 1
 
 def _compute_CALL_METHOD(arg):
-    return -arg - 1
+    return -_num_args(arg) - 1
 
 
 _stack_effect_computers = {}
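
The sign flips above are the whole fix: the CALL_* family consumes its
operands, so the net stack effect must be negative. _num_args() decodes
the oparg carried by CPython-style call opcodes, where the low byte holds
the positional count and the high byte the keyword count, each keyword
occupying two slots (name plus value). A worked example:

    def _num_args(oparg):
        # low byte: positionals; high byte: keyword pairs (two slots each)
        return (oparg % 256) + 2 * (oparg / 256)

    # f(a, b, k=1) compiles to CALL_FUNCTION with oparg 2 | (1 << 8)
    oparg = 2 | (1 << 8)
    assert _num_args(oparg) == 4
    # the opcode pops those 4 slots plus the callable and pushes one
    # result, a net effect of -4, hence the corrected -_num_args(arg)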

Modified: pypy/branch/fast-ctypes/pypy/interpreter/astcompiler/codegen.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/interpreter/astcompiler/codegen.py	(original)
+++ pypy/branch/fast-ctypes/pypy/interpreter/astcompiler/codegen.py	Tue Aug 10 20:07:15 2010
@@ -960,9 +960,12 @@
         elif call_type == 3:
             op = ops.CALL_FUNCTION_VAR_KW
         self.emit_op_arg(op, arg)
+    
+    def _call_has_no_star_args(self, call):
+        return not call.starargs and not call.kwargs
 
     def _call_has_simple_args(self, call):
-        return not call.starargs and not call.kwargs and not call.keywords
+        return self._call_has_no_star_args(call) and not call.keywords
 
     def _optimize_builtin_call(self, call):
         if not self.space.config.objspace.opcodes.CALL_LIKELY_BUILTIN or \
@@ -988,7 +991,7 @@
 
     def _optimize_method_call(self, call):
         if not self.space.config.objspace.opcodes.CALL_METHOD or \
-                not self._call_has_simple_args(call) or \
+                not self._call_has_no_star_args(call) or \
                 not isinstance(call.func, ast.Attribute):
             return False
         attr_lookup = call.func
@@ -1000,7 +1003,12 @@
             arg_count = len(call.args)
         else:
             arg_count = 0
-        self.emit_op_arg(ops.CALL_METHOD, arg_count)
+        if call.keywords:
+            self.visit_sequence(call.keywords)
+            kwarg_count = len(call.keywords)
+        else:
+            kwarg_count = 0
+        self.emit_op_arg(ops.CALL_METHOD, (kwarg_count << 8) | arg_count)
         return True
 
     def _listcomp_generator(self, list_name, gens, gen_index, elt):
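
With this change CALL_METHOD also covers calls with keyword arguments
(star-args still take the generic CALL_FUNCTION_* path), and the keyword
count travels in the high byte of the oparg, mirroring CALL_FUNCTION's
packing. For example:

    # obj.f(1, 2, k=3): two positionals, one keyword pair
    arg_count, kwarg_count = 2, 1
    oparg = (kwarg_count << 8) | arg_count
    assert oparg == 258
    # _compute_CALL_METHOD in assemble.py then reports
    # -_num_args(258) - 1 == -5: four argument slots plus the extra
    # value pushed by LOOKUP_METHOD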

Modified: pypy/branch/fast-ctypes/pypy/interpreter/pyopcode.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/interpreter/pyopcode.py	(original)
+++ pypy/branch/fast-ctypes/pypy/interpreter/pyopcode.py	Tue Aug 10 20:07:15 2010
@@ -988,23 +988,9 @@
             self.dropvalues(nargs)
         self.pushvalue(w_result)
 
-    def LOOKUP_METHOD(self, nameindex, next_instr):
-        # overridden by faster version in the standard object space.
-        space = self.space
-        w_obj = self.popvalue()
-        w_name = self.getname_w(nameindex)
-        w_value = space.getattr(w_obj, w_name)
-        self.pushvalue(w_value)
-
-    def CALL_METHOD(self, nargs, next_instr):
-        # overridden by faster version in the standard object space.
-        # 'nargs' is the argument count excluding the implicit 'self'
-        w_callable = self.peekvalue(nargs)
-        try:
-            w_result = self.space.call_valuestack(w_callable, nargs, self)
-        finally:
-            self.dropvalues(nargs + 1)
-        self.pushvalue(w_result)
+    # overridden by faster version in the standard object space.
+    LOOKUP_METHOD = LOAD_ATTR
+    CALL_METHOD = CALL_FUNCTION
 
     def MISSING_OPCODE(self, oparg, next_instr):
         ofs = self.last_instr
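
Because the oparg encodings now line up, the portable fallbacks can simply
reuse the existing opcode implementations as class attributes; the
standard object space keeps substituting its fast versions by ordinary
method overriding. The same pattern in miniature (class names invented for
illustration):

    class BaseFrame(object):
        def LOAD_ATTR(self, oparg, next_instr):
            # generic attribute lookup, adequate for LOOKUP_METHOD too
            pass
        LOOKUP_METHOD = LOAD_ATTR    # a plain alias, no wrapper needed

    class StdObjSpaceFrame(BaseFrame):
        def LOOKUP_METHOD(self, oparg, next_instr):
            # fast path that avoids building a bound-method object
            pass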

Modified: pypy/branch/fast-ctypes/pypy/interpreter/test/test_compiler.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/interpreter/test/test_compiler.py	(original)
+++ pypy/branch/fast-ctypes/pypy/interpreter/test/test_compiler.py	Tue Aug 10 20:07:15 2010
@@ -4,6 +4,7 @@
 from pypy.interpreter.pycode import PyCode
 from pypy.interpreter.error import OperationError
 from pypy.interpreter.argument import Arguments
+from pypy.conftest import gettestobjspace
 
 class BaseTestCompiler:
     def setup_method(self, method):
@@ -848,14 +849,38 @@
         
         import StringIO, sys, dis
         s = StringIO.StringIO()
+        out = sys.stdout
         sys.stdout = s
         try:
             dis.dis(code)
         finally:
-            sys.stdout = sys.__stdout__
+            sys.stdout = out
         output = s.getvalue()
         assert "LOAD_GLOBAL" not in output
 
+class AppTestCallMethod(object):
+    def setup_class(cls):
+        cls.space = gettestobjspace(**{'objspace.opcodes.CALL_METHOD': True})
+        
+    def test_call_method_kwargs(self):
+        source = """def _f(a):
+            return a.f(a=a)
+        """
+        exec source
+        code = _f.func_code
+        
+        import StringIO, sys, dis
+        s = StringIO.StringIO()
+        out = sys.stdout
+        sys.stdout = s
+        try:
+            dis.dis(code)
+        finally:
+            sys.stdout = out
+        output = s.getvalue()
+        assert "CALL_METHOD" in output
+            
+
 class AppTestExceptions:
     def test_indentation_error(self):
         source = """if 1:

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/detect_cpu.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/detect_cpu.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/detect_cpu.py	Tue Aug 10 20:07:15 2010
@@ -56,6 +56,8 @@
         return "pypy.jit.backend.x86.runner", "CPU"
     elif backend_name == 'x86-without-sse2':
         return "pypy.jit.backend.x86.runner", "CPU386_NO_SSE2"
+    elif backend_name == 'x86_64':
+        return "pypy.jit.backend.x86.runner", "CPU_X86_64"
     elif backend_name == 'cli':
         return "pypy.jit.backend.cli.runner", "CliCPU"
     elif backend_name == 'llvm':

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/llsupport/regalloc.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/llsupport/regalloc.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/llsupport/regalloc.py	Tue Aug 10 20:07:15 2010
@@ -22,18 +22,19 @@
     def get(self, box):
         return self.frame_bindings.get(box, None)
 
-    def loc(self, box, size):
+    def loc(self, box):
         res = self.get(box)
         if res is not None:
             return res
-        newloc = self.frame_pos(self.frame_depth, size)
+        newloc = self.frame_pos(self.frame_depth, box.type)
         self.frame_bindings[box] = newloc
-        self.frame_depth += size
+        # Objects returned by frame_pos must support frame_size()
+        self.frame_depth += newloc.frame_size()
         return newloc
 
     # abstract methods that need to be overwritten for specific assemblers
     @staticmethod
-    def frame_pos(loc, size):
+    def frame_pos(loc, type):
         raise NotImplementedError("Purely abstract")
 
 class RegisterManager(object):
@@ -43,7 +44,6 @@
     all_regs              = []
     no_lower_byte_regs    = []
     save_around_call_regs = []
-    reg_width             = 1 # in terms of stack space eaten
     
     def __init__(self, longevity, frame_manager=None, assembler=None):
         self.free_regs = self.all_regs[:]
@@ -148,7 +148,7 @@
         loc = self.reg_bindings[v_to_spill]
         del self.reg_bindings[v_to_spill]
         if self.frame_manager.get(v_to_spill) is None:
-            newloc = self.frame_manager.loc(v_to_spill, self.reg_width)
+            newloc = self.frame_manager.loc(v_to_spill)
             self.assembler.regalloc_mov(loc, newloc)
         return loc
 
@@ -204,7 +204,7 @@
         try:
             return self.reg_bindings[box]
         except KeyError:
-            return self.frame_manager.loc(box, self.reg_width)
+            return self.frame_manager.loc(box)
 
     def return_constant(self, v, forbidden_vars=[], selected_reg=None,
                         imm_fine=True):
@@ -260,7 +260,7 @@
             self.reg_bindings[v] = loc
             self.assembler.regalloc_mov(prev_loc, loc)
         else:
-            loc = self.frame_manager.loc(v, self.reg_width)
+            loc = self.frame_manager.loc(v)
             self.assembler.regalloc_mov(prev_loc, loc)
 
     def force_result_in_reg(self, result_v, v, forbidden_vars=[]):
@@ -280,7 +280,7 @@
             self.free_regs = [reg for reg in self.free_regs if reg is not loc]
             return loc
         if v not in self.reg_bindings:
-            prev_loc = self.frame_manager.loc(v, self.reg_width)
+            prev_loc = self.frame_manager.loc(v)
             loc = self.force_allocate_reg(v, forbidden_vars)
             self.assembler.regalloc_mov(prev_loc, loc)
         assert v in self.reg_bindings
@@ -300,7 +300,7 @@
     def _sync_var(self, v):
         if not self.frame_manager.get(v):
             reg = self.reg_bindings[v]
-            to = self.frame_manager.loc(v, self.reg_width)
+            to = self.frame_manager.loc(v)
             self.assembler.regalloc_mov(reg, to)
         # otherwise it's clean
 

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/llsupport/test/test_regalloc.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/llsupport/test/test_regalloc.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/llsupport/test/test_regalloc.py	Tue Aug 10 20:07:15 2010
@@ -1,5 +1,5 @@
 
-from pypy.jit.metainterp.history import BoxInt, ConstInt, BoxFloat
+from pypy.jit.metainterp.history import BoxInt, ConstInt, BoxFloat, INT, FLOAT
 from pypy.jit.backend.llsupport.regalloc import FrameManager
 from pypy.jit.backend.llsupport.regalloc import RegisterManager as BaseRegMan
 
@@ -26,9 +26,20 @@
     def convert_to_imm(self, v):
         return v
 
+class FakeFramePos(object):
+    def __init__(self, pos, box_type):
+        self.pos = pos
+        self.box_type = box_type
+
+    def frame_size(self):
+        if self.box_type == FLOAT:
+            return 2
+        else:
+            return 1
+
 class TFrameManager(FrameManager):
-    def frame_pos(self, i, size):
-        return i
+    def frame_pos(self, i, box_type):
+        return FakeFramePos(i, box_type)
 
 class MockAsm(object):
     def __init__(self):
@@ -146,8 +157,8 @@
         rm.next_instruction()
         # allocate a stack position
         b0, b1, b2, b3, b4 = boxes
-        sp = fm.loc(b0, 1)
-        assert sp == 0
+        sp = fm.loc(b0)
+        assert sp.pos == 0
         loc = rm.make_sure_var_in_reg(b0)
         assert isinstance(loc, FakeReg)
         rm._check_invariants()
@@ -207,13 +218,13 @@
         asm = MockAsm()
         rm = RegisterManager(longevity, frame_manager=fm, assembler=asm)
         rm.next_instruction()
-        fm.loc(b0, 1)
+        fm.loc(b0)
         rm.force_result_in_reg(b1, b0)
         rm._check_invariants()
         loc = rm.loc(b1)
         assert isinstance(loc, FakeReg)
         loc = rm.loc(b0)
-        assert isinstance(loc, int)
+        assert isinstance(loc, FakeFramePos)
         assert len(asm.moves) == 1
 
     def test_return_constant(self):
@@ -304,7 +315,7 @@
 
     def test_different_frame_width(self):
         class XRegisterManager(RegisterManager):
-            reg_width = 2
+            pass
 
         fm = TFrameManager()
         b0 = BoxInt()

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/test/runner_test.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/test/runner_test.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/test/runner_test.py	Tue Aug 10 20:07:15 2010
@@ -461,6 +461,25 @@
                                          [funcbox] + args,
                                          'float', descr=calldescr)
             assert abs(res.value - 4.6) < 0.0001
+
+    def test_call_many_arguments(self):
+        # Test calling a function with a large number of arguments (more than
+        # 6, which will force passing some arguments on the stack on 64-bit)
+
+        def func(*args):
+            assert len(args) == 16
+            # Try to sum up args in a way that would probably detect a
+            # transposed argument
+            return sum(arg * (2**i) for i, arg in enumerate(args))
+
+        FUNC = self.FuncType([lltype.Signed]*16, lltype.Signed)
+        FPTR = self.Ptr(FUNC)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        func_ptr = llhelper(FPTR, func)
+        args = range(16)
+        funcbox = self.get_funcbox(self.cpu, func_ptr)
+        res = self.execute_operation(rop.CALL, [funcbox] + map(BoxInt, args), 'int', descr=calldescr)
+        assert res.value == func(*args)
         
     def test_call_stack_alignment(self):
         # test stack alignment issues, notably for Mac OS/X.
@@ -638,6 +657,21 @@
         assert r.value == 1
 
     def test_array_basic(self):
+        a_box, A = self.alloc_array_of(rffi.SHORT, 342)
+        arraydescr = self.cpu.arraydescrof(A)
+        assert not arraydescr.is_array_of_pointers()
+        #
+        r = self.execute_operation(rop.ARRAYLEN_GC, [a_box],
+                                   'int', descr=arraydescr)
+        assert r.value == 342
+        r = self.execute_operation(rop.SETARRAYITEM_GC, [a_box, BoxInt(310),
+                                                         BoxInt(744)],
+                                   'void', descr=arraydescr)
+        assert r is None
+        r = self.execute_operation(rop.GETARRAYITEM_GC, [a_box, BoxInt(310)],
+                                   'int', descr=arraydescr)
+        assert r.value == 744
+
         a_box, A = self.alloc_array_of(lltype.Signed, 342)
         arraydescr = self.cpu.arraydescrof(A)
         assert not arraydescr.is_array_of_pointers()
@@ -978,6 +1012,8 @@
             else:
                 assert 0
             operations.append(ResOperation(opnum, boxargs, boxres))
+        # Unique-ify inputargs
+        inputargs = list(set(inputargs))
         faildescr = BasicFailDescr(1)
         operations.append(ResOperation(rop.FINISH, [], None,
                                        descr=faildescr))
@@ -1050,9 +1086,11 @@
                                          descr=BasicFailDescr(5))]
                         operations[1].fail_args = []
                         looptoken = LoopToken()
-                        self.cpu.compile_loop(list(testcase), operations,
+                        # Use "set" to unique-ify inputargs
+                        unique_testcase_list = list(set(testcase))
+                        self.cpu.compile_loop(unique_testcase_list, operations,
                                               looptoken)
-                        for i, box in enumerate(testcase):
+                        for i, box in enumerate(unique_testcase_list):
                             self.cpu.set_future_value_float(i, box.value)
                         fail = self.cpu.execute_token(looptoken)
                         if fail.identifier != 5 - (expected_id^expected):
@@ -1695,7 +1733,7 @@
     def test_assembler_call(self):
         called = []
         def assembler_helper(failindex, virtualizable):
-            assert self.cpu.get_latest_value_int(0) == 10
+            assert self.cpu.get_latest_value_int(0) == 97
             called.append(failindex)
             return 4 + 9
 
@@ -1708,33 +1746,41 @@
                 _assembler_helper_ptr)
 
         ops = '''
-        [i0, i1]
-        i2 = int_add(i0, i1)
-        finish(i2)'''
+        [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]
+        i10 = int_add(i0, i1)
+        i11 = int_add(i10, i2)
+        i12 = int_add(i11, i3)
+        i13 = int_add(i12, i4)
+        i14 = int_add(i13, i5)
+        i15 = int_add(i14, i6)
+        i16 = int_add(i15, i7)
+        i17 = int_add(i16, i8)
+        i18 = int_add(i17, i9)
+        finish(i18)'''
         loop = parse(ops)
         looptoken = LoopToken()
         looptoken.outermost_jitdriver_sd = FakeJitDriverSD()
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
-        ARGS = [lltype.Signed, lltype.Signed]
+        ARGS = [lltype.Signed] * 10
         RES = lltype.Signed
         self.cpu.portal_calldescr = self.cpu.calldescrof(
             lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
-        self.cpu.set_future_value_int(0, 1)
-        self.cpu.set_future_value_int(1, 2)
+        for i in range(10):
+            self.cpu.set_future_value_int(i, i+1)
         res = self.cpu.execute_token(looptoken)
-        assert self.cpu.get_latest_value_int(0) == 3
+        assert self.cpu.get_latest_value_int(0) == 55
         ops = '''
-        [i4, i5]
-        i6 = int_add(i4, 1)
-        i3 = call_assembler(i6, i5, descr=looptoken)
+        [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]
+        i10 = int_add(i0, 42)
+        i11 = call_assembler(i10, i1, i2, i3, i4, i5, i6, i7, i8, i9, descr=looptoken)
         guard_not_forced()[]
-        finish(i3)
+        finish(i11)
         '''
         loop = parse(ops, namespace=locals())
         othertoken = LoopToken()
         self.cpu.compile_loop(loop.inputargs, loop.operations, othertoken)
-        self.cpu.set_future_value_int(0, 4)
-        self.cpu.set_future_value_int(1, 5)
+        for i in range(10):
+            self.cpu.set_future_value_int(i, i+1)
         res = self.cpu.execute_token(othertoken)
         assert self.cpu.get_latest_value_int(0) == 13
         assert called
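
A note on why test_call_many_arguments weights each argument by a distinct
power of two: if the backend delivers argument i where argument j belongs,
the checksum changes by (a_i - a_j) * (2**j - 2**i), which is nonzero
whenever the two values differ, so any transposition is caught.
Concretely:

    args = range(16)
    good = sum(arg * (2**i) for i, arg in enumerate(args))
    swapped = args[:]
    swapped[3], swapped[11] = swapped[11], swapped[3]
    bad = sum(arg * (2**i) for i, arg in enumerate(swapped))
    assert good != bad      # the transposed pair shows up in the sum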

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/assembler.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/assembler.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/assembler.py	Tue Aug 10 20:07:15 2010
@@ -7,24 +7,35 @@
 from pypy.rpython.lltypesystem.lloperation import llop
 from pypy.rpython.annlowlevel import llhelper
 from pypy.tool.uid import fixid
-from pypy.jit.backend.x86.regalloc import RegAlloc, WORD,\
-     X86RegisterManager, X86XMMRegisterManager, get_ebp_ofs, FRAME_FIXED_SIZE,\
-     FORCE_INDEX_OFS
+from pypy.jit.backend.x86.regalloc import RegAlloc, \
+     X86RegisterManager, X86XMMRegisterManager, get_ebp_ofs
+
+from pypy.jit.backend.x86.arch import FRAME_FIXED_SIZE, FORCE_INDEX_OFS, WORD, IS_X86_32, IS_X86_64
+
+from pypy.jit.backend.x86.regloc import (eax, ecx, edx, ebx,
+                                         esp, ebp, esi, edi,
+                                         xmm0, xmm1, xmm2, xmm3,
+                                         xmm4, xmm5, xmm6, xmm7,
+                                         r8, r9, r10, r11,
+                                         r12, r13, r14, r15,
+                                         X86_64_SCRATCH_REG,
+                                         X86_64_XMM_SCRATCH_REG,
+                                         RegLoc, StackLoc, ConstFloatLoc,
+                                         ImmedLoc, AddressLoc, imm)
+
 from pypy.rlib.objectmodel import we_are_translated, specialize
-from pypy.jit.backend.x86 import codebuf
-from pypy.jit.backend.x86.ri386 import *
-from pypy.jit.metainterp.resoperation import rop
+from pypy.jit.backend.x86 import rx86, regloc, codebuf
+from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.backend.x86.support import values_array
 from pypy.rlib.debug import debug_print
 from pypy.rlib import rgc
+from pypy.jit.backend.x86.jump import remap_frame_layout
 from pypy.rlib.streamio import open_file_as_stream
-
-# our calling convention - we pass first 6 args in registers
-# and the rest stays on the stack
+from pypy.jit.metainterp.history import ConstInt, BoxInt
 
 # darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
 # better safe than sorry
-CALL_ALIGN = 4
+CALL_ALIGN = 16 // WORD
 
 def align_stack_words(words):
     return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
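
The new definition keeps CALL_ALIGN in words while the requirement stays
16 bytes: 16 // WORD gives 4 words on x86-32 and 2 on x86-64, and
align_stack_words() rounds up with the usual power-of-two mask. For
instance (signature widened to a parameter just for the demonstration):

    def align_stack_words(words, call_align):
        # call_align is a power of two, so the mask rounds up cheaply
        return (words + call_align - 1) & ~(call_align - 1)

    assert align_stack_words(5, 16 // 4) == 8    # 32-bit, WORD == 4
    assert align_stack_words(5, 16 // 8) == 6    # 64-bit, WORD == 8
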
@@ -32,16 +43,36 @@
 class MachineCodeBlockWrapper(object):
     MC_DEFAULT_SIZE = 1024*1024
 
-    def __init__(self, bigsize, profile_agent=None):
+    def __init__(self, assembler, bigsize, profile_agent=None):
+        self.assembler = assembler
         self.old_mcs = [] # keepalive
         self.bigsize = bigsize
         self._mc = self._instantiate_mc()
         self.function_name = None
         self.profile_agent = profile_agent
+        self.reset_reserved_bytes()
 
     def _instantiate_mc(self): # hook for testing
         return codebuf.MachineCodeBlock(self.bigsize)
 
+    def ensure_bytes_available(self, num_bytes):
+        if self.bytes_free() <= (self._reserved_bytes + num_bytes):
+            self.make_new_mc()
+
+    def reserve_bytes(self, num_bytes):
+        self.ensure_bytes_available(num_bytes)
+        self._reserved_bytes += num_bytes
+
+    def reset_reserved_bytes(self):
+        # XXX er.... pretty random number, just to be sure
+        #     not to write half-instruction
+        self._reserved_bytes = 64
+
+    def get_relative_pos(self):
+        return self._mc.get_relative_pos()
+
+    def overwrite(self, pos, listofchars):
+        return self._mc.overwrite(pos, listofchars)
 
     def bytes_free(self):
         return self._mc._size - self._mc.get_relative_pos()
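
The reserve/ensure pair above supports the new deferred guard handling:
while assembling, each guard merely reserves room for its future recovery
stub, and write_pending_failure_recoveries() (further down) writes the
stubs into the same block afterwards, so a block switch can never split a
stub in half. Schematically, per guard (reserve_bytes() and
recovery_stub_size() are real, the caller is a sketch):

    def emit_guard(mc, guard_token):
        # make sure the eventual stub fits in this very code block
        mc.reserve_bytes(guard_token.recovery_stub_size())
        # ... emit the guard's conditional jump; its 4-byte target field
        # is patched once the recovery stub is actually written ...
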
@@ -63,12 +94,25 @@
     def make_new_mc(self):
         new_mc = self._instantiate_mc()
         debug_print('[new machine code block at', new_mc.tell(), ']')
-        self._mc.JMP(rel32(new_mc.tell()))
+
+        if IS_X86_64:
+            # The scratch register is sometimes used as a temporary
+            # register, but the JMP below might clobber it. Rather than risk
+            # subtle bugs, we preserve the scratch register across the jump.
+            self._mc.PUSH_r(X86_64_SCRATCH_REG.value)
+            
+        self._mc.JMP(imm(new_mc.tell()))
+
+        if IS_X86_64:
+            # Restore scratch reg
+            new_mc.POP_r(X86_64_SCRATCH_REG.value)
 
         if self.function_name is not None:
             self.end_function(done=False)
             self.start_pos = new_mc.get_relative_pos()
 
+        self.assembler.write_pending_failure_recoveries()
+
         self._mc.done()
         self.old_mcs.append(self._mc)
         self._mc = new_mc
@@ -82,21 +126,35 @@
 
 def _new_method(name):
     def method(self, *args):
-        # XXX er.... pretty random number, just to be sure
-        #     not to write half-instruction
-        if self.bytes_free() < 64:
+        if self.bytes_free() < self._reserved_bytes:
             self.make_new_mc()
         getattr(self._mc, name)(*args)    
     method.func_name = name
     return method
 
+for _name in rx86.all_instructions + regloc.all_extra_instructions:
+    setattr(MachineCodeBlockWrapper, _name, _new_method(_name))
+
 for name in dir(codebuf.MachineCodeBlock):
     if name.upper() == name or name == "writechr":
         setattr(MachineCodeBlockWrapper, name, _new_method(name))
 
+class GuardToken(object):
+    def __init__(self, faildescr, failargs, fail_locs, exc, desc_bytes):
+        self.faildescr = faildescr
+        self.failargs = failargs
+        self.fail_locs = fail_locs
+        self.exc = exc
+        self.desc_bytes = desc_bytes
+
+    def recovery_stub_size(self):
+        # XXX: 32 is pulled out of the air
+        return 32 + len(self.desc_bytes)
+
+DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed))
+
 class Assembler386(object):
     mc = None
-    mc2 = None
     mc_size = MachineCodeBlockWrapper.MC_DEFAULT_SIZE
     _float_constants = None
     _regalloc = None
@@ -115,16 +173,16 @@
         self.fail_boxes_ptr = values_array(llmemory.GCREF, failargs_limit)
         self.fail_boxes_float = values_array(lltype.Float, failargs_limit)
         self.fail_ebp = 0
-        self.loop_run_counter = values_array(lltype.Signed, 10000)
-        self.loop_names = []
+        self.loop_run_counters = []
         # if we have 10000 loops, we have some other problems I guess
-        self.loc_float_const_neg = None
-        self.loc_float_const_abs = None
+        self.float_const_neg_addr = 0
+        self.float_const_abs_addr = 0
         self.malloc_fixedsize_slowpath1 = 0
         self.malloc_fixedsize_slowpath2 = 0
+        self.pending_guard_tokens = None
         self.setup_failure_recovery()
-        self._loop_counter = 0
         self._debug = False
+        self.debug_counter_descr = cpu.fielddescrof(DEBUG_COUNTER, 'i')
 
     def leave_jitted_hook(self):
         ptrs = self.fail_boxes_ptr.ar
@@ -134,7 +192,7 @@
     def set_debug(self, v):
         self._debug = v
 
-    def make_sure_mc_exists(self):
+    def setup(self):
         if self.mc is None:
             # the address of the function called by 'new'
             gc_ll_descr = self.cpu.gc_ll_descr
@@ -153,11 +211,7 @@
                 ll_new_unicode = gc_ll_descr.get_funcptr_for_newunicode()
                 self.malloc_unicode_func_addr = rffi.cast(lltype.Signed,
                                                           ll_new_unicode)
-            # done
-            # we generate the loop body in 'mc'
-            # 'mc2' is for guard recovery code
-            self.mc = MachineCodeBlockWrapper(self.mc_size, self.cpu.profile_agent)
-            self.mc2 = MachineCodeBlockWrapper(self.mc_size)
+            self.mc = MachineCodeBlockWrapper(self, self.mc_size, self.cpu.profile_agent)
             self._build_failure_recovery(False)
             self._build_failure_recovery(True)
             if self.cpu.supports_floats:
@@ -173,56 +227,66 @@
                     s = s.split(':')[-1]
                 self.set_debug(True)
                 self._output_loop_log = s + ".count"
+            # Initialize here instead of __init__ to prevent
+            # pending_guard_tokens from being considered a prebuilt object,
+            # which sometimes causes memory leaks since the prebuilt list is
+            # still considered a GC root after we re-assign
+            # pending_guard_tokens in write_pending_failure_recoveries
+            self.pending_guard_tokens = []
 
     def finish_once(self):
         if self._debug:
             output_log = self._output_loop_log
             assert output_log is not None
             f = open_file_as_stream(output_log, "w")
-            for i in range(self._loop_counter):
-                f.write(self.loop_names[i] + ":" +
-                        str(self.loop_run_counter.getitem(i)) + "\n")
+            for i in range(len(self.loop_run_counters)):
+                name, struct = self.loop_run_counters[i]
+                f.write(name + ":" + str(struct.i) + "\n")
             f.close()
 
     def _build_float_constants(self):
-        # 11 words: 8 words for the data, and up to 3 words for alignment
-        addr = lltype.malloc(rffi.CArray(lltype.Signed), 11, flavor='raw')
+        # 44 bytes: 32 bytes for the data, and up to 12 bytes for alignment
+        addr = lltype.malloc(rffi.CArray(lltype.Char), 44, flavor='raw')
         if not we_are_translated():
             self._keepalive_malloced_float_consts = addr
         float_constants = rffi.cast(lltype.Signed, addr)
         float_constants = (float_constants + 15) & ~15    # align to 16 bytes
-        addr = rffi.cast(rffi.CArrayPtr(lltype.Signed), float_constants)
-        addr[0] = 0                # \
-        addr[1] = -2147483648      # / for neg
-        addr[2] = 0                #
-        addr[3] = 0                #
-        addr[4] = -1               # \
-        addr[5] = 2147483647       # / for abs
-        addr[6] = 0                #
-        addr[7] = 0                #
-        self.loc_float_const_neg = heap64(float_constants)
-        self.loc_float_const_abs = heap64(float_constants + 16)
+        addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
+        qword_padding = '\x00\x00\x00\x00\x00\x00\x00\x00'
+        # 0x8000000000000000
+        neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80'
+        # 0x7FFFFFFFFFFFFFFF
+        abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
+        data = neg_const + qword_padding + abs_const + qword_padding
+        for i in range(len(data)):
+            addr[i] = data[i]
+        self.float_const_neg_addr = float_constants
+        self.float_const_abs_addr = float_constants + 16
 
     def _build_malloc_fixedsize_slowpath(self):
-        mc = self.mc2._mc
         # ---------- first helper for the slow path of malloc ----------
-        self.malloc_fixedsize_slowpath1 = mc.tell()
+        self.malloc_fixedsize_slowpath1 = self.mc.tell()
         if self.cpu.supports_floats:          # save the XMM registers in
-            for i in range(8):                # the *caller* frame, from esp+8
-                mc.MOVSD(mem64(esp, 8+8*i), xmm_registers[i])
-        mc.SUB(edx, eax)                      # compute the size we want
-        mc.MOV(mem(esp, 4), edx)              # save it as the new argument
+            for i in range(self.cpu.NUM_REGS):# the *caller* frame, from esp+8
+                self.mc.MOVSD_sx((WORD*2)+8*i, i)
+        self.mc.SUB_rr(edx.value, eax.value)       # compute the size we want
+        if IS_X86_32:
+            self.mc.MOV_sr(WORD, edx.value)        # save it as the new argument
+        elif IS_X86_64:
+            # FIXME: We can't just clobber rdi like this, can we?
+            self.mc.MOV_rr(edi.value, edx.value)
+
         addr = self.cpu.gc_ll_descr.get_malloc_fixedsize_slowpath_addr()
-        mc.JMP(rel32(addr))                   # tail call to the real malloc
+        self.mc.JMP(imm(addr))                    # tail call to the real malloc
         # ---------- second helper for the slow path of malloc ----------
-        self.malloc_fixedsize_slowpath2 = mc.tell()
+        self.malloc_fixedsize_slowpath2 = self.mc.tell()
         if self.cpu.supports_floats:          # restore the XMM registers
-            for i in range(8):                # from where they were saved
-                mc.MOVSD(xmm_registers[i], mem64(esp, 8+8*i))
+            for i in range(self.cpu.NUM_REGS):# from where they were saved
+                self.mc.MOVSD_xs(i, (WORD*2)+8*i)
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
-        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
-        mc.RET()
-        self.mc2.done()
+        self.mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
+        self.mc.RET()
+        self.mc.done()
 
     def assemble_loop(self, inputargs, operations, looptoken):
         """adds the following attributes to looptoken:
@@ -233,15 +297,18 @@
                _x86_param_depth
                _x86_arglocs
         """
-        self.make_sure_mc_exists()
+        if not we_are_translated():
+            # Arguments should be unique
+            assert len(set(inputargs)) == len(inputargs)
+
+        self.setup()
         funcname = self._find_debug_merge_point(operations)
 
+        
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
+        operations = self._inject_debugging_code(operations)
         arglocs = regalloc.prepare_loop(inputargs, operations, looptoken)
         looptoken._x86_arglocs = arglocs
-        needed_mem = len(arglocs[0]) * 16 + 16
-        if needed_mem >= self.mc.bytes_free():
-            self.mc.make_new_mc()
 
         # profile support
         name = "Loop # %s: %s" % (looptoken.number, funcname)
@@ -256,21 +323,21 @@
         self._patch_stackadjust(adr_stackadjust, frame_depth+param_depth)
         looptoken._x86_frame_depth = frame_depth
         looptoken._x86_param_depth = param_depth
-        # we need to make sure here that we don't overload an mc badly.
-        # a safe estimate is that we need at most 16 bytes per arg
-        needed_mem = len(arglocs[0]) * 16 + 16
-        if needed_mem >= self.mc.bytes_free():
-            self.mc.make_new_mc()
+
         looptoken._x86_direct_bootstrap_code = self.mc.tell()
         self._assemble_bootstrap_direct_call(arglocs, curadr,
                                              frame_depth+param_depth)
         debug_print("Loop #", looptoken.number, "has address",
                     looptoken._x86_loop_code, "to", self.mc.tell())
         self.mc.end_function()
+        self.write_pending_failure_recoveries()
         
-
     def assemble_bridge(self, faildescr, inputargs, operations):
-        self.make_sure_mc_exists()
+        if not we_are_translated():
+            # Arguments should be unique
+            assert len(set(inputargs)) == len(inputargs)
+
+        self.setup()
         funcname = self._find_debug_merge_point(operations)
 
         arglocs = self.rebuild_faillocs_from_descr(
@@ -279,6 +346,7 @@
             assert ([loc.assembler() for loc in arglocs] ==
                     [loc.assembler() for loc in faildescr._x86_debug_faillocs])
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
+        operations = self._inject_debugging_code(operations)
         fail_depths = faildescr._x86_current_depths
         regalloc.prepare_bridge(fail_depths, inputargs, arglocs,
                                 operations)
@@ -302,6 +370,19 @@
                     descr_number,
                     "has address", adr_bridge, "to", self.mc.tell())
         self.mc.end_function()
+        self.write_pending_failure_recoveries()
+
+    def write_pending_failure_recoveries(self):
+        for tok in self.pending_guard_tokens:
+            # Okay to write to _mc because we've already made sure that
+            # there's enough space by "reserving" bytes.
+            addr = self.generate_quick_failure(self.mc._mc, tok.faildescr, tok.failargs, tok.fail_locs, tok.exc, tok.desc_bytes)
+            tok.faildescr._x86_adr_recovery_stub = addr
+            self.patch_jump_for_descr(tok.faildescr, addr)
+
+        self.pending_guard_tokens = []
+        self.mc.reset_reserved_bytes()
+        self.mc.done()
 
     def _find_debug_merge_point(self, operations):
 
@@ -310,34 +391,60 @@
                 funcname = op.args[0]._get_str()
                 break
         else:
-            funcname = "<loop %d>" % self._loop_counter
+            funcname = "<loop %d>" % len(self.loop_run_counters)
         # invent the counter, so we don't get too confused
         if self._debug:
-            self.loop_names.append(funcname)
-            self._loop_counter += 1
+            struct = lltype.malloc(DEBUG_COUNTER, flavor='raw')
+            struct.i = 0
+            self.loop_run_counters.append((funcname, struct))
         return funcname
         
     def patch_jump_for_descr(self, faildescr, adr_new_target):
         adr_jump_offset = faildescr._x86_adr_jump_offset
-        mc = codebuf.InMemoryCodeBuilder(adr_jump_offset, adr_jump_offset + 4)
-        mc.write(packimm32(adr_new_target - adr_jump_offset - 4))
+        adr_recovery_stub = faildescr._x86_adr_recovery_stub
+        offset = adr_new_target - (adr_jump_offset + 4)
+        # If the new target fits within a rel32 of the jump, just patch
+        # that. Otherwise, leave the original rel32 to the recovery stub in
+        # place, but clobber the recovery stub with a jump to the real
+        # target.
+        if rx86.fits_in_32bits(offset):
+            mc = codebuf.InMemoryCodeBuilder(adr_jump_offset, adr_jump_offset + 4)
+            mc.writeimm32(offset)
+        else:
+            # "mov r11, addr; jmp r11" is 13 bytes
+            mc = codebuf.InMemoryCodeBuilder(adr_recovery_stub, adr_recovery_stub + 13)
+            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
+            mc.JMP_r(X86_64_SCRATCH_REG.value)
+
         mc.valgrind_invalidated()
         mc.done()
 
-    def _assemble(self, regalloc, operations):
-        self._regalloc = regalloc
+    def _inject_debugging_code(self, operations):
         if self._debug:
             # before doing anything, let's increase a counter
-            # we need one register free (a bit of a hack, but whatever)
-            self.mc.PUSH(eax)
-            adr = self.loop_run_counter.get_addr_for_num(self._loop_counter - 1)
-            self.mc.MOV(eax, heap(adr))
-            self.mc.ADD(eax, imm(1))
-            self.mc.MOV(heap(adr), eax)
-            self.mc.POP(eax)
+            c_adr = ConstInt(rffi.cast(lltype.Signed,
+                                     self.loop_run_counters[-1][1]))
+            box = BoxInt()
+            box2 = BoxInt()
+            ops = [ResOperation(rop.GETFIELD_RAW, [c_adr],
+                                box, descr=self.debug_counter_descr),
+                   ResOperation(rop.INT_ADD, [box, ConstInt(1)], box2),
+                   ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
+                                None, descr=self.debug_counter_descr)]
+            operations = ops + operations
+            # # we need one register free (a bit of a hack, but whatever)
+            # self.mc.PUSH(eax)
+            # adr = rffi.cast(lltype.Signed, self.loop_run_counters[-1][1])
+            # self.mc.MOV(eax, heap(adr))
+            # self.mc.ADD(eax, imm(1))
+            # self.mc.MOV(heap(adr), eax)
+            # self.mc.POP(eax)
+        return operations
+
+    def _assemble(self, regalloc, operations):
+        self._regalloc = regalloc
         regalloc.walk_operations(operations)        
         self.mc.done()
-        self.mc2.done()
         if we_are_translated() or self.cpu.dont_keepalive_stuff:
             self._regalloc = None   # else keep it around for debugging
         frame_depth = regalloc.fm.frame_depth
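
patch_jump_for_descr, just above, grows a 64-bit-safe path: a bridge may
be compiled anywhere in memory, and when it lands farther than a signed
32-bit displacement from the guard's jump, the original rel32 keeps
pointing at the recovery stub and the stub itself is overwritten with a
"mov r11, addr; jmp r11" trampoline. The distance test is simply:

    def fits_in_32bits(value):
        # the same range check rx86 applies to rel32 displacements
        return -2147483648 <= value <= 2147483647

    # where the offset is measured from the end of the 4-byte field:
    #     offset = adr_new_target - (adr_jump_offset + 4)
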
@@ -352,7 +459,7 @@
 
     def _patchable_stackadjust(self):
         # stack adjustment LEA
-        self.mc.LEA(esp, fixedsize_ebp_ofs(0))
+        self.mc.LEA32_rb(esp.value, 0)
         return self.mc.tell() - 4
 
     def _patch_stackadjust(self, adr_lea, reserved_depth):
@@ -361,23 +468,34 @@
         # Compute the correct offset for the instruction LEA ESP, [EBP-4*words].
         # Given that [EBP] is where we saved EBP, i.e. in the last word
         # of our fixed frame, then the 'words' value is:
-        words = (FRAME_FIXED_SIZE - 1) + reserved_depth
+        words = (self.cpu.FRAME_FIXED_SIZE - 1) + reserved_depth
         # align, e.g. for Mac OS X        
         aligned_words = align_stack_words(words+2)-2 # 2 = EIP+EBP
-        mc.write(packimm32(-WORD * aligned_words))
+        mc.writeimm32(-WORD * aligned_words)
         mc.done()
 
     def _call_header(self):
-        self.mc.PUSH(ebp)
-        self.mc.MOV(ebp, esp)
-        self.mc.PUSH(ebx)
-        self.mc.PUSH(esi)
-        self.mc.PUSH(edi)
+        self.mc.PUSH_r(ebp.value)
+        self.mc.MOV_rr(ebp.value, esp.value)
+        for regloc in self.cpu.CALLEE_SAVE_REGISTERS:
+            self.mc.PUSH_r(regloc.value)
+
         # NB. the shape of the frame is hard-coded in get_basic_shape() too.
         # Also, make sure this is consistent with FRAME_FIXED_SIZE.
         return self._patchable_stackadjust()
 
+    def _call_footer(self):
+        self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)
+
+        for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
+            self.mc.POP_r(self.cpu.CALLEE_SAVE_REGISTERS[i].value)
+
+        self.mc.POP_r(ebp.value)
+        self.mc.RET()
+
     def _assemble_bootstrap_direct_call(self, arglocs, jmpadr, stackdepth):
+        if IS_X86_64:
+            return self._assemble_bootstrap_direct_call_64(arglocs, jmpadr, stackdepth)
         # XXX pushing ebx esi and edi is a bit pointless, since we store
         #     all registers anyway, for the case of guard_not_forced
         # XXX this can be improved greatly. Right now it'll behave like
@@ -388,23 +506,81 @@
         self._patch_stackadjust(adr_stackadjust, stackdepth)
         for i in range(len(nonfloatlocs)):
             loc = nonfloatlocs[i]
-            if isinstance(loc, REG):
-                self.mc.MOV(loc, mem(ebp, (2 + i) * WORD))
+            if isinstance(loc, RegLoc):
+                assert not loc.is_xmm
+                self.mc.MOV_rb(loc.value, (2 + i) * WORD)
             loc = floatlocs[i]
-            if isinstance(loc, XMMREG):
-                self.mc.MOVSD(loc, mem64(ebp, (1 + i) * 2 * WORD))
+            if isinstance(loc, RegLoc):
+                assert loc.is_xmm
+                self.mc.MOVSD_xb(loc.value, (1 + i) * 2 * WORD)
         tmp = eax
         xmmtmp = xmm0
         for i in range(len(nonfloatlocs)):
             loc = nonfloatlocs[i]
-            if loc is not None and not isinstance(loc, REG):
-                self.mc.MOV(tmp, mem(ebp, (2 + i) * WORD))
+            if loc is not None and not isinstance(loc, RegLoc):
+                self.mc.MOV_rb(tmp.value, (2 + i) * WORD)
                 self.mc.MOV(loc, tmp)
             loc = floatlocs[i]
-            if loc is not None and not isinstance(loc, XMMREG):
-                self.mc.MOVSD(xmmtmp, mem64(ebp, (1 + i) * 2 * WORD))
-                self.mc.MOVSD(loc, xmmtmp)
-        self.mc.JMP(rel32(jmpadr))
+            if loc is not None and not isinstance(loc, RegLoc):
+                self.mc.MOVSD_xb(xmmtmp.value, (1 + i) * 2 * WORD)
+                assert isinstance(loc, StackLoc)
+                self.mc.MOVSD_bx(loc.value, xmmtmp.value)
+        self.mc.JMP_l(jmpadr)
+        return adr_stackadjust
+
+    def _assemble_bootstrap_direct_call_64(self, arglocs, jmpadr, stackdepth):
+        # XXX: Very similar to _emit_call_64
+
+        src_locs = []
+        dst_locs = []
+        xmm_src_locs = []
+        xmm_dst_locs = []
+        get_from_stack = []
+
+        # In reverse order for use with pop()
+        unused_gpr = [r9, r8, ecx, edx, esi, edi]
+        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
+
+        nonfloatlocs, floatlocs = arglocs
+        adr_stackadjust = self._call_header()
+        self._patch_stackadjust(adr_stackadjust, stackdepth)
+
+        # The lists are padded with Nones
+        assert len(nonfloatlocs) == len(floatlocs)
+
+        for i in range(len(nonfloatlocs)):
+            loc = nonfloatlocs[i]
+            if loc is not None:
+                if len(unused_gpr) > 0:
+                    src_locs.append(unused_gpr.pop())
+                    dst_locs.append(loc)
+                else:
+                    get_from_stack.append((loc, False))
+
+            floc = floatlocs[i]
+            if floc is not None:
+                if len(unused_xmm) > 0:
+                    xmm_src_locs.append(unused_xmm.pop())
+                    xmm_dst_locs.append(floc)
+                else:
+                    get_from_stack.append((floc, True))
+
+        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
+        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs, X86_64_XMM_SCRATCH_REG)
+
+        for i in range(len(get_from_stack)):
+            loc, is_xmm = get_from_stack[i]
+            if is_xmm:
+                self.mc.MOVSD_xb(X86_64_XMM_SCRATCH_REG.value, (2 + i) * WORD)
+                self.mc.MOVSD(loc, X86_64_XMM_SCRATCH_REG)
+            else:
+                self.mc.MOV_rb(X86_64_SCRATCH_REG.value, (2 + i) * WORD)
+                # XXX: We're assuming that "loc" won't require regloc to
+                # clobber the scratch register
+                self.mc.MOV(loc, X86_64_SCRATCH_REG)
+
+        self.mc.JMP(imm(jmpadr))
+
         return adr_stackadjust
 
     def _assemble_bootstrap_code(self, inputargs, arglocs):
@@ -412,11 +588,12 @@
         adr_stackadjust = self._call_header()
         tmp = X86RegisterManager.all_regs[0]
         xmmtmp = X86XMMRegisterManager.all_regs[0]
+        self.mc._mc.begin_reuse_scratch_register()
         for i in range(len(nonfloatlocs)):
             loc = nonfloatlocs[i]
             if loc is None:
                 continue
-            if isinstance(loc, REG):
+            if isinstance(loc, RegLoc):
                 target = loc
             else:
                 target = tmp
@@ -430,17 +607,20 @@
                 adr = self.fail_boxes_int.get_addr_for_num(i)
                 self.mc.MOV(target, heap(adr))
             if target is not loc:
-                self.mc.MOV(loc, target)
+                assert isinstance(loc, StackLoc)
+                self.mc.MOV_br(loc.value, target.value)
         for i in range(len(floatlocs)):
             loc = floatlocs[i]
             if loc is None:
                 continue
             adr = self.fail_boxes_float.get_addr_for_num(i)
-            if isinstance(loc, REG):
-                self.mc.MOVSD(loc, heap64(adr))
+            if isinstance(loc, RegLoc):
+                self.mc.MOVSD(loc, heap(adr))
             else:
-                self.mc.MOVSD(xmmtmp, heap64(adr))
-                self.mc.MOVSD(loc, xmmtmp)
+                self.mc.MOVSD(xmmtmp, heap(adr))
+                assert isinstance(loc, StackLoc)
+                self.mc.MOVSD_bx(loc.value, xmmtmp.value)
+        self.mc._mc.end_reuse_scratch_register()
         return adr_stackadjust
 
     def dump(self, text):
@@ -453,27 +633,10 @@
         finally:
             Box._extended_display = _prev
 
-    def _start_block(self):
-        # Return a 'mc' that can be used to write an "atomic" block,
-        # i.e. one that will not contain any JMP.
-        mc = self.mc._mc
-        if not we_are_translated():
-            self._block_started_mc = (self.mc, mc.tell())
-            self.mc = "block started"
-        return mc
-
-    def _stop_block(self):
-        if not we_are_translated():
-            assert self.mc == "block started"
-            self.mc, orgpos = self._block_started_mc
-            assert 0 <= self.mc._mc.tell() - orgpos <= 58, (
-                "too many bytes in _start_block/_stop_block pair")
-            del self._block_started_mc
-
     # ------------------------------------------------------------
 
     def mov(self, from_loc, to_loc):
-        if isinstance(from_loc, XMMREG) or isinstance(to_loc, XMMREG):
+        if (isinstance(from_loc, RegLoc) and from_loc.is_xmm) or (isinstance(to_loc, RegLoc) and to_loc.is_xmm):
             self.mc.MOVSD(to_loc, from_loc)
         else:
             self.mc.MOV(to_loc, from_loc)
@@ -481,24 +644,24 @@
     regalloc_mov = mov # legacy interface
 
     def regalloc_push(self, loc):
-        if isinstance(loc, XMMREG):
-            self.mc.SUB(esp, imm(2*WORD))
-            self.mc.MOVSD(mem64(esp, 0), loc)
-        elif isinstance(loc, MODRM64):
+        if isinstance(loc, RegLoc) and loc.is_xmm:
+            self.mc.SUB_ri(esp.value, 2*WORD)
+            self.mc.MOVSD_sx(0, loc.value)
+        elif WORD == 4 and isinstance(loc, StackLoc) and loc.width == 8:
             # XXX evil trick
-            self.mc.PUSH(mem(ebp, get_ebp_ofs(loc.position)))
-            self.mc.PUSH(mem(ebp, get_ebp_ofs(loc.position + 1)))
+            self.mc.PUSH_b(get_ebp_ofs(loc.position))
+            self.mc.PUSH_b(get_ebp_ofs(loc.position + 1))
         else:
             self.mc.PUSH(loc)
 
     def regalloc_pop(self, loc):
-        if isinstance(loc, XMMREG):
-            self.mc.MOVSD(loc, mem64(esp, 0))
-            self.mc.ADD(esp, imm(2*WORD))
-        elif isinstance(loc, MODRM64):
+        if isinstance(loc, RegLoc) and loc.is_xmm:
+            self.mc.MOVSD_xs(loc.value, 0)
+            self.mc.ADD_ri(esp.value, 2*WORD)
+        elif WORD == 4 and isinstance(loc, StackLoc) and loc.width == 8:
             # XXX evil trick
-            self.mc.POP(mem(ebp, get_ebp_ofs(loc.position + 1)))
-            self.mc.POP(mem(ebp, get_ebp_ofs(loc.position)))
+            self.mc.POP_b(get_ebp_ofs(loc.position + 1))
+            self.mc.POP_b(get_ebp_ofs(loc.position))
         else:
             self.mc.POP(loc)
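
The "evil trick" above spills an 8-byte float slot on 32-bit builds with two word-sized PUSH/POP pairs in opposite orders. A rough standalone illustration of the idea (plain Python; not the backend's actual stack layout):

    import struct

    def push_double_as_words(stack, value):
        # Split the 8-byte double into two 32-bit words; push the high
        # half first so the low half ends up on top, mirroring the two
        # PUSH_b instructions above.
        lo, hi = struct.unpack("<II", struct.pack("<d", value))
        stack.append(hi)
        stack.append(lo)

    def pop_double_from_words(stack):
        # Pop in the reverse order, mirroring the two POP_b instructions.
        lo = stack.pop()
        hi = stack.pop()
        return struct.unpack("<d", struct.pack("<II", lo, hi))[0]

    s = []
    push_double_as_words(s, 2.5)
    assert pop_double_from_words(s) == 2.5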
 
@@ -515,14 +678,14 @@
         faildescr._x86_current_depths = current_depths
         failargs = guard_op.fail_args
         guard_opnum = guard_op.opnum
-        failaddr = self.implement_guard_recovery(guard_opnum,
-                                                 faildescr, failargs,
-                                                 faillocs)
+        guard_token = self.implement_guard_recovery(guard_opnum,
+                                                    faildescr, failargs,
+                                                    faillocs)
         if op is None:
             dispatch_opnum = guard_opnum
         else:
             dispatch_opnum = op.opnum
-        res = genop_guard_list[dispatch_opnum](self, op, guard_op, failaddr,
+        res = genop_guard_list[dispatch_opnum](self, op, guard_op, guard_token,
                                                arglocs, resloc)
         faildescr._x86_adr_jump_offset = res
 
@@ -549,103 +712,161 @@
             rl = result_loc.lowest8bits()
             if isinstance(op.args[0], Const):
                 self.mc.CMP(arglocs[1], arglocs[0])
-                getattr(self.mc, 'SET' + rev_cond)(rl)
+                self.mc.SET_ir(rx86.Conditions[rev_cond], rl.value)
             else:
                 self.mc.CMP(arglocs[0], arglocs[1])
-                getattr(self.mc, 'SET' + cond)(rl)
-            self.mc.MOVZX(result_loc, rl)
+                self.mc.SET_ir(rx86.Conditions[cond], rl.value)
+            self.mc.MOVZX8_rr(result_loc.value, rl.value)
         return genop_cmp
 
     def _cmpop_float(cond, is_ne=False):
         def genop_cmp(self, op, arglocs, result_loc):
             self.mc.UCOMISD(arglocs[0], arglocs[1])
-            rl = result_loc.lowest8bits()
-            rh = result_loc.higher8bits()
-            getattr(self.mc, 'SET' + cond)(rl)
+            tmp1 = result_loc.lowest8bits()
+            if IS_X86_32:
+                tmp2 = result_loc.higher8bits()
+            elif IS_X86_64:
+                tmp2 = X86_64_SCRATCH_REG.lowest8bits()
+
+            self.mc.SET_ir(rx86.Conditions[cond], tmp1.value)
             if is_ne:
-                self.mc.SETP(rh)
-                self.mc.OR(rl, rh)
+                self.mc.SET_ir(rx86.Conditions['P'], tmp2.value)
+                self.mc.OR8_rr(tmp1.value, tmp2.value)
             else:
-                self.mc.SETNP(rh)
-                self.mc.AND(rl, rh)
-            self.mc.MOVZX(result_loc, rl)
+                self.mc.SET_ir(rx86.Conditions['NP'], tmp2.value)
+                self.mc.AND8_rr(tmp1.value, tmp2.value)
+            self.mc.MOVZX8_rr(result_loc.value, tmp1.value)
         return genop_cmp
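
A hedged aside on the is_ne logic above (plain Python; the flag model is simplified): UCOMISD sets the parity flag for unordered comparisons, i.e. when a NaN is involved, so float_eq must AND its result with SETNP while float_ne must OR with SETP:

    import math

    def ucomisd_flags(a, b):
        unordered = math.isnan(a) or math.isnan(b)   # PF after UCOMISD
        return {"ZF": unordered or a == b, "PF": unordered}

    def float_eq(a, b):
        f = ucomisd_flags(a, b)
        return f["ZF"] and not f["PF"]    # SET E, then AND with SET NP

    def float_ne(a, b):
        f = ucomisd_flags(a, b)
        return (not f["ZF"]) or f["PF"]   # SET NE, then OR with SET P

    nan = float("nan")
    assert not float_eq(nan, nan) and float_ne(nan, nan)
    assert float_eq(1.0, 1.0) and not float_ne(1.0, 1.0)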
 
     def _cmpop_guard(cond, rev_cond, false_cond, false_rev_cond):
-        def genop_cmp_guard(self, op, guard_op, addr, arglocs, result_loc):
+        def genop_cmp_guard(self, op, guard_op, guard_token, arglocs, result_loc):
             guard_opnum = guard_op.opnum
             if isinstance(op.args[0], Const):
                 self.mc.CMP(arglocs[1], arglocs[0])
                 if guard_opnum == rop.GUARD_FALSE:
-                    name = 'J' + rev_cond
-                    return self.implement_guard(addr, getattr(self.mc, name))
+                    return self.implement_guard(guard_token, rev_cond)
                 else:
-                    name = 'J' + false_rev_cond
-                    return self.implement_guard(addr, getattr(self.mc, name))
+                    return self.implement_guard(guard_token, false_rev_cond)
             else:
                 self.mc.CMP(arglocs[0], arglocs[1])
                 if guard_opnum == rop.GUARD_FALSE:
-                    name = 'J' + cond
-                    return self.implement_guard(addr, getattr(self.mc, name))
+                    return self.implement_guard(guard_token, cond)
                 else:
-                    name = 'J' + false_cond
-                    return self.implement_guard(addr, getattr(self.mc, name))
+                    return self.implement_guard(guard_token, false_cond)
         return genop_cmp_guard
 
     def _cmpop_guard_float(cond, false_cond, need_jp):
-        def genop_cmp_guard_float(self, op, guard_op, addr, arglocs,
+        def genop_cmp_guard_float(self, op, guard_op, guard_token, arglocs,
                                   result_loc):
             guard_opnum = guard_op.opnum
             self.mc.UCOMISD(arglocs[0], arglocs[1])
+            # 16 is enough space for the rel8 jumps below and the rel32
+            # jump in implement_guard
+            self.mc.ensure_bytes_available(16 + guard_token.recovery_stub_size())
             if guard_opnum == rop.GUARD_FALSE:
-                mc = self.mc._mc
-                name = 'J' + cond
                 if need_jp:
-                    mc.JP(rel8(6))
-                getattr(mc, name)(rel32(addr))
-                return mc.tell() - 4
+                    self.mc.J_il8(rx86.Conditions['P'], 6)
+                return self.implement_guard(guard_token, cond)
             else:
                 if need_jp:
-                    mc = self.mc._mc
-                    mc.JP(rel8(2))
-                    getattr(mc, 'J' + cond)(rel8(5))
-                    return self.implement_guard(addr, mc.JMP)
-                name = 'J' + false_cond
-                return self.implement_guard(addr, getattr(self.mc, name))
+                    self.mc.J_il8(rx86.Conditions['P'], 2)
+                    self.mc.J_il8(rx86.Conditions[cond], 5)
+                    return self.implement_guard(guard_token)
+                return self.implement_guard(guard_token, false_cond)
         return genop_cmp_guard_float
 
-    @specialize.arg(5)
-    def _emit_call(self, x, arglocs, start=0, tmp=eax, force_mc=False,
-                   mc=None):
-        if not force_mc:
-            mc = self.mc
+    def _emit_call(self, x, arglocs, start=0, tmp=eax):
+        if IS_X86_64:
+            return self._emit_call_64(x, arglocs, start)
+
         p = 0
         n = len(arglocs)
         for i in range(start, n):
             loc = arglocs[i]
-            if isinstance(loc, REG):
-                if isinstance(loc, XMMREG):
-                    mc.MOVSD(mem64(esp, p), loc)
+            if isinstance(loc, RegLoc):
+                if loc.is_xmm:
+                    self.mc.MOVSD_sx(p, loc.value)
                 else:
-                    mc.MOV(mem(esp, p), loc)
+                    self.mc.MOV_sr(p, loc.value)
             p += round_up_to_4(loc.width)
         p = 0
         for i in range(start, n):
             loc = arglocs[i]
-            if not isinstance(loc, REG):
-                if isinstance(loc, MODRM64):
-                    mc.MOVSD(xmm0, loc)
-                    mc.MOVSD(mem64(esp, p), xmm0)
+            if not isinstance(loc, RegLoc):
+                if loc.width == 8:
+                    self.mc.MOVSD(xmm0, loc)
+                    self.mc.MOVSD_sx(p, xmm0.value)
                 else:
-                    mc.MOV(tmp, loc)
-                    mc.MOV(mem(esp, p), tmp)
+                    self.mc.MOV(tmp, loc)
+                    self.mc.MOV_sr(p, tmp.value)
             p += round_up_to_4(loc.width)
         self._regalloc.reserve_param(p//WORD)
-        mc.CALL(x)
+        # x is a location
+        self.mc.CALL(x)
         self.mark_gc_roots()
+
+    def _emit_call_64(self, x, arglocs, start=0):
+        src_locs = []
+        dst_locs = []
+        xmm_src_locs = []
+        xmm_dst_locs = []
+        pass_on_stack = []
+
+        # In reverse order for use with pop()
+        unused_gpr = [r9, r8, ecx, edx, esi, edi]
+        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
+
+        for i in range(start, len(arglocs)):
+            loc = arglocs[i]
+            # XXX: Should be much simpler to tell whether a location is a
+            # float! It's so ugly because we have to "guard" the access to
+            # .type with isinstance, since not all AssemblerLocation classes
+            # are "typed"
+            if ((isinstance(loc, RegLoc) and loc.is_xmm) or
+                (isinstance(loc, StackLoc) and loc.type == FLOAT) or
+                (isinstance(loc, ConstFloatLoc))):
+                if len(unused_xmm) > 0:
+                    xmm_src_locs.append(loc)
+                    xmm_dst_locs.append(unused_xmm.pop())
+                else:
+                    pass_on_stack.append(loc)
+            else:
+                if len(unused_gpr) > 0:
+                    src_locs.append(loc)
+                    dst_locs.append(unused_gpr.pop())
+                else:
+                    pass_on_stack.append(loc)
         
+        # Emit instructions to pass the stack arguments
+        # XXX: Would be nice to let remap_frame_layout take care of this, but
+        # we'd need something like StackLoc relative to esp instead of ebp,
+        # and I don't know if it's worth it.
+        for i in range(len(pass_on_stack)):
+            loc = pass_on_stack[i]
+            if not isinstance(loc, RegLoc):
+                if isinstance(loc, StackLoc) and loc.type == FLOAT:
+                    self.mc.MOVSD(X86_64_XMM_SCRATCH_REG, loc)
+                    self.mc.MOVSD_sx(i*WORD, X86_64_XMM_SCRATCH_REG.value)
+                else:
+                    self.mc.MOV(X86_64_SCRATCH_REG, loc)
+                    self.mc.MOV_sr(i*WORD, X86_64_SCRATCH_REG.value)
+            else:
+                # It's a register
+                if loc.is_xmm:
+                    self.mc.MOVSD_sx(i*WORD, loc.value)
+                else:
+                    self.mc.MOV_sr(i*WORD, loc.value)
+
+        # Handle register arguments
+        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
+        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs, X86_64_XMM_SCRATCH_REG)
+
+        self._regalloc.reserve_param(len(pass_on_stack))
+        self.mc.CALL(x)
+        self.mark_gc_roots()
+
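
A minimal standalone sketch of the System V AMD64 classification performed by _emit_call_64 above (plain Python; the tags and helper are hypothetical): the first six integer arguments go in rdi, rsi, rdx, rcx, r8, r9, the first eight float arguments in xmm0-xmm7, and everything else on the stack:

    GPRS = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"]
    XMMS = ["xmm%d" % i for i in range(8)]

    def classify(kinds):
        gprs = GPRS[::-1]    # reversed, for use with pop(), as above
        xmms = XMMS[::-1]
        in_regs, on_stack = [], []
        for i, kind in enumerate(kinds):
            regs = xmms if kind == "float" else gprs
            if regs:
                in_regs.append((i, regs.pop()))
            else:
                on_stack.append(i)
        return in_regs, on_stack

    in_regs, on_stack = classify(["int"] * 7)
    assert [r for _, r in in_regs] == GPRS and on_stack == [6]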
     def call(self, addr, args, res):
-        self._emit_call(rel32(addr), args)
+        self._emit_call(imm(addr), args)
         assert res is eax
 
     genop_int_neg = _unaryop("NEG")
@@ -656,6 +877,9 @@
     genop_int_and = _binaryop("AND", True)
     genop_int_or  = _binaryop("OR", True)
     genop_int_xor = _binaryop("XOR", True)
+    genop_int_lshift = _binaryop("SHL")
+    genop_int_rshift = _binaryop("SAR")
+    genop_uint_rshift = _binaryop("SHR")
     genop_float_add = _binaryop("ADDSD", True)
     genop_float_sub = _binaryop('SUBSD')
     genop_float_mul = _binaryop('MULSD', True)
@@ -702,26 +926,27 @@
     genop_guard_float_gt = _cmpop_guard_float("A", "BE", False)
     genop_guard_float_ge = _cmpop_guard_float("AE", "B", False)
 
-    def genop_guard_float_ne(self, op, guard_op, addr, arglocs, result_loc):
+    def genop_guard_float_ne(self, op, guard_op, guard_token, arglocs, result_loc):
         guard_opnum = guard_op.opnum
         self.mc.UCOMISD(arglocs[0], arglocs[1])
-        mc = self.mc._mc
+        # 16 is enough space for the rel8 jumps below and the rel32
+        # jump in implement_guard
+        self.mc.ensure_bytes_available(16 + guard_token.recovery_stub_size())
         if guard_opnum == rop.GUARD_TRUE:
-            mc.JP(rel8(6))
-            mc.JE(rel32(addr))
-            return mc.tell() - 4
-        else:
-            mc.JP(rel8(2))
-            mc.JE(rel8(5))
-            return self.implement_guard(addr, mc.JMP)
+            self.mc.J_il8(rx86.Conditions['P'], 6)
+            return self.implement_guard(guard_token, 'E')
+        else:
+            self.mc.J_il8(rx86.Conditions['P'], 2)
+            self.mc.J_il8(rx86.Conditions['E'], 5)
+            return self.implement_guard(guard_token)
 
     def genop_float_neg(self, op, arglocs, resloc):
         # Following what gcc does: res = x ^ 0x8000000000000000
-        self.mc.XORPD(arglocs[0], self.loc_float_const_neg)
+        self.mc.XORPD(arglocs[0], heap(self.float_const_neg_addr))
 
     def genop_float_abs(self, op, arglocs, resloc):
         # Following what gcc does: res = x & 0x7FFFFFFFFFFFFFFF
-        self.mc.ANDPD(arglocs[0], self.loc_float_const_abs)
+        self.mc.ANDPD(arglocs[0], heap(self.float_const_abs_addr))
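
A standalone check (plain Python) of the gcc-style constants behind genop_float_neg and genop_float_abs: XORPD with the sign bit flips it, ANDPD with its complement clears it:

    import struct

    SIGN_BIT = 0x8000000000000000
    ABS_MASK = 0x7FFFFFFFFFFFFFFF

    def bits(x):
        return struct.unpack("<Q", struct.pack("<d", x))[0]

    def from_bits(b):
        return struct.unpack("<d", struct.pack("<Q", b))[0]

    assert from_bits(bits(1.5) ^ SIGN_BIT) == -1.5   # float_neg
    assert from_bits(bits(-1.5) & ABS_MASK) == 1.5   # float_abs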
 
     def genop_cast_float_to_int(self, op, arglocs, resloc):
         self.mc.CVTTSD2SI(resloc, arglocs[0])
@@ -729,70 +954,56 @@
     def genop_cast_int_to_float(self, op, arglocs, resloc):
         self.mc.CVTSI2SD(resloc, arglocs[0])
 
-    def genop_int_lshift(self, op, arglocs, resloc):
-        loc, loc2 = arglocs
-        if loc2 is ecx:
-            loc2 = cl
-        self.mc.SHL(loc, loc2)
-
-    def genop_int_rshift(self, op, arglocs, resloc):
-        loc, loc2 = arglocs
-        if loc2 is ecx:
-            loc2 = cl
-        self.mc.SAR(loc, loc2)
-
-    def genop_uint_rshift(self, op, arglocs, resloc):
-        loc, loc2 = arglocs
-        if loc2 is ecx:
-            loc2 = cl
-        self.mc.SHR(loc, loc2)
-
-    def genop_guard_int_is_true(self, op, guard_op, addr, arglocs, resloc):
+    def genop_guard_int_is_true(self, op, guard_op, guard_token, arglocs, resloc):
         guard_opnum = guard_op.opnum
-        self.mc.CMP(arglocs[0], imm8(0))
+        self.mc.CMP(arglocs[0], imm(0))
         if guard_opnum == rop.GUARD_TRUE:
-            return self.implement_guard(addr, self.mc.JZ)
+            return self.implement_guard(guard_token, 'Z')
         else:
-            return self.implement_guard(addr, self.mc.JNZ)
+            return self.implement_guard(guard_token, 'NZ')
 
     def genop_int_is_true(self, op, arglocs, resloc):
-        self.mc.CMP(arglocs[0], imm8(0))
+        self.mc.CMP(arglocs[0], imm(0))
         rl = resloc.lowest8bits()
-        self.mc.SETNE(rl)
-        self.mc.MOVZX(resloc, rl)
+        self.mc.SET_ir(rx86.Conditions['NE'], rl.value)
+        self.mc.MOVZX8(resloc, rl)
 
-    def genop_guard_int_is_zero(self, op, guard_op, addr, arglocs, resloc):
+    def genop_guard_int_is_zero(self, op, guard_op, guard_token, arglocs, resloc):
         guard_opnum = guard_op.opnum
-        self.mc.CMP(arglocs[0], imm8(0))
+        self.mc.CMP(arglocs[0], imm(0))
         if guard_opnum == rop.GUARD_TRUE:
-            return self.implement_guard(addr, self.mc.JNZ)
+            return self.implement_guard(guard_token, 'NZ')
         else:
-            return self.implement_guard(addr, self.mc.JZ)
+            return self.implement_guard(guard_token, 'Z')
 
     def genop_int_is_zero(self, op, arglocs, resloc):
-        self.mc.CMP(arglocs[0], imm8(0))
+        self.mc.CMP(arglocs[0], imm(0))
         rl = resloc.lowest8bits()
-        self.mc.SETE(rl)
-        self.mc.MOVZX(resloc, rl)
+        self.mc.SET_ir(rx86.Conditions['E'], rl.value)
+        self.mc.MOVZX8(resloc, rl)
 
     def genop_same_as(self, op, arglocs, resloc):
         self.mov(arglocs[0], resloc)
     #genop_cast_ptr_to_int = genop_same_as
 
     def genop_int_mod(self, op, arglocs, resloc):
-        self.mc.CDQ()
-        self.mc.IDIV(ecx)
+        if IS_X86_32:
+            self.mc.CDQ()
+        elif IS_X86_64:
+            self.mc.CQO()
+
+        self.mc.IDIV_r(ecx.value)
 
     genop_int_floordiv = genop_int_mod
 
     def genop_uint_floordiv(self, op, arglocs, resloc):
-        self.mc.XOR(edx, edx)
-        self.mc.DIV(ecx)
+        self.mc.XOR_rr(edx.value, edx.value)
+        self.mc.DIV_r(ecx.value)
 
     def genop_new_with_vtable(self, op, arglocs, result_loc):
         assert result_loc is eax
         loc_vtable = arglocs[-1]
-        assert isinstance(loc_vtable, IMM32)
+        assert isinstance(loc_vtable, ImmedLoc)
         arglocs = arglocs[:-1]
         self.call(self.malloc_func_addr, arglocs, eax)
         # xxx ignore NULL returns for now
@@ -800,7 +1011,9 @@
 
     def set_vtable(self, loc, loc_vtable):
         if self.cpu.vtable_offset is not None:
-            self.mc.MOV(mem(loc, self.cpu.vtable_offset), loc_vtable)
+            assert isinstance(loc, RegLoc)
+            assert isinstance(loc_vtable, ImmedLoc)
+            self.mc.MOV_mi((loc.value, self.cpu.vtable_offset), loc_vtable.value)
 
     # XXX genop_new is abused for all varsized mallocs with Boehm, for now
     # (instead of genop_new_array, genop_newstr, genop_newunicode)
@@ -822,16 +1035,19 @@
 
     def genop_getfield_gc(self, op, arglocs, resloc):
         base_loc, ofs_loc, size_loc = arglocs
-        assert isinstance(size_loc, IMM32)
+        assert isinstance(size_loc, ImmedLoc)
+        assert isinstance(resloc, RegLoc)
         size = size_loc.value
-        if size == 1:
-            self.mc.MOVZX(resloc, addr8_add(base_loc, ofs_loc))
+
+        source_addr = AddressLoc(base_loc, ofs_loc)
+        if resloc.is_xmm:
+            self.mc.MOVSD(resloc, source_addr)
+        elif size == 1:
+            self.mc.MOVZX8(resloc, source_addr)
         elif size == 2:
-            self.mc.MOVZX(resloc, addr_add(base_loc, ofs_loc))
+            self.mc.MOVZX16(resloc, source_addr)
         elif size == WORD:
-            self.mc.MOV(resloc, addr_add(base_loc, ofs_loc))
-        elif size == 8:
-            self.mc.MOVSD(resloc, addr64_add(base_loc, ofs_loc))
+            self.mc.MOV(resloc, source_addr)
         else:
             raise NotImplementedError("getfield size = %d" % size)
 
@@ -841,20 +1057,23 @@
 
     def genop_getarrayitem_gc(self, op, arglocs, resloc):
         base_loc, ofs_loc, scale, ofs = arglocs
-        assert isinstance(ofs, IMM32)
-        assert isinstance(scale, IMM32)
+        assert isinstance(ofs, ImmedLoc)
+        assert isinstance(scale, ImmedLoc)
         if op.result.type == FLOAT:
-            self.mc.MOVSD(resloc, addr64_add(base_loc, ofs_loc, ofs.value,
+            self.mc.MOVSD(resloc, addr_add(base_loc, ofs_loc, ofs.value,
                                              scale.value))
         else:
             if scale.value == 0:
-                self.mc.MOVZX(resloc, addr8_add(base_loc, ofs_loc, ofs.value,
+                self.mc.MOVZX8(resloc, addr_add(base_loc, ofs_loc, ofs.value,
+                                                scale.value))
+            elif scale.value == 1:
+                self.mc.MOVZX16(resloc, addr_add(base_loc, ofs_loc, ofs.value,
                                                 scale.value))
-            elif scale.value == 2:
+            elif (1 << scale.value) == WORD:
                 self.mc.MOV(resloc, addr_add(base_loc, ofs_loc, ofs.value,
                                              scale.value))
             else:
-                print "[asmgen]setarrayitem unsupported size: %d" % scale.value
+                print "[asmgen]getarrayitem unsupported size: %d" % scale.value
                 raise NotImplementedError()
 
     genop_getarrayitem_gc_pure = genop_getarrayitem_gc
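
An aside on the scale immediates tested above: as in an x86 SIB byte, scale encodes the element size as a power of two, so "(1 << scale.value) == WORD" picks the native word size on both 32-bit and 64-bit builds. A sketch (the WORD values are assumed for illustration):

    for WORD in (4, 8):
        word_scales = [s for s in range(4) if (1 << s) == WORD]
        assert word_scales == [{4: 2, 8: 3}[WORD]]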
@@ -862,34 +1081,35 @@
 
     def genop_discard_setfield_gc(self, op, arglocs):
         base_loc, ofs_loc, size_loc, value_loc = arglocs
-        assert isinstance(size_loc, IMM32)
+        assert isinstance(size_loc, ImmedLoc)
         size = size_loc.value
-        if size == WORD * 2:
-            self.mc.MOVSD(addr64_add(base_loc, ofs_loc), value_loc)
+        dest_addr = AddressLoc(base_loc, ofs_loc)
+        if isinstance(value_loc, RegLoc) and value_loc.is_xmm:
+            self.mc.MOVSD(dest_addr, value_loc)
         elif size == WORD:
-            self.mc.MOV(addr_add(base_loc, ofs_loc), value_loc)
+            self.mc.MOV(dest_addr, value_loc)
         elif size == 2:
-            self.mc.MOV16(addr_add(base_loc, ofs_loc), value_loc)
+            self.mc.MOV16(dest_addr, value_loc)
         elif size == 1:
-            self.mc.MOV(addr8_add(base_loc, ofs_loc), value_loc.lowest8bits())
+            self.mc.MOV8(dest_addr, value_loc.lowest8bits())
         else:
             print "[asmgen]setfield addr size %d" % size
             raise NotImplementedError("Addr size %d" % size)
 
     def genop_discard_setarrayitem_gc(self, op, arglocs):
         base_loc, ofs_loc, value_loc, scale_loc, baseofs = arglocs
-        assert isinstance(baseofs, IMM32)
-        assert isinstance(scale_loc, IMM32)
+        assert isinstance(baseofs, ImmedLoc)
+        assert isinstance(scale_loc, ImmedLoc)
+        dest_addr = AddressLoc(base_loc, ofs_loc, scale_loc.value, baseofs.value)
         if op.args[2].type == FLOAT:
-            self.mc.MOVSD(addr64_add(base_loc, ofs_loc, baseofs.value,
-                                     scale_loc.value), value_loc)
+            self.mc.MOVSD(dest_addr, value_loc)
         else:
-            if scale_loc.value == 2:
-                self.mc.MOV(addr_add(base_loc, ofs_loc, baseofs.value,
-                                     scale_loc.value), value_loc)
+            if (1 << scale_loc.value) == WORD:
+                self.mc.MOV(dest_addr, value_loc)
+            elif scale_loc.value == 1:
+                self.mc.MOV16(dest_addr, value_loc)
             elif scale_loc.value == 0:
-                self.mc.MOV(addr8_add(base_loc, ofs_loc, baseofs.value,
-                                      scale_loc.value), value_loc.lowest8bits())
+                self.mc.MOV8(dest_addr, value_loc.lowest8bits())
             else:
                 raise NotImplementedError("scale = %d" % scale_loc.value)
 
@@ -898,17 +1118,17 @@
         basesize, itemsize, ofs_length = symbolic.get_array_token(rstr.STR,
                                               self.cpu.translate_support_code)
         assert itemsize == 1
-        self.mc.MOV(addr8_add(base_loc, ofs_loc, basesize),
-                    val_loc.lowest8bits())
+        dest_addr = AddressLoc(base_loc, ofs_loc, 0, basesize)
+        self.mc.MOV8(dest_addr, val_loc.lowest8bits())
 
     def genop_discard_unicodesetitem(self, op, arglocs):
         base_loc, ofs_loc, val_loc = arglocs
         basesize, itemsize, ofs_length = symbolic.get_array_token(rstr.UNICODE,
                                               self.cpu.translate_support_code)
         if itemsize == 4:
-            self.mc.MOV(addr_add(base_loc, ofs_loc, basesize, 2), val_loc)
+            self.mc.MOV32(AddressLoc(base_loc, ofs_loc, 2, basesize), val_loc)
         elif itemsize == 2:
-            self.mc.MOV16(addr_add(base_loc, ofs_loc, basesize, 1), val_loc)
+            self.mc.MOV16(AddressLoc(base_loc, ofs_loc, 1, basesize), val_loc)
         else:
             assert 0, itemsize
 
@@ -929,7 +1149,7 @@
 
     def genop_arraylen_gc(self, op, arglocs, resloc):
         base_loc, ofs_loc = arglocs
-        assert isinstance(ofs_loc, IMM32)
+        assert isinstance(ofs_loc, ImmedLoc)
         self.mc.MOV(resloc, addr_add_const(base_loc, ofs_loc.value))
 
     def genop_strgetitem(self, op, arglocs, resloc):
@@ -937,83 +1157,83 @@
         basesize, itemsize, ofs_length = symbolic.get_array_token(rstr.STR,
                                              self.cpu.translate_support_code)
         assert itemsize == 1
-        self.mc.MOVZX(resloc, addr8_add(base_loc, ofs_loc, basesize))
+        self.mc.MOVZX8(resloc, AddressLoc(base_loc, ofs_loc, 0, basesize))
 
     def genop_unicodegetitem(self, op, arglocs, resloc):
         base_loc, ofs_loc = arglocs
         basesize, itemsize, ofs_length = symbolic.get_array_token(rstr.UNICODE,
                                              self.cpu.translate_support_code)
         if itemsize == 4:
-            self.mc.MOV(resloc, addr_add(base_loc, ofs_loc, basesize, 2))
+            self.mc.MOV32(resloc, AddressLoc(base_loc, ofs_loc, 2, basesize))
         elif itemsize == 2:
-            self.mc.MOVZX(resloc, addr_add(base_loc, ofs_loc, basesize, 1))
+            self.mc.MOVZX16(resloc, AddressLoc(base_loc, ofs_loc, 1, basesize))
         else:
             assert 0, itemsize
 
-    def genop_guard_guard_true(self, ign_1, guard_op, addr, locs, ign_2):
+    def genop_guard_guard_true(self, ign_1, guard_op, guard_token, locs, ign_2):
         loc = locs[0]
         self.mc.TEST(loc, loc)
-        return self.implement_guard(addr, self.mc.JZ)
+        return self.implement_guard(guard_token, 'Z')
     genop_guard_guard_nonnull = genop_guard_guard_true
 
-    def genop_guard_guard_no_exception(self, ign_1, guard_op, addr,
+    def genop_guard_guard_no_exception(self, ign_1, guard_op, guard_token,
                                        locs, ign_2):
         self.mc.CMP(heap(self.cpu.pos_exception()), imm(0))
-        return self.implement_guard(addr, self.mc.JNZ)
+        return self.implement_guard(guard_token, 'NZ')
 
-    def genop_guard_guard_exception(self, ign_1, guard_op, addr,
+    def genop_guard_guard_exception(self, ign_1, guard_op, guard_token,
                                     locs, resloc):
         loc = locs[0]
         loc1 = locs[1]
         self.mc.MOV(loc1, heap(self.cpu.pos_exception()))
         self.mc.CMP(loc1, loc)
-        addr = self.implement_guard(addr, self.mc.JNE)
+        addr = self.implement_guard(guard_token, 'NE')
         if resloc is not None:
             self.mc.MOV(resloc, heap(self.cpu.pos_exc_value()))
         self.mc.MOV(heap(self.cpu.pos_exception()), imm(0))
         self.mc.MOV(heap(self.cpu.pos_exc_value()), imm(0))
         return addr
 
-    def _gen_guard_overflow(self, guard_op, addr):
+    def _gen_guard_overflow(self, guard_op, guard_token):
         guard_opnum = guard_op.opnum
         if guard_opnum == rop.GUARD_NO_OVERFLOW:
-            return self.implement_guard(addr, self.mc.JO)
+            return self.implement_guard(guard_token, 'O')
         elif guard_opnum == rop.GUARD_OVERFLOW:
-            return self.implement_guard(addr, self.mc.JNO)
+            return self.implement_guard(guard_token, 'NO')
         else:
             print "int_xxx_ovf followed by", guard_op.getopname()
             raise AssertionError
 
-    def genop_guard_int_add_ovf(self, op, guard_op, addr, arglocs, result_loc):
+    def genop_guard_int_add_ovf(self, op, guard_op, guard_token, arglocs, result_loc):
         self.genop_int_add(op, arglocs, result_loc)
-        return self._gen_guard_overflow(guard_op, addr)
+        return self._gen_guard_overflow(guard_op, guard_token)
 
-    def genop_guard_int_sub_ovf(self, op, guard_op, addr, arglocs, result_loc):
+    def genop_guard_int_sub_ovf(self, op, guard_op, guard_token, arglocs, result_loc):
         self.genop_int_sub(op, arglocs, result_loc)
-        return self._gen_guard_overflow(guard_op, addr)
+        return self._gen_guard_overflow(guard_op, guard_token)
 
-    def genop_guard_int_mul_ovf(self, op, guard_op, addr, arglocs, result_loc):
+    def genop_guard_int_mul_ovf(self, op, guard_op, guard_token, arglocs, result_loc):
         self.genop_int_mul(op, arglocs, result_loc)
-        return self._gen_guard_overflow(guard_op, addr)
+        return self._gen_guard_overflow(guard_op, guard_token)
 
-    def genop_guard_guard_false(self, ign_1, guard_op, addr, locs, ign_2):
+    def genop_guard_guard_false(self, ign_1, guard_op, guard_token, locs, ign_2):
         loc = locs[0]
         self.mc.TEST(loc, loc)
-        return self.implement_guard(addr, self.mc.JNZ)
+        return self.implement_guard(guard_token, 'NZ')
     genop_guard_guard_isnull = genop_guard_guard_false
 
-    def genop_guard_guard_value(self, ign_1, guard_op, addr, locs, ign_2):
+    def genop_guard_guard_value(self, ign_1, guard_op, guard_token, locs, ign_2):
         if guard_op.args[0].type == FLOAT:
             assert guard_op.args[1].type == FLOAT
             self.mc.UCOMISD(locs[0], locs[1])
         else:
             self.mc.CMP(locs[0], locs[1])
-        return self.implement_guard(addr, self.mc.JNE)
+        return self.implement_guard(guard_token, 'NE')
 
-    def _cmp_guard_class(self, mc, locs):
+    def _cmp_guard_class(self, locs):
         offset = self.cpu.vtable_offset
         if offset is not None:
-            mc.CMP(mem(locs[0], offset), locs[1])
+            self.mc.CMP(mem(locs[0], offset), locs[1])
         else:
             # XXX hard-coded assumption: to go from an object to its class
             # we use the following algorithm:
@@ -1022,7 +1242,7 @@
             #   - multiply by 4 and use it as an offset in type_info_group
             #   - add 16 bytes, to go past the TYPE_INFO structure
             loc = locs[1]
-            assert isinstance(loc, IMM32)
+            assert isinstance(loc, ImmedLoc)
             classptr = loc.value
             # here, we have to go back from 'classptr' to the value expected
             # from reading the 16 bits in the object header
@@ -1031,37 +1251,37 @@
             type_info_group = llop.gc_get_type_info_group(llmemory.Address)
             type_info_group = rffi.cast(lltype.Signed, type_info_group)
             expected_typeid = (classptr - sizeof_ti - type_info_group) >> 2
-            mc.CMP16(mem(locs[0], 0), imm32(expected_typeid))
+            self.mc.CMP16(mem(locs[0], 0), ImmedLoc(expected_typeid))
 
-    def genop_guard_guard_class(self, ign_1, guard_op, addr, locs, ign_2):
-        mc = self._start_block()
-        self._cmp_guard_class(mc, locs)
-        self._stop_block()
-        return self.implement_guard(addr, self.mc.JNE)
+    def genop_guard_guard_class(self, ign_1, guard_op, guard_token, locs, ign_2):
+        self.mc.ensure_bytes_available(256)
+        self._cmp_guard_class(locs)
+        return self.implement_guard(guard_token, 'NE')
 
     def genop_guard_guard_nonnull_class(self, ign_1, guard_op,
-                                        addr, locs, ign_2):
-        mc = self._start_block()
-        mc.CMP(locs[0], imm8(1))
-        mc.JB(rel8_patched_later)
-        jb_location = mc.get_relative_pos()
-        self._cmp_guard_class(mc, locs)
+                                        guard_token, locs, ign_2):
+        self.mc.ensure_bytes_available(256)
+        self.mc.CMP(locs[0], imm(1))
+        # Patched below
+        self.mc.J_il8(rx86.Conditions['B'], 0)
+        jb_location = self.mc.get_relative_pos()
+        self._cmp_guard_class(locs)
         # patch the JB above
-        offset = mc.get_relative_pos() - jb_location
+        offset = self.mc.get_relative_pos() - jb_location
         assert 0 < offset <= 127
-        mc.overwrite(jb_location-1, [chr(offset)])
-        self._stop_block()
+        self.mc.overwrite(jb_location-1, [chr(offset)])
         #
-        return self.implement_guard(addr, self.mc.JNE)
+        return self.implement_guard(guard_token, 'NE')
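
The J_il8-with-zero pattern used here (and in several places below) is the usual emit-then-patch idiom for short forward jumps: emit the jump with a dummy rel8 displacement, remember the position just past it, and overwrite the displacement byte once the target is known. A toy model (plain Python; the buffer class is hypothetical, and the real code writes chr() bytes):

    class TinyBuffer(object):
        def __init__(self):
            self.data = []
        def emit(self, *vals):
            self.data.extend(vals)
        def get_relative_pos(self):
            return len(self.data)
        def overwrite(self, pos, vals):
            self.data[pos:pos + len(vals)] = vals

    mc = TinyBuffer()
    mc.emit(0x74, 0x00)         # JE rel8, displacement patched below
    patch_pos = mc.get_relative_pos()
    mc.emit(0x90, 0x90, 0x90)   # ...the code to be skipped...
    offset = mc.get_relative_pos() - patch_pos
    assert 0 < offset <= 127    # rel8 must fit in a signed byte
    mc.overwrite(patch_pos - 1, [offset])
    assert mc.data[1] == 3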
 
     def implement_guard_recovery(self, guard_opnum, faildescr, failargs,
                                                                fail_locs):
         exc = (guard_opnum == rop.GUARD_EXCEPTION or
                guard_opnum == rop.GUARD_NO_EXCEPTION or
                guard_opnum == rop.GUARD_NOT_FORCED)
-        return self.generate_quick_failure(faildescr, failargs, fail_locs, exc)
+        desc_bytes = self.failure_recovery_description(failargs, fail_locs)
+        return GuardToken(faildescr, failargs, fail_locs, exc, desc_bytes)
 
-    def generate_quick_failure(self, faildescr, failargs, fail_locs, exc):
+    def generate_quick_failure(self, mc, faildescr, failargs, fail_locs, exc, desc_bytes):
         """Generate the initial code for handling a failure.  We try to
         keep it as compact as possible.  The idea is that this code is
         executed at most once (and very often, zero times); when
@@ -1069,38 +1289,43 @@
         really handle recovery from this particular failure.
         """
         fail_index = self.cpu.get_fail_descr_number(faildescr)
-        bytes_needed = 20 + 5 * len(failargs)    # conservative estimate
-        if self.mc2.bytes_free() < bytes_needed:
-            self.mc2.make_new_mc()
-        mc = self.mc2._mc
         addr = mc.tell()
         withfloats = False
         for box in failargs:
             if box is not None and box.type == FLOAT:
                 withfloats = True
                 break
-        mc.CALL(rel32(self.failure_recovery_code[exc + 2 * withfloats]))
+        mc.CALL(imm(self.failure_recovery_code[exc + 2 * withfloats]))
         # write tight data that describes the failure recovery
         faildescr._x86_failure_recovery_bytecode = mc.tell()
-        self.write_failure_recovery_description(mc, failargs, fail_locs)
+        for byte in desc_bytes:
+            mc.writechr(ord(byte))
         # write the fail_index too
-        mc.write(packimm32(fail_index))
+        mc.writeimm32(fail_index)
         # for testing the decoding, write a final byte 0xCC
         if not we_are_translated():
             mc.writechr(0xCC)
             faildescr._x86_debug_faillocs = [loc for loc in fail_locs
                                                  if loc is not None]
+
+        # Make sure the recovery stub is at least 16 bytes long (for the
+        # case where we overwrite the recovery stub with a 64-bit absolute
+        # jump)
+        while mc.tell() - addr < 16:
+            mc.writechr(0x00)
         return addr
 
     DESCR_REF       = 0x00
     DESCR_INT       = 0x01
     DESCR_FLOAT     = 0x02
     DESCR_SPECIAL   = 0x03
-    CODE_FROMSTACK  = 4*8
+    # XXX: 4*8 works on i386; should we optimize for that case?
+    CODE_FROMSTACK  = 4*16
     CODE_STOP       = 0 | DESCR_SPECIAL
     CODE_HOLE       = 4 | DESCR_SPECIAL
 
-    def write_failure_recovery_description(self, mc, failargs, locs):
+    def failure_recovery_description(self, failargs, locs):
+        desc_bytes = []
         for i in range(len(failargs)):
             arg = failargs[i]
             if arg is not None:
@@ -1113,24 +1338,30 @@
                 else:
                     raise AssertionError("bogus kind")
                 loc = locs[i]
-                if isinstance(loc, MODRM):
+                if isinstance(loc, StackLoc):
                     n = self.CODE_FROMSTACK//4 + loc.position
                 else:
-                    assert isinstance(loc, REG)
-                    n = loc.op
+                    assert isinstance(loc, RegLoc)
+                    n = loc.value
                 n = kind + 4*n
                 while n > 0x7F:
-                    mc.writechr((n & 0x7F) | 0x80)
+                    desc_bytes.append(chr((n & 0x7F) | 0x80))
                     n >>= 7
             else:
                 n = self.CODE_HOLE
-            mc.writechr(n)
-        mc.writechr(self.CODE_STOP)
+            desc_bytes.append(chr(n))
+        desc_bytes.append(chr(self.CODE_STOP))
         # assert that the fail_boxes lists are big enough
         assert len(failargs) <= self.fail_boxes_int.SIZE
+        return desc_bytes
+
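
A plain-Python round-trip sketch of the variable-length encoding built by failure_recovery_description above (the RPython decoder in rebuild_faillocs_from_descr below differs in detail): each entry packs a 2-bit kind tag into the low bits of kind + 4*n and emits the value 7 bits at a time, with the high bit set on every byte except the last:

    DESCR_INT = 0x01

    def encode_entry(kind, num):
        n = kind + 4 * num
        out = []
        while n > 0x7F:
            out.append((n & 0x7F) | 0x80)
            n >>= 7
        out.append(n)
        return out

    def decode_entry(data, pos):
        code = shift = 0
        while True:
            byte = data[pos]
            pos += 1
            code |= (byte & 0x7F) << shift
            shift += 7
            if not (byte & 0x80):
                break
        return code & 3, code >> 2, pos

    kind, num, _ = decode_entry(encode_entry(DESCR_INT, 1000), 0)
    assert (kind, num) == (DESCR_INT, 1000)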
+    def write_failure_recovery_description(self, mc, failargs, locs):
+        for byte in self.failure_recovery_description(failargs, locs):
+            mc.writechr(ord(byte))
 
     def rebuild_faillocs_from_descr(self, bytecode):
         from pypy.jit.backend.x86.regalloc import X86FrameManager
+        descr_to_box_type = [REF, INT, FLOAT]
         bytecode = rffi.cast(rffi.UCHARP, bytecode)
         arglocs = []
         while 1:
@@ -1155,7 +1386,7 @@
                     size = 2
                 else:
                     size = 1
-                loc = X86FrameManager.frame_pos(code, size)
+                loc = X86FrameManager.frame_pos(code, descr_to_box_type[kind])
             elif code == self.CODE_STOP:
                 break
             elif code == self.CODE_HOLE:
@@ -1165,16 +1396,16 @@
                 kind = code & 3
                 code >>= 2
                 if kind == self.DESCR_FLOAT:
-                    loc = xmm_registers[code]
+                    loc = regloc.XMMREGLOCS[code]
                 else:
-                    loc = registers[code]
+                    loc = regloc.REGLOCS[code]
             arglocs.append(loc)
         return arglocs[:]
 
     @rgc.no_collect
     def grab_frame_values(self, bytecode, frame_addr, allregisters):
         # no malloc allowed here!!
-        self.fail_ebp = allregisters[16 + ebp.op]
+        self.fail_ebp = allregisters[16 + ebp.value]
         num = 0
         value_hi = 0
         while 1:
@@ -1197,7 +1428,7 @@
                 code = (code - self.CODE_FROMSTACK) >> 2
                 stackloc = frame_addr + get_ebp_ofs(code)
                 value = rffi.cast(rffi.LONGP, stackloc)[0]
-                if kind == self.DESCR_FLOAT:
+                if kind == self.DESCR_FLOAT and WORD == 4:
                     value_hi = value
                     value = rffi.cast(rffi.LONGP, stackloc - 4)[0]
             else:
@@ -1211,8 +1442,11 @@
                     break
                 code >>= 2
                 if kind == self.DESCR_FLOAT:
-                    value = allregisters[2*code]
-                    value_hi = allregisters[2*code + 1]
+                    if WORD == 4:
+                        value = allregisters[2*code]
+                        value_hi = allregisters[2*code + 1]
+                    else:
+                        value = allregisters[code]
                 else:
                     value = allregisters[16 + code]
 
@@ -1223,7 +1457,8 @@
                 tgt = self.fail_boxes_ptr.get_addr_for_num(num)
             elif kind == self.DESCR_FLOAT:
                 tgt = self.fail_boxes_float.get_addr_for_num(num)
-                rffi.cast(rffi.LONGP, tgt)[1] = value_hi
+                if WORD == 4:
+                    rffi.cast(rffi.LONGP, tgt)[1] = value_hi
             else:
                 assert 0, "bogus kind"
             rffi.cast(rffi.LONGP, tgt)[0] = value
@@ -1232,7 +1467,8 @@
         if not we_are_translated():
             assert bytecode[4] == 0xCC
         self.fail_boxes_count = num
-        fail_index = rffi.cast(rffi.LONGP, bytecode)[0]
+        fail_index = rffi.cast(rffi.INTP, bytecode)[0]
+        fail_index = rffi.cast(lltype.Signed, fail_index)
         return fail_index
 
     def setup_failure_recovery(self):
@@ -1243,8 +1479,8 @@
             # original value of the registers, optionally the original
             # value of XMM registers, and finally a reference to the
             # recovery bytecode.  See _build_failure_recovery() for details.
-            stack_at_ebp = registers[ebp.op]
-            bytecode = rffi.cast(rffi.UCHARP, registers[8])
+            stack_at_ebp = registers[ebp.value]
+            bytecode = rffi.cast(rffi.UCHARP, registers[self.cpu.NUM_REGS])
             allregisters = rffi.ptradd(registers, -16)
             return self.grab_frame_values(bytecode, stack_at_ebp, allregisters)
 
@@ -1259,23 +1495,23 @@
                                          self.failure_recovery_func)
         failure_recovery_func = rffi.cast(lltype.Signed,
                                           failure_recovery_func)
-        mc = self.mc2._mc
+        mc = self.mc._mc
         # Assume that we are called at the beginning, when there is no risk
         # that 'mc' runs out of space.  Checked by asserts in mc.write().
         recovery_addr = mc.tell()
-        mc.PUSH(edi)
-        mc.PUSH(esi)
-        mc.PUSH(ebp)
-        mc.PUSH(esp)  # <-- not really used, but needed to take up the space
-        mc.PUSH(ebx)
-        mc.PUSH(edx)
-        mc.PUSH(ecx)
-        mc.PUSH(eax)
-        mc.MOV(esi, esp)
+
+        # Push all general purpose registers
+        for gpr in range(self.cpu.NUM_REGS-1, -1, -1):
+            mc.PUSH_r(gpr)
+
+        # ebx/rbx is callee-save in both i386 and x86-64
+        mc.MOV_rr(ebx.value, esp.value)
+
         if withfloats:
-            mc.SUB(esp, imm(8*8))
-            for i in range(8):
-                mc.MOVSD(mem64(esp, 8*i), xmm_registers[i])
+            # Push all float registers
+            mc.SUB_ri(esp.value, self.cpu.NUM_REGS*8)
+            for i in range(self.cpu.NUM_REGS):
+                mc.MOVSD_sx(8*i, i)
 
         # we call a provided function that will
         # - call our on_leave_jitted_hook which will mark
@@ -1283,7 +1519,7 @@
         #   avoid unwarranted freeing
         # - optionally save exception depending on the flag
         addr = self.cpu.get_on_leave_jitted_int(save_exception=exc)
-        mc.CALL(rel32(addr))
+        mc.CALL(imm(addr))
 
         # the following call saves all values from the stack and from
         # registers to the right 'fail_boxes_<type>' location.
@@ -1293,50 +1529,58 @@
         # bytecode, pushed just before by the CALL instruction written by
         # generate_quick_failure().  XXX misaligned stack in the call, but
         # it's ok because failure_recovery_func is not calling anything more
-        mc.PUSH(esi)
-        mc.CALL(rel32(failure_recovery_func))
+
+        # XXX
+        if IS_X86_32:
+            mc.PUSH_r(ebx.value)
+        elif IS_X86_64:
+            mc.MOV_rr(edi.value, ebx.value)
+            # XXX: Correct to only align the stack on 64-bit?
+            mc.AND_ri(esp.value, -16)
+        else:
+            raise AssertionError("Shouldn't happen")
+
+        mc.CALL(imm(failure_recovery_func))
         # returns in eax the fail_index
 
         # now we return from the complete frame, which starts from
-        # _assemble_bootstrap_code().  The LEA below throws away most
-        # of the frame, including all the PUSHes that we did just above.
-        mc.LEA(esp, addr_add(ebp, imm(-3 * WORD)))
-        mc.POP(edi)    # [ebp-12]
-        mc.POP(esi)    # [ebp-8]
-        mc.POP(ebx)    # [ebp-4]
-        mc.POP(ebp)    # [ebp]
-        mc.RET()
-        self.mc2.done()
+        # _assemble_bootstrap_code().  The LEA in _call_footer below throws
+        # away most of the frame, including all the PUSHes that we did just
+        # above.
+
+        self._call_footer()
+        self.mc.done()
         self.failure_recovery_code[exc + 2 * withfloats] = recovery_addr
 
     def generate_failure(self, fail_index, locs, exc, locs_are_ref):
-        mc = self.mc
+        self.mc._mc.begin_reuse_scratch_register()
         for i in range(len(locs)):
             loc = locs[i]
-            if isinstance(loc, REG):
-                if loc.width == 8:
+            if isinstance(loc, RegLoc):
+                if loc.is_xmm:
                     adr = self.fail_boxes_float.get_addr_for_num(i)
-                    mc.MOVSD(heap64(adr), loc)
+                    self.mc.MOVSD(heap(adr), loc)
                 else:
                     if locs_are_ref[i]:
                         adr = self.fail_boxes_ptr.get_addr_for_num(i)
                     else:
                         adr = self.fail_boxes_int.get_addr_for_num(i)
-                    mc.MOV(heap(adr), loc)
+                    self.mc.MOV(heap(adr), loc)
         for i in range(len(locs)):
             loc = locs[i]
-            if not isinstance(loc, REG):
-                if loc.width == 8:
-                    mc.MOVSD(xmm0, loc)
+            if not isinstance(loc, RegLoc):
+                if isinstance(loc, StackLoc) and loc.type == FLOAT:
+                    self.mc.MOVSD_xb(xmm0.value, loc.value)
                     adr = self.fail_boxes_float.get_addr_for_num(i)
-                    mc.MOVSD(heap64(adr), xmm0)
+                    self.mc.MOVSD(heap(adr), xmm0)
                 else:
                     if locs_are_ref[i]:
                         adr = self.fail_boxes_ptr.get_addr_for_num(i)
                     else:
                         adr = self.fail_boxes_int.get_addr_for_num(i)
-                    mc.MOV(eax, loc)
-                    mc.MOV(heap(adr), eax)
+                    self.mc.MOV(eax, loc)
+                    self.mc.MOV(heap(adr), eax)
+        self.mc._mc.end_reuse_scratch_register()
 
         # we call a provided function that will
         # - call our on_leave_jitted_hook which will mark
@@ -1344,28 +1588,31 @@
         #   avoid unwarranted freeing
         # - optionally save exception depending on the flag
         addr = self.cpu.get_on_leave_jitted_int(save_exception=exc)
-        mc.CALL(rel32(addr))
+        self.mc.CALL(imm(addr))
 
-        mc.LEA(esp, addr_add(ebp, imm(-3 * WORD)))
-        mc.MOV(eax, imm(fail_index))
-        mc.POP(edi)    # [ebp-12]
-        mc.POP(esi)    # [ebp-8]
-        mc.POP(ebx)    # [ebp-4]
-        mc.POP(ebp)    # [ebp]
-        mc.RET()
-
-    @specialize.arg(2)
-    def implement_guard(self, addr, emit_jump):
-        emit_jump(rel32(addr))
+        self.mc.MOV_ri(eax.value, fail_index)
+
+        # exit function
+        self._call_footer()
+
+    def implement_guard(self, guard_token, condition=None):
+        self.mc.reserve_bytes(guard_token.recovery_stub_size())
+        self.pending_guard_tokens.append(guard_token)
+        # XXX: These jumps are patched later; the self.mc.tell() values
+        # are just dummy targets
+        if condition:
+            self.mc.J_il(rx86.Conditions[condition], self.mc.tell())
+        else:
+            self.mc.JMP_l(self.mc.tell())
         return self.mc.tell() - 4
 
     def genop_call(self, op, arglocs, resloc):
         sizeloc = arglocs[0]
-        assert isinstance(sizeloc, IMM32)
+        assert isinstance(sizeloc, ImmedLoc)
         size = sizeloc.value
 
         if isinstance(op.args[0], Const):
-            x = rel32(op.args[0].getint())
+            x = imm(op.args[0].getint())
         else:
             x = arglocs[1]
         if x is eax:
@@ -1375,35 +1622,35 @@
         
         self._emit_call(x, arglocs, 2, tmp=tmp)
 
-        if isinstance(resloc, MODRM64):
-            self.mc.FSTP(resloc)
+        if isinstance(resloc, StackLoc) and resloc.width == 8 and IS_X86_32:
+            self.mc.FSTP_b(resloc.value)
         elif size == 1:
-            self.mc.AND(eax, imm(0xff))
+            self.mc.AND_ri(eax.value, 0xff)
         elif size == 2:
-            self.mc.AND(eax, imm(0xffff))
+            self.mc.AND_ri(eax.value, 0xffff)
     
-    def genop_guard_call_may_force(self, op, guard_op, addr,
+    def genop_guard_call_may_force(self, op, guard_op, guard_token,
                                    arglocs, result_loc):
         faildescr = guard_op.descr
         fail_index = self.cpu.get_fail_descr_number(faildescr)
-        self.mc.MOV(mem(ebp, FORCE_INDEX_OFS), imm(fail_index))
+        self.mc.MOV_bi(FORCE_INDEX_OFS, fail_index)
         self.genop_call(op, arglocs, result_loc)
-        self.mc.CMP(mem(ebp, FORCE_INDEX_OFS), imm(0))
-        return self.implement_guard(addr, self.mc.JL)
+        self.mc.CMP_bi(FORCE_INDEX_OFS, 0)
+        return self.implement_guard(guard_token, 'L')
 
-    def genop_guard_call_assembler(self, op, guard_op, addr,
+    def genop_guard_call_assembler(self, op, guard_op, guard_token,
                                    arglocs, result_loc):
         faildescr = guard_op.descr
         fail_index = self.cpu.get_fail_descr_number(faildescr)
-        self.mc.MOV(mem(ebp, FORCE_INDEX_OFS), imm(fail_index))
+        self.mc.MOV_bi(FORCE_INDEX_OFS, fail_index)
         descr = op.descr
         assert isinstance(descr, LoopToken)
         assert len(arglocs) - 2 == len(descr._x86_arglocs[0])
         #
         # Write a call to the direct_bootstrap_code of the target assembler
-        self._emit_call(rel32(descr._x86_direct_bootstrap_code), arglocs, 2,
+        self._emit_call(imm(descr._x86_direct_bootstrap_code), arglocs, 2,
                         tmp=eax)
-        mc = self._start_block()
+        self.mc.ensure_bytes_available(256)
         if op.result is None:
             assert result_loc is None
             value = self.cpu.done_with_this_frame_void_v
@@ -1419,26 +1666,27 @@
                 value = self.cpu.done_with_this_frame_float_v
             else:
                 raise AssertionError(kind)
-        mc.CMP(eax, imm(value))
-        mc.JE(rel8_patched_later)    # goto B if we get 'done_with_this_frame'
-        je_location = mc.get_relative_pos()
+        self.mc.CMP_ri(eax.value, value)
+        # patched later
+        self.mc.J_il8(rx86.Conditions['E'], 0) # goto B if we get 'done_with_this_frame'
+        je_location = self.mc.get_relative_pos()
         #
         # Path A: use assembler_helper_adr
         jd = descr.outermost_jitdriver_sd
         assert jd is not None
         asm_helper_adr = self.cpu.cast_adr_to_int(jd.assembler_helper_adr)
-        self._emit_call(rel32(asm_helper_adr), [eax, arglocs[1]], 0,
-                        tmp=ecx, force_mc=True, mc=mc)
-        if isinstance(result_loc, MODRM64):
-            mc.FSTP(result_loc)
+        self._emit_call(imm(asm_helper_adr), [eax, arglocs[1]], 0,
+                        tmp=ecx)
+        if IS_X86_32 and isinstance(result_loc, StackLoc) and result_loc.type == FLOAT:
+            self.mc.FSTP_b(result_loc.value)
         #else: result_loc is already either eax or None, checked below
-        mc.JMP(rel8_patched_later)     # done
-        jmp_location = mc.get_relative_pos()
+        self.mc.JMP_l8(0) # jump to done, patched later
+        jmp_location = self.mc.get_relative_pos()
         #
         # Path B: fast path.  Must load the return value, and reset the token
         offset = jmp_location - je_location
         assert 0 < offset <= 127
-        mc.overwrite(je_location - 1, [chr(offset)])
+        self.mc.overwrite(je_location - 1, [chr(offset)])
         #
         # Reset the vable token --- XXX really too much special logic here:-(
         if jd.index_of_virtualizable >= 0:
@@ -1446,8 +1694,8 @@
             fielddescr = jd.vable_token_descr
             assert isinstance(fielddescr, BaseFieldDescr)
             ofs = fielddescr.offset
-            mc.MOV(eax, arglocs[1])
-            mc.MOV(addr_add(eax, imm(ofs)), imm(0))
+            self.mc.MOV(eax, arglocs[1])
+            self.mc.MOV_mi((eax.value, ofs), 0)
             # in the line above, TOKEN_NONE = 0
         #
         if op.result is not None:
@@ -1456,27 +1704,26 @@
             if kind == FLOAT:
                 xmmtmp = X86XMMRegisterManager.all_regs[0]
                 adr = self.fail_boxes_float.get_addr_for_num(0)
-                mc.MOVSD(xmmtmp, heap64(adr))
-                mc.MOVSD(result_loc, xmmtmp)
+                self.mc.MOVSD(xmmtmp, heap(adr))
+                self.mc.MOVSD(result_loc, xmmtmp)
             else:
                 assert result_loc is eax
                 if kind == INT:
                     adr = self.fail_boxes_int.get_addr_for_num(0)
-                    mc.MOV(eax, heap(adr))
+                    self.mc.MOV(eax, heap(adr))
                 elif kind == REF:
                     adr = self.fail_boxes_ptr.get_addr_for_num(0)
-                    mc.XOR(eax, eax)
-                    mc.XCHG(eax, heap(adr))
+                    self.mc.XOR_rr(eax.value, eax.value)
+                    self.mc.XCHG(eax, heap(adr))
                 else:
                     raise AssertionError(kind)
         #
         # Here we join Path A and Path B again
-        offset = mc.get_relative_pos() - jmp_location
+        offset = self.mc.get_relative_pos() - jmp_location
         assert 0 <= offset <= 127
-        mc.overwrite(jmp_location - 1, [chr(offset)])
-        self._stop_block()
-        self.mc.CMP(mem(ebp, FORCE_INDEX_OFS), imm(0))
-        return self.implement_guard(addr, self.mc.JL)
+        self.mc.overwrite(jmp_location - 1, [chr(offset)])
+        self.mc.CMP_bi(FORCE_INDEX_OFS, 0)
+        return self.implement_guard(guard_token, 'L')
 
     def genop_discard_cond_call_gc_wb(self, op, arglocs):
         # use 'mc._mc' directly instead of 'mc', to avoid
@@ -1486,31 +1733,41 @@
             cls = self.cpu.gc_ll_descr.has_write_barrier_class()
             assert cls is not None and isinstance(descr, cls)
         loc_base = arglocs[0]
-        mc = self._start_block()
-        mc.TEST(mem8(loc_base, descr.jit_wb_if_flag_byteofs),
-                imm8(descr.jit_wb_if_flag_singlebyte))
-        mc.JZ(rel8_patched_later)
-        jz_location = mc.get_relative_pos()
+        self.mc.ensure_bytes_available(256)
+        self.mc.TEST8_mi((loc_base.value, descr.jit_wb_if_flag_byteofs),
+                descr.jit_wb_if_flag_singlebyte)
+        self.mc.J_il8(rx86.Conditions['Z'], 0) # patched later
+        jz_location = self.mc.get_relative_pos()
         # the following is supposed to be the slow path, so whenever possible
         # we choose the most compact encoding over the most efficient one.
         for i in range(len(arglocs)-1, -1, -1):
-            mc.PUSH(arglocs[i])
+            self.mc.PUSH(arglocs[i])
+        
+        if IS_X86_64:
+            # We clobber these registers to pass the arguments, but that's
+            # okay, because consider_cond_call_gc_wb makes sure that any
+            # caller-save registers with values in them are present in arglocs,
+            # so they are saved on the stack above and restored below 
+            self.mc.MOV_rs(edi.value, 0)
+            self.mc.MOV_rs(esi.value, 8)
+
         # misaligned stack in the call, but it's ok because the write barrier
         # is not going to call anything more.  Also, this assumes that the
         # write barrier does not touch the xmm registers.
-        mc.CALL(rel32(descr.get_write_barrier_fn(self.cpu)))
+        self.mc.CALL(imm(descr.get_write_barrier_fn(self.cpu)))
         for i in range(len(arglocs)):
             loc = arglocs[i]
-            assert isinstance(loc, REG)
-            mc.POP(loc)
+            assert isinstance(loc, RegLoc)
+            self.mc.POP(loc)
         # patch the JZ above
-        offset = mc.get_relative_pos() - jz_location
+        offset = self.mc.get_relative_pos() - jz_location
         assert 0 < offset <= 127
-        mc.overwrite(jz_location-1, [chr(offset)])
-        self._stop_block()
+        self.mc.overwrite(jz_location-1, [chr(offset)])
 
     def genop_force_token(self, op, arglocs, resloc):
-        self.mc.LEA(resloc, mem(ebp, FORCE_INDEX_OFS))
+        # RegAlloc.consider_force_token ensures this:
+        assert isinstance(resloc, RegLoc)
+        self.mc.LEA_rb(resloc.value, FORCE_INDEX_OFS)
 
     def not_implemented_op_discard(self, op, arglocs):
         msg = "not implemented operation: %s" % op.getopname()
@@ -1538,17 +1795,16 @@
         return loop_token._x86_arglocs
 
     def closing_jump(self, loop_token):
-        self.mc.JMP(rel32(loop_token._x86_loop_code))
+        self.mc.JMP(imm(loop_token._x86_loop_code))
 
     def malloc_cond_fixedsize(self, nursery_free_adr, nursery_top_adr,
                               size, tid):
-        # don't use self.mc
-        mc = self._start_block()
-        mc.MOV(eax, heap(nursery_free_adr))
-        mc.LEA(edx, addr_add(eax, imm(size)))
-        mc.CMP(edx, heap(nursery_top_adr))
-        mc.JNA(rel8_patched_later)
-        jmp_adr = mc.get_relative_pos()
+        self.mc.ensure_bytes_available(256)
+        self.mc.MOV(eax, heap(nursery_free_adr))
+        self.mc.LEA_rm(edx.value, (eax.value, size))
+        self.mc.CMP(edx, heap(nursery_top_adr))
+        self.mc.J_il8(rx86.Conditions['NA'], 0) # patched later
+        jmp_adr = self.mc.get_relative_pos()
 
         # See comments in _build_malloc_fixedsize_slowpath for the
         # details of the two helper functions that we are calling below.
@@ -1564,17 +1820,16 @@
         # reserve room for the argument to the real malloc and the
         # 8 saved XMM regs
         self._regalloc.reserve_param(1+16)
-        mc.CALL(rel32(slowpath_addr1))
+        self.mc.CALL(imm(slowpath_addr1))
         self.mark_gc_roots()
         slowpath_addr2 = self.malloc_fixedsize_slowpath2
-        mc.CALL(rel32(slowpath_addr2))
+        self.mc.CALL(imm(slowpath_addr2))
 
-        offset = mc.get_relative_pos() - jmp_adr
+        offset = self.mc.get_relative_pos() - jmp_adr
         assert 0 < offset <= 127
-        mc.overwrite(jmp_adr-1, [chr(offset)])
-        mc.MOV(addr_add(eax, imm(0)), imm(tid))
-        mc.MOV(heap(nursery_free_adr), edx)
-        self._stop_block()
+        self.mc.overwrite(jmp_adr-1, [chr(offset)])
+        self.mc.MOV_mi((eax.value, 0), tid)
+        self.mc.MOV(heap(nursery_free_adr), edx)
         
 genop_discard_list = [Assembler386.not_implemented_op_discard] * rop._LAST
 genop_list = [Assembler386.not_implemented_op] * rop._LAST
@@ -1594,32 +1849,20 @@
         num = getattr(rop, opname.upper())
         genop_list[num] = value
 
-def new_addr_add(heap, mem, memsib):
-    def addr_add(reg_or_imm1, reg_or_imm2, offset=0, scale=0):
-        if isinstance(reg_or_imm1, IMM32):
-            if isinstance(reg_or_imm2, IMM32):
-                return heap(reg_or_imm1.value + offset +
-                            (reg_or_imm2.value << scale))
-            else:
-                return memsib(None, reg_or_imm2, scale, reg_or_imm1.value + offset)
-        else:
-            if isinstance(reg_or_imm2, IMM32):
-                return mem(reg_or_imm1, offset + (reg_or_imm2.value << scale))
-            else:
-                return memsib(reg_or_imm1, reg_or_imm2, scale, offset)
-    return addr_add
-
-addr8_add = new_addr_add(heap8, mem8, memSIB8)
-addr_add = new_addr_add(heap, mem, memSIB)
-addr64_add = new_addr_add(heap64, mem64, memSIB64)
-
-def addr_add_const(reg_or_imm1, offset):
-    if isinstance(reg_or_imm1, IMM32):
-        return heap(reg_or_imm1.value + offset)
-    else:
-        return mem(reg_or_imm1, offset)
-
 def round_up_to_4(size):
     if size < 4:
         return 4
     return size
+
+# XXX: ri386 migration shims:
+def addr_add(reg_or_imm1, reg_or_imm2, offset=0, scale=0):
+    return AddressLoc(reg_or_imm1, reg_or_imm2, scale, offset)
+
+def addr_add_const(reg_or_imm1, offset):
+    return AddressLoc(reg_or_imm1, ImmedLoc(0), 0, offset)
+
+def mem(loc, offset):
+    return AddressLoc(loc, ImmedLoc(0), 0, offset)
+
+def heap(addr):
+    return AddressLoc(ImmedLoc(addr), ImmedLoc(0), 0, 0)
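
A note on the pattern above: both the write-barrier fast path and
malloc_cond_fixedsize emit a forward conditional jump with J_il8 and a
placeholder displacement of 0, remember get_relative_pos(), and patch the
displacement byte once the slow path is assembled.  A minimal,
self-contained sketch of that trick follows; the Buf class is a toy
stand-in, and only writechar/get_relative_pos/overwrite mirror the real
code builder:

    class Buf:
        def __init__(self):
            self.data = []
        def writechar(self, char):
            self.data.append(char)
        def get_relative_pos(self):
            return len(self.data)
        def overwrite(self, pos, listofchars):
            for char in listofchars:
                self.data[pos] = char
                pos += 1

    mc = Buf()
    mc.writechar(chr(0x74))             # JZ opcode
    mc.writechar(chr(0))                # rel8 placeholder, patched later
    jz_location = mc.get_relative_pos()
    for _ in range(5):                  # pretend this is the slow path
        mc.writechar(chr(0x90))         # NOP
    offset = mc.get_relative_pos() - jz_location
    assert 0 < offset <= 127            # rel8 is a signed byte
    mc.overwrite(jz_location - 1, [chr(offset)])
    assert ord(mc.data[1]) == 5         # the JZ now skips the slow path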

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/codebuf.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/codebuf.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/codebuf.py	Tue Aug 10 20:07:15 2010
@@ -2,12 +2,20 @@
 import os, sys
 from pypy.rpython.lltypesystem import lltype, rffi
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
-from pypy.jit.backend.x86.ri386 import I386CodeBuilder
+from pypy.jit.backend.x86.rx86 import X86_32_CodeBuilder, X86_64_CodeBuilder
+from pypy.jit.backend.x86.regloc import LocationCodeBuilder
 from pypy.rlib.rmmap import PTR, alloc, free
 from pypy.rlib.debug import make_sure_not_resized
+from pypy.jit.backend.x86.arch import IS_X86_32, IS_X86_64
+from pypy.rlib.objectmodel import we_are_translated
 
+# XXX: Seems nasty to change the superclass of InMemoryCodeBuilder like this
+if IS_X86_32:
+    codebuilder_cls = X86_32_CodeBuilder
+elif IS_X86_64:
+    codebuilder_cls = X86_64_CodeBuilder
 
-class InMemoryCodeBuilder(I386CodeBuilder):
+class InMemoryCodeBuilder(codebuilder_cls, LocationCodeBuilder):
     _last_dump_start = 0
 
     def __init__(self, start, end):
@@ -31,13 +39,15 @@
     def write(self, listofchars):
         self._pos = self.overwrite(self._pos, listofchars)
 
-    def writechr(self, n):
-        # purely for performance: don't make the one-element list [chr(n)]
+    def writechar(self, char):
         pos = self._pos
         assert pos + 1 <= self._size
-        self._data[pos] = chr(n)
+        self._data[pos] = char
         self._pos = pos + 1
 
+    def writechr(self, n):
+        self.writechar(chr(n))
+
     def get_relative_pos(self):
         return self._pos
 
@@ -50,11 +60,6 @@
         self._pos = pos
         self._last_dump_start = pos
 
-    def execute(self, arg1, arg2):
-        # XXX old testing stuff
-        fnptr = rffi.cast(lltype.Ptr(BINARYFN), self._data)
-        return fnptr(arg1, arg2)
-
     def done(self):
         # normally, no special action is needed here
         if machine_code_dumper.enabled:
@@ -77,9 +82,6 @@
         valgrind.discard_translations(self._data, self._size)
 
 
-BINARYFN = lltype.FuncType([lltype.Signed, lltype.Signed], lltype.Signed)
-
-
 class MachineCodeDumper:
     enabled = True
     log_fd = -1
@@ -107,7 +109,10 @@
                 return False
             # log the executable name
             from pypy.jit.backend.hlinfo import highleveljitinfo
-            os.write(self.log_fd, 'BACKEND i386\n')
+            if IS_X86_32:
+                os.write(self.log_fd, 'BACKEND x86\n')
+            elif IS_X86_64:
+                os.write(self.log_fd, 'BACKEND x86_64\n')
             if highleveljitinfo.sys_executable:
                 os.write(self.log_fd, 'SYS_EXECUTABLE %s\n' % (
                     highleveljitinfo.sys_executable,))
@@ -137,6 +142,12 @@
 
     def __init__(self, map_size):
         data = alloc(map_size)
+        if IS_X86_64 and not we_are_translated():
+            # Hack to make sure that mcs are not within 32 bits of one
+            # another for testing purposes
+            from pypy.rlib.rmmap import hint
+            hint.pos += 0xFFFFFFFF
+
         self._init(data, map_size)
 
     def __del__(self):
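
The hint.pos bump above is there because a rel32 call or jump encodes only
a signed 32-bit displacement: by pushing successive mmap hints more than
0xFFFFFFFF apart, two code blocks can never reach each other with the
short form, so the tests exercise the 64-bit absolute-address paths.  A
sketch of the constraint (fits_in_rel32 and the addresses are
illustrative, not part of this code):

    def fits_in_rel32(source, target):
        displacement = target - source
        return -2**31 <= displacement < 2**31

    block_a = 0x100000000
    block_b = block_a + 0xFFFFFFFF + 0x1000   # past the bumped hint
    assert fits_in_rel32(block_a, block_a + 0x1000)
    assert not fits_in_rel32(block_a, block_b)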

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/jump.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/jump.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/jump.py	Tue Aug 10 20:07:15 2010
@@ -1,23 +1,6 @@
 import sys
 from pypy.tool.pairtype import extendabletype
-from pypy.jit.backend.x86.ri386 import *
-
-class __extend__(OPERAND):
-    __metaclass__ = extendabletype
-    def _getregkey(self):
-        raise AssertionError("should only happen to registers and frame "
-                             "positions")
-
-class __extend__(REG):
-    __metaclass__ = extendabletype
-    def _getregkey(self):
-        return ~self.op
-
-class __extend__(MODRM):
-    __metaclass__ = extendabletype
-    def _getregkey(self):
-        return self.position
-
+from pypy.jit.backend.x86.regloc import ImmedLoc, StackLoc
 
 def remap_frame_layout(assembler, src_locations, dst_locations, tmpreg):
     pending_dests = len(dst_locations)
@@ -27,7 +10,7 @@
         srccount[dst._getregkey()] = 0
     for i in range(len(dst_locations)):
         src = src_locations[i]
-        if isinstance(src, IMM32):
+        if isinstance(src, ImmedLoc):
             continue
         key = src._getregkey()
         if key in srccount:
@@ -46,7 +29,7 @@
                 srccount[key] = -1       # means "it's done"
                 pending_dests -= 1
                 src = src_locations[i]
-                if not isinstance(src, IMM32):
+                if not isinstance(src, ImmedLoc):
                     key = src._getregkey()
                     if key in srccount:
                         srccount[key] -= 1
@@ -80,7 +63,7 @@
             assert pending_dests == 0
 
 def _move(assembler, src, dst, tmpreg):
-    if isinstance(dst, MODRM) and isinstance(src, MODRM):
+    if dst.is_memory_reference() and src.is_memory_reference():
         assembler.regalloc_mov(src, tmpreg)
         src = tmpreg
     assembler.regalloc_mov(src, dst)
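
The memory/memory special case in _move exists because x86 has no
memory-to-memory MOV: a stack-to-stack transfer has to bounce through a
scratch register.  A self-contained illustration (Loc and
LoggingAssembler are toy stand-ins; only is_memory_reference() and
regalloc_mov() mirror the real interface):

    class Loc:
        def __init__(self, name, in_memory):
            self.name = name
            self._in_memory = in_memory
        def is_memory_reference(self):
            return self._in_memory

    class LoggingAssembler:
        def __init__(self):
            self.ops = []
        def regalloc_mov(self, src, dst):
            self.ops.append(('mov', src.name, dst.name))

    def move(assembler, src, dst, tmpreg):   # same logic as _move
        if dst.is_memory_reference() and src.is_memory_reference():
            assembler.regalloc_mov(src, tmpreg)
            src = tmpreg
        assembler.regalloc_mov(src, dst)

    asm = LoggingAssembler()
    move(asm, Loc('[ebp-8]', True), Loc('[ebp-12]', True),
         Loc('edx', False))
    assert asm.ops == [('mov', '[ebp-8]', 'edx'),
                       ('mov', 'edx', '[ebp-12]')]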

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/regalloc.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/regalloc.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/regalloc.py	Tue Aug 10 20:07:15 2010
@@ -5,7 +5,7 @@
 from pypy.jit.metainterp.history import (Box, Const, ConstInt, ConstPtr,
                                          ResOperation, BoxPtr,
                                          LoopToken, INT, REF, FLOAT)
-from pypy.jit.backend.x86.ri386 import *
+from pypy.jit.backend.x86.regloc import *
 from pypy.rpython.lltypesystem import lltype, ll2ctypes, rffi, rstr
 from pypy.rlib.objectmodel import we_are_translated
 from pypy.rlib import rgc
@@ -17,16 +17,7 @@
 from pypy.jit.backend.llsupport.descr import BaseCallDescr, BaseSizeDescr
 from pypy.jit.backend.llsupport.regalloc import FrameManager, RegisterManager,\
      TempBox
-
-WORD = 4
-FRAME_FIXED_SIZE = 5     # ebp + ebx + esi + edi + force_index = 5 words
-FORCE_INDEX_OFS = -4*WORD
-
-width_of_type = {
-    INT : 1,
-    REF : 1,
-    FLOAT : 2,
-    }
+from pypy.jit.backend.x86.arch import WORD, FRAME_FIXED_SIZE, IS_X86_32, IS_X86_64
 
 class X86RegisterManager(RegisterManager):
 
@@ -50,12 +41,19 @@
             print "convert_to_imm: got a %s" % c
             raise AssertionError
 
+class X86_64_RegisterManager(X86RegisterManager):
+    # r11 omitted because it's used as scratch
+    all_regs = [eax, ecx, edx, ebx, esi, edi, r8, r9, r10, r12, r13, r14, r15]
+    no_lower_byte_regs = []
+    save_around_call_regs = [eax, ecx, edx, esi, edi, r8, r9, r10]
+
 
 class FloatConstants(object):
     BASE_CONSTANT_SIZE = 1000
 
     def __init__(self):
         self.cur_array_free = 0
+        self.const_id = 0
 
     def _get_new_array(self):
         n = self.BASE_CONSTANT_SIZE
@@ -71,7 +69,8 @@
         n = self.cur_array_free - 1
         arr[n] = floatval
         self.cur_array_free = n
-        return rffi.cast(lltype.Signed, arr) + n * 8
+        self.const_id += 1
+        return (self.const_id, rffi.cast(lltype.Signed, arr) + n * 8)
 
 
 class X86XMMRegisterManager(RegisterManager):
@@ -80,7 +79,6 @@
     all_regs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
     # we never need lower byte I hope
     save_around_call_regs = all_regs
-    reg_width = 2
 
     def __init__(self, longevity, frame_manager=None, assembler=None):
         RegisterManager.__init__(self, longevity, frame_manager=frame_manager,
@@ -93,28 +91,36 @@
             self.float_constants = assembler._float_constants
 
     def convert_to_imm(self, c):
-        adr = self.float_constants.record_float(c.getfloat())
-        return heap64(adr)
+        const_id, adr = self.float_constants.record_float(c.getfloat())
+        return ConstFloatLoc(adr, const_id)
         
     def after_call(self, v):
         # the result is stored in st0, but we don't have this around,
         # so genop_call will move it to some frame location immediately
         # after the call
-        return self.frame_manager.loc(v, 2)
+        return self.frame_manager.loc(v)
 
-class X86FrameManager(FrameManager):
+class X86_64_XMMRegisterManager(X86XMMRegisterManager):
+    # xmm15 reserved for scratch use
+    all_regs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14]
+    save_around_call_regs = all_regs
+
+    def call_result_location(self, v):
+        return xmm0
+
+    def after_call(self, v):
+        # We use RegisterManager's implementation, since X86XMMRegisterManager
+        # places the result on the stack, which we don't need to do when the
+        # calling convention places the result in xmm0
+        return RegisterManager.after_call(self, v)
 
+class X86FrameManager(FrameManager):
     @staticmethod
-    def frame_pos(i, size):
-        if size == 1:
-            res = mem(ebp, get_ebp_ofs(i))
-        elif size == 2:
-            res = mem64(ebp, get_ebp_ofs(i + 1))
-        else:
-            print "Unimplemented size %d" % i
-            raise NotImplementedError("unimplemented size %d" % i)
-        res.position = i
-        return res
+    def frame_pos(i, box_type):
+        if IS_X86_32 and box_type == FLOAT:
+            return StackLoc(i, get_ebp_ofs(i+1), 2, box_type)
+        else:
+            return StackLoc(i, get_ebp_ofs(i), 1, box_type)
 
 class RegAlloc(object):
     exc = False
@@ -135,11 +141,21 @@
         # compute longevity of variables
         longevity = self._compute_vars_longevity(inputargs, operations)
         self.longevity = longevity
-        self.rm = X86RegisterManager(longevity,
-                                     frame_manager = self.fm,
-                                     assembler = self.assembler)
-        self.xrm = X86XMMRegisterManager(longevity, frame_manager = self.fm,
-                                         assembler = self.assembler)
+        # XXX
+        if cpu.WORD == 4:
+            gpr_reg_mgr_cls = X86RegisterManager
+            xmm_reg_mgr_cls = X86XMMRegisterManager
+        elif cpu.WORD == 8:
+            gpr_reg_mgr_cls = X86_64_RegisterManager
+            xmm_reg_mgr_cls = X86_64_XMMRegisterManager
+        else:
+            raise AssertionError("Word size should be 4 or 8")
+            
+        self.rm = gpr_reg_mgr_cls(longevity,
+                                  frame_manager = self.fm,
+                                  assembler = self.assembler)
+        self.xrm = xmm_reg_mgr_cls(longevity, frame_manager = self.fm,
+                                   assembler = self.assembler)
 
     def prepare_loop(self, inputargs, operations, looptoken):
         self._prepare(inputargs, operations)
@@ -184,7 +200,7 @@
             if reg:
                 loc = reg
             else:
-                loc = self.fm.loc(arg, width_of_type[arg.type])
+                loc = self.fm.loc(arg)
             if arg.type == FLOAT:
                 floatlocs[i] = loc
             else:
@@ -252,23 +268,23 @@
             arg = inputargs[i]
             i += 1
             if arg.type == FLOAT:
-                if isinstance(loc, REG):
+                if isinstance(loc, RegLoc):
                     self.xrm.reg_bindings[arg] = loc
                     used[loc] = None
                 else:
                     self.fm.frame_bindings[arg] = loc
             else:
-                if isinstance(loc, REG):
+                if isinstance(loc, RegLoc):
                     self.rm.reg_bindings[arg] = loc
                     used[loc] = None
                 else:
                     self.fm.frame_bindings[arg] = loc
         self.rm.free_regs = []
-        for reg in X86RegisterManager.all_regs:
+        for reg in self.rm.all_regs:
             if reg not in used:
                 self.rm.free_regs.append(reg)
         self.xrm.free_regs = []
-        for reg in X86XMMRegisterManager.all_regs:
+        for reg in self.xrm.all_regs:
             if reg not in used:
                 self.xrm.free_regs.append(reg)
         # note: we need to make a copy of inputargs because possibly_free_vars
@@ -647,7 +663,7 @@
         vable_index = jd.index_of_virtualizable
         if vable_index >= 0:
             self.rm._sync_var(op.args[vable_index])
-            vable = self.fm.loc(op.args[vable_index], 1)
+            vable = self.fm.loc(op.args[vable_index])
         else:
             vable = imm(0)
         self._call(op, [imm(size), vable] +
@@ -671,7 +687,7 @@
         # function, a GC write barrier, is known not to touch them.
         # See remember_young_pointer() in rpython/memory/gc/generation.py.
         for v, reg in self.rm.reg_bindings.items():
-            if ((reg is eax or reg is ecx or reg is edx)
+            if (reg in self.rm.save_around_call_regs
                 and self.rm.stays_alive(v)):
                 arglocs.append(reg)
         self.PerformDiscard(op, arglocs)
@@ -810,7 +826,7 @@
 
     def consider_setfield_gc(self, op):
         ofs_loc, size_loc, ptr = self._unpack_fielddescr(op.descr)
-        assert isinstance(size_loc, IMM32)
+        assert isinstance(size_loc, ImmedLoc)
         if size_loc.value == 1:
             need_lower_byte = True
         else:
@@ -951,7 +967,7 @@
         shape = gcrootmap.get_basic_shape()
         for v, val in self.fm.frame_bindings.items():
             if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
-                assert isinstance(val, MODRM)
+                assert isinstance(val, StackLoc)
                 gcrootmap.add_ebp_offset(shape, get_ebp_ofs(val.position))
         for v, reg in self.rm.reg_bindings.items():
             if reg is eax:
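
Replacing the hard-coded (eax, ecx, edx) test with
self.rm.save_around_call_regs matters on x86-64, where the caller-save
set is larger.  A toy version of the filter (the bindings and the
liveness dict are invented for illustration):

    save_around_call_regs = ['eax', 'ecx', 'edx']   # the 32-bit set
    reg_bindings = {'v1': 'eax', 'v2': 'ebx', 'v3': 'edx'}
    stays_alive = {'v1': True, 'v2': True, 'v3': False}

    arglocs = [reg for v, reg in sorted(reg_bindings.items())
               if reg in save_around_call_regs and stays_alive[v]]
    # ebx is callee-save and v3 is dead, so only eax needs protecting
    assert arglocs == ['eax']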

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/runner.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/runner.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/runner.py	Tue Aug 10 20:07:15 2010
@@ -4,11 +4,13 @@
 from pypy.rlib.objectmodel import we_are_translated
 from pypy.jit.metainterp import history, compile
 from pypy.jit.backend.x86.assembler import Assembler386
-from pypy.jit.backend.x86.regalloc import FORCE_INDEX_OFS
+from pypy.jit.backend.x86.arch import FORCE_INDEX_OFS
 from pypy.jit.backend.x86.profagent import ProfileAgent
 from pypy.jit.backend.llsupport.llmodel import AbstractLLCPU
+from pypy.jit.backend.x86 import regloc
+import sys
 
-class CPU386(AbstractLLCPU):
+class AbstractX86CPU(AbstractLLCPU):
     debug = True
     supports_floats = True
 
@@ -132,10 +134,28 @@
         assert fail_index == fail_index_2
         return faildescr
 
+class CPU386(AbstractX86CPU):
+    WORD = 4
+    NUM_REGS = 8
+    CALLEE_SAVE_REGISTERS = [regloc.ebx, regloc.esi, regloc.edi]
+    FRAME_FIXED_SIZE = len(CALLEE_SAVE_REGISTERS) + 2
+
+    def __init__(self, *args, **kwargs):
+        assert sys.maxint == (2**31 - 1)
+        super(CPU386, self).__init__(*args, **kwargs)
 
 class CPU386_NO_SSE2(CPU386):
     supports_floats = False
 
+class CPU_X86_64(AbstractX86CPU):
+    WORD = 8
+    NUM_REGS = 16
+    CALLEE_SAVE_REGISTERS = [regloc.ebx, regloc.r12, regloc.r13, regloc.r14, regloc.r15]
+    FRAME_FIXED_SIZE = len(CALLEE_SAVE_REGISTERS) + 2
+
+    def __init__(self, *args, **kwargs):
+        assert sys.maxint == (2**63 - 1)
+        super(CPU_X86_64, self).__init__(*args, **kwargs)
 
 CPU = CPU386
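
The sys.maxint assertions in the two constructors pin down the expected
relation between word size and the largest signed integer; as a quick
check (plain Python, not taken from the diff):

    def maxint_for_word(word):
        return 2 ** (word * 8 - 1) - 1

    assert maxint_for_word(4) == 2**31 - 1   # CPU386
    assert maxint_for_word(8) == 2**63 - 1   # CPU_X86_64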
 

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/conftest.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/conftest.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/conftest.py	Tue Aug 10 20:07:15 2010
@@ -3,6 +3,5 @@
 
 cpu = detect_cpu.autodetect()
 def pytest_runtest_setup(item):
-    if cpu != 'x86':
-        py.test.skip("x86 directory skipped: cpu is %r" % (cpu,))
-    
+    if cpu not in ('x86', 'x86_64'):
+        py.test.skip("x86/x86_64 tests skipped: cpu is %r" % (cpu,))

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_assembler.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_assembler.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_assembler.py	Tue Aug 10 20:07:15 2010
@@ -1,14 +1,22 @@
-from pypy.jit.backend.x86.ri386 import *
+from pypy.jit.backend.x86.regloc import *
 from pypy.jit.backend.x86.assembler import Assembler386, MachineCodeBlockWrapper
 from pypy.jit.backend.x86.regalloc import X86FrameManager, get_ebp_ofs
-from pypy.jit.metainterp.history import BoxInt, BoxPtr, BoxFloat
+from pypy.jit.metainterp.history import BoxInt, BoxPtr, BoxFloat, INT, REF, FLOAT
 from pypy.rlib.rarithmetic import intmask
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.jit.backend.x86.arch import WORD, IS_X86_32, IS_X86_64
+from pypy.jit.backend.detect_cpu import getcpuclass 
+from pypy.jit.backend.x86.regalloc import X86RegisterManager, X86_64_RegisterManager, X86XMMRegisterManager, X86_64_XMMRegisterManager
 
+ACTUAL_CPU = getcpuclass()
 
 class FakeCPU:
     rtyper = None
     supports_floats = True
+    NUM_REGS = ACTUAL_CPU.NUM_REGS
+
+    def fielddescrof(self, STRUCT, name):
+        return 42
 
 class FakeMC:
     def __init__(self, base_address=0):
@@ -25,7 +33,14 @@
         self.content.append(("JMP", args))
     def done(self):
         pass
+    def PUSH_r(self, reg):
+        pass
+    def POP_r(self, reg):
+        pass
 
+class FakeAssembler:
+    def write_pending_failure_recoveries(self):
+        pass
 
 def test_write_failure_recovery_description():
     assembler = Assembler386(FakeCPU())
@@ -33,12 +48,12 @@
     failargs = [BoxInt(), BoxPtr(), BoxFloat()] * 3
     failargs.insert(6, None)
     failargs.insert(7, None)
-    locs = [X86FrameManager.frame_pos(0, 1),
-            X86FrameManager.frame_pos(1, 1),
-            X86FrameManager.frame_pos(10, 2),
-            X86FrameManager.frame_pos(100, 1),
-            X86FrameManager.frame_pos(101, 1),
-            X86FrameManager.frame_pos(110, 2),
+    locs = [X86FrameManager.frame_pos(0, INT),
+            X86FrameManager.frame_pos(1, REF),
+            X86FrameManager.frame_pos(10, FLOAT),
+            X86FrameManager.frame_pos(100, INT),
+            X86FrameManager.frame_pos(101, REF),
+            X86FrameManager.frame_pos(110, FLOAT),
             None,
             None,
             ebx,
@@ -46,17 +61,17 @@
             xmm2]
     assert len(failargs) == len(locs)
     assembler.write_failure_recovery_description(mc, failargs, locs)
-    nums = [Assembler386.DESCR_INT   + 4*(8+0),
-            Assembler386.DESCR_REF   + 4*(8+1),
-            Assembler386.DESCR_FLOAT + 4*(8+10),
-            Assembler386.DESCR_INT   + 4*(8+100),
-            Assembler386.DESCR_REF   + 4*(8+101),
-            Assembler386.DESCR_FLOAT + 4*(8+110),
+    nums = [Assembler386.DESCR_INT   + 4*(16+0),
+            Assembler386.DESCR_REF   + 4*(16+1),
+            Assembler386.DESCR_FLOAT + 4*(16+10),
+            Assembler386.DESCR_INT   + 4*(16+100),
+            Assembler386.DESCR_REF   + 4*(16+101),
+            Assembler386.DESCR_FLOAT + 4*(16+110),
             Assembler386.CODE_HOLE,
             Assembler386.CODE_HOLE,
-            Assembler386.DESCR_INT   + 4*ebx.op,
-            Assembler386.DESCR_REF   + 4*esi.op,
-            Assembler386.DESCR_FLOAT + 4*xmm2.op]
+            Assembler386.DESCR_INT   + 4*ebx.value,
+            Assembler386.DESCR_REF   + 4*esi.value,
+            Assembler386.DESCR_FLOAT + 4*xmm2.value]
     double_byte_nums = []
     for num in nums[3:6]:
         double_byte_nums.append((num & 0x7F) | 0x80)
@@ -94,6 +109,9 @@
         return lltype.cast_opaque_ptr(llmemory.GCREF, lltype.malloc(S))
 
     def get_random_float():
+        # Returns <float>, <low word>, <high word>
+        # NB: on 64-bit, <low word> will be the entire float and <high word>
+        # will be random garbage from malloc!
         assert withfloats
         value = random.random() - 0.5
         # make sure it fits into 64 bits
@@ -101,9 +119,16 @@
         rffi.cast(rffi.DOUBLEP, tmp)[0] = value
         return rffi.cast(rffi.DOUBLEP, tmp)[0], tmp[0], tmp[1]
 
+    if IS_X86_32:
+        main_registers = X86RegisterManager.all_regs
+        xmm_registers = X86XMMRegisterManager.all_regs
+    elif IS_X86_64:
+        main_registers = X86_64_RegisterManager.all_regs
+        xmm_registers = X86_64_XMMRegisterManager.all_regs
+
     # memory locations: 26 integers, 26 pointers, 26 floats
     # main registers: half of them as signed and the other half as ptrs
-    # xmm registers: all floats, from xmm0 to xmm7
+    # xmm registers: all floats, from xmm0 to xmm(7|15)
     # holes: 8
     locations = []
     baseloc = 4
@@ -117,18 +142,17 @@
     content = ([('int', locations.pop()) for _ in range(26)] +
                [('ptr', locations.pop()) for _ in range(26)] +
                [(['int', 'ptr'][random.randrange(0, 2)], reg)
-                         for reg in [eax, ecx, edx, ebx, esi, edi]])
+                         for reg in main_registers])
     if withfloats:
         content += ([('float', locations.pop()) for _ in range(26)] +
-                    [('float', reg) for reg in [xmm0, xmm1, xmm2, xmm3,
-                                                xmm4, xmm5, xmm6, xmm7]])
+                    [('float', reg) for reg in xmm_registers])
     for i in range(8):
         content.append(('hole', None))
     random.shuffle(content)
 
     # prepare the expected target arrays, the descr_bytecode,
     # the 'registers' and the 'stack' arrays according to 'content'
-    xmmregisters = lltype.malloc(rffi.LONGP.TO, 16+9, flavor='raw')
+    xmmregisters = lltype.malloc(rffi.LONGP.TO, 16+ACTUAL_CPU.NUM_REGS+1, flavor='raw')
     registers = rffi.ptradd(xmmregisters, 16)
     stacklen = baseloc + 10
     stack = lltype.malloc(rffi.LONGP.TO, stacklen, flavor='raw')
@@ -140,8 +164,8 @@
         assert loc >= 0
         ofs = get_ebp_ofs(loc)
         assert ofs < 0
-        assert (ofs % 4) == 0
-        stack[stacklen + ofs//4] = value
+        assert (ofs % WORD) == 0
+        stack[stacklen + ofs//WORD] = value
 
     descr_bytecode = []
     for i, (kind, loc) in enumerate(content):
@@ -152,12 +176,18 @@
                 value, lo, hi = get_random_float()
                 expected_floats[i] = value
                 kind = Assembler386.DESCR_FLOAT
-                if isinstance(loc, REG):
-                    xmmregisters[2*loc.op] = lo
-                    xmmregisters[2*loc.op+1] = hi
+                if isinstance(loc, RegLoc):
+                    if WORD == 4:
+                        xmmregisters[2*loc.value] = lo
+                        xmmregisters[2*loc.value+1] = hi
+                    elif WORD == 8:
+                        xmmregisters[loc.value] = lo
                 else:
-                    write_in_stack(loc, hi)
-                    write_in_stack(loc+1, lo)
+                    if WORD == 4:
+                        write_in_stack(loc, hi)
+                        write_in_stack(loc+1, lo)
+                    elif WORD == 8:
+                        write_in_stack(loc, lo)
             else:
                 if kind == 'int':
                     value = get_random_int()
@@ -170,15 +200,15 @@
                     value = rffi.cast(rffi.LONG, value)
                 else:
                     assert 0, kind
-                if isinstance(loc, REG):
-                    registers[loc.op] = value
+                if isinstance(loc, RegLoc):
+                    registers[loc.value] = value
                 else:
                     write_in_stack(loc, value)
 
-            if isinstance(loc, REG):
-                num = kind + 4*loc.op
+            if isinstance(loc, RegLoc):
+                num = kind + 4*loc.value
             else:
-                num = kind + 4*(8+loc)
+                num = kind + Assembler386.CODE_FROMSTACK + (4*loc)
             while num >= 0x80:
                 descr_bytecode.append((num & 0x7F) | 0x80)
                 num >>= 7
@@ -195,8 +225,8 @@
     for i in range(len(descr_bytecode)):
         assert 0 <= descr_bytecode[i] <= 255
         descr_bytes[i] = rffi.cast(rffi.UCHAR, descr_bytecode[i])
-    registers[8] = rffi.cast(rffi.LONG, descr_bytes)
-    registers[ebp.op] = rffi.cast(rffi.LONG, stack) + 4*stacklen
+    registers[ACTUAL_CPU.NUM_REGS] = rffi.cast(rffi.LONG, descr_bytes)
+    registers[ebp.value] = rffi.cast(rffi.LONG, stack) + WORD*stacklen
 
     # run!
     assembler = Assembler386(FakeCPU())
@@ -237,7 +267,8 @@
 
 def test_mc_wrapper_profile_agent():
     agent = FakeProfileAgent()
-    mc = FakeMCWrapper(100, agent)
+    assembler = FakeAssembler()
+    mc = FakeMCWrapper(assembler, 100, agent)
     mc.start_function("abc")
     mc.writechr("x")
     mc.writechr("x")
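
The descr_bytecode loop in test_assembler encodes each location number as
a little-endian base-128 varint: seven payload bits per byte, high bit set
on every byte except the last.  A standalone encoder/decoder showing the
round trip (decode is written here for illustration; only the encoding
loop appears in the test):

    def encode(num):
        out = []
        while num >= 0x80:
            out.append((num & 0x7F) | 0x80)
            num >>= 7
        out.append(num)
        return out

    def decode(data):
        num = 0
        shift = 0
        for byte in data:
            num |= (byte & 0x7F) << shift
            if not (byte & 0x80):
                break
            shift += 7
        return num

    for n in (0, 1, 0x7F, 0x80, 4 * (16 + 110)):
        assert decode(encode(n)) == n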

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_basic.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_basic.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_basic.py	Tue Aug 10 20:07:15 2010
@@ -1,5 +1,5 @@
 import py
-from pypy.jit.backend.x86.runner import CPU386
+from pypy.jit.backend.detect_cpu import getcpuclass
 from pypy.jit.metainterp.warmspot import ll_meta_interp
 from pypy.jit.metainterp.test import test_basic
 from pypy.jit.codewriter.policy import StopAtXPolicy
@@ -7,7 +7,7 @@
 
 class Jit386Mixin(test_basic.LLJitMixin):
     type_system = 'lltype'
-    CPUClass = CPU386
+    CPUClass = getcpuclass()
 
     def check_jumps(self, maxcount):
         pass

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_gc_integration.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_gc_integration.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_gc_integration.py	Tue Aug 10 20:07:15 2010
@@ -9,13 +9,13 @@
 from pypy.jit.codewriter import heaptracker
 from pypy.jit.backend.llsupport.descr import GcCache
 from pypy.jit.backend.llsupport.gc import GcLLDescription
-from pypy.jit.backend.x86.runner import CPU
-from pypy.jit.backend.x86.regalloc import RegAlloc, WORD, FRAME_FIXED_SIZE
+from pypy.jit.backend.detect_cpu import getcpuclass
+from pypy.jit.backend.x86.regalloc import RegAlloc
+from pypy.jit.backend.x86.arch import WORD, FRAME_FIXED_SIZE
 from pypy.jit.metainterp.test.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rpython.lltypesystem import rclass, rstr
-from pypy.jit.backend.x86.ri386 import *
 from pypy.jit.backend.llsupport.gc import GcLLDescr_framework, GcRefList, GcPtrFieldDescr
 
 from pypy.jit.backend.x86.test.test_regalloc import MockAssembler
@@ -23,6 +23,8 @@
 from pypy.jit.backend.x86.regalloc import X86RegisterManager, X86FrameManager,\
      X86XMMRegisterManager
 
+CPU = getcpuclass()
+
 class MockGcRootMap(object):
     def get_basic_shape(self):
         return ['shape']
@@ -84,7 +86,7 @@
         mark = regalloc.get_mark_gc_roots(cpu.gc_ll_descr.gcrootmap)
         assert mark[0] == 'compressed'
         base = -WORD * FRAME_FIXED_SIZE
-        expected = ['ebx', 'esi', 'edi', base, base-4, base-8]
+        expected = ['ebx', 'esi', 'edi', base, base-WORD, base-WORD*2]
         assert dict.fromkeys(mark[1:]) == dict.fromkeys(expected)
 
 class TestRegallocGcIntegration(BaseTestRegalloc):
@@ -175,7 +177,7 @@
         self.addrs[1] = self.addrs[0] + 64
         # 64 bytes
         def malloc_slowpath(size):
-            assert size == 8
+            assert size == WORD*2
             nadr = rffi.cast(lltype.Signed, self.nursery)
             self.addrs[0] = nadr + size
             return nadr
@@ -199,7 +201,7 @@
         return rffi.cast(lltype.Signed, self.addrs)
 
     def get_nursery_top_addr(self):
-        return rffi.cast(lltype.Signed, self.addrs) + 4
+        return rffi.cast(lltype.Signed, self.addrs) + WORD
 
     def get_malloc_fixedsize_slowpath_addr(self):
         fptr = llhelper(lltype.Ptr(self.MALLOC_SLOWPATH), self.malloc_slowpath)
@@ -213,7 +215,7 @@
 
     def setup_method(self, method):
         cpu = CPU(None, None)
-        cpu.vtable_offset = 4
+        cpu.vtable_offset = WORD
         cpu.gc_ll_descr = GCDescrFastpathMalloc()
 
         NODE = lltype.Struct('node', ('tid', lltype.Signed),
@@ -249,7 +251,7 @@
         assert gc_ll_descr.nursery[0] == self.nodedescr.tid
         assert gc_ll_descr.nursery[1] == 42
         nurs_adr = rffi.cast(lltype.Signed, gc_ll_descr.nursery)
-        assert gc_ll_descr.addrs[0] == nurs_adr + 8
+        assert gc_ll_descr.addrs[0] == nurs_adr + (WORD*2)
 
     def test_malloc_slowpath(self):
         ops = '''
@@ -269,7 +271,7 @@
         # this should call slow path once
         gc_ll_descr = self.cpu.gc_ll_descr
         nadr = rffi.cast(lltype.Signed, gc_ll_descr.nursery)
-        assert gc_ll_descr.addrs[0] == nadr + 8
+        assert gc_ll_descr.addrs[0] == nadr + (WORD*2)
 
     def test_new_with_vtable(self):
         ops = '''
@@ -284,4 +286,4 @@
         assert gc_ll_descr.nursery[0] == self.descrsize.tid
         assert gc_ll_descr.nursery[1] == self.vtable_int
         nurs_adr = rffi.cast(lltype.Signed, gc_ll_descr.nursery)
-        assert gc_ll_descr.addrs[0] == nurs_adr + 12
+        assert gc_ll_descr.addrs[0] == nurs_adr + (WORD*3)

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_jump.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_jump.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_jump.py	Tue Aug 10 20:07:15 2010
@@ -1,6 +1,7 @@
-from pypy.jit.backend.x86.ri386 import *
+from pypy.jit.backend.x86.regloc import *
 from pypy.jit.backend.x86.regalloc import X86FrameManager
 from pypy.jit.backend.x86.jump import remap_frame_layout
+from pypy.jit.metainterp.history import INT
 
 frame_pos = X86FrameManager.frame_pos
 
@@ -25,7 +26,7 @@
                 continue
             assert len(op1) == len(op2)
             for x, y in zip(op1, op2):
-                if isinstance(x, MODRM) and isinstance(y, MODRM):
+                if isinstance(x, StackLoc) and isinstance(y, StackLoc):
                     assert x.byte == y.byte
                     assert x.extradata == y.extradata
                 else:
@@ -41,9 +42,9 @@
     remap_frame_layout(assembler, [eax, ebx, ecx, edx, esi, edi],
                                   [eax, ebx, ecx, edx, esi, edi], '?')
     assert assembler.ops == []
-    s8 = frame_pos(1, 1)
-    s12 = frame_pos(31, 1)
-    s20 = frame_pos(6, 1)
+    s8 = frame_pos(1, INT)
+    s12 = frame_pos(31, INT)
+    s20 = frame_pos(6, INT)
     remap_frame_layout(assembler, [eax, ebx, ecx, s20, s8, edx, s12, esi, edi],
                                   [eax, ebx, ecx, s20, s8, edx, s12, esi, edi],
                                   '?')
@@ -58,10 +59,10 @@
 
 def test_simple_framelocs():
     assembler = MockAssembler()
-    s8 = frame_pos(0, 1)
-    s12 = frame_pos(13, 1)
-    s20 = frame_pos(20, 1)
-    s24 = frame_pos(221, 1)
+    s8 = frame_pos(0, INT)
+    s12 = frame_pos(13, INT)
+    s20 = frame_pos(20, INT)
+    s24 = frame_pos(221, INT)
     remap_frame_layout(assembler, [s8, eax, s12], [s20, s24, edi], edx)
     assert assembler.ops == [('mov', s8, edx),
                              ('mov', edx, s20),
@@ -70,10 +71,10 @@
 
 def test_reordering():
     assembler = MockAssembler()
-    s8 = frame_pos(8, 1)
-    s12 = frame_pos(12, 1)
-    s20 = frame_pos(19, 1)
-    s24 = frame_pos(1, 1)
+    s8 = frame_pos(8, INT)
+    s12 = frame_pos(12, INT)
+    s20 = frame_pos(19, INT)
+    s24 = frame_pos(1, INT)
     remap_frame_layout(assembler, [eax, s8, s20, ebx],
                                   [s8, ebx, eax, edi], '?')
     assert assembler.got([('mov', ebx, edi),
@@ -83,10 +84,10 @@
 
 def test_cycle():
     assembler = MockAssembler()
-    s8 = frame_pos(8, 1)
-    s12 = frame_pos(12, 1)
-    s20 = frame_pos(19, 1)
-    s24 = frame_pos(1, 1)
+    s8 = frame_pos(8, INT)
+    s12 = frame_pos(12, INT)
+    s20 = frame_pos(19, INT)
+    s24 = frame_pos(1, INT)
     remap_frame_layout(assembler, [eax, s8, s20, ebx],
                                   [s8, ebx, eax, s20], '?')
     assert assembler.got([('push', s8),
@@ -97,12 +98,12 @@
 
 def test_cycle_2():
     assembler = MockAssembler()
-    s8 = frame_pos(8, 1)
-    s12 = frame_pos(12, 1)
-    s20 = frame_pos(19, 1)
-    s24 = frame_pos(1, 1)
-    s2 = frame_pos(2, 1)
-    s3 = frame_pos(3, 1)
+    s8 = frame_pos(8, INT)
+    s12 = frame_pos(12, INT)
+    s20 = frame_pos(19, INT)
+    s24 = frame_pos(1, INT)
+    s2 = frame_pos(2, INT)
+    s3 = frame_pos(3, INT)
     remap_frame_layout(assembler,
                        [eax, s8, edi, s20, eax, s20, s24, esi, s2, s3],
                        [s8, s20, edi, eax, edx, s24, ebx, s12, s3, s2],
@@ -127,14 +128,14 @@
     remap_frame_layout(assembler, [c3], [eax], '?')
     assert assembler.ops == [('mov', c3, eax)]
     assembler = MockAssembler()
-    s12 = frame_pos(12, 1)
+    s12 = frame_pos(12, INT)
     remap_frame_layout(assembler, [c3], [s12], '?')
     assert assembler.ops == [('mov', c3, s12)]
 
 def test_constants_and_cycle():
     assembler = MockAssembler()
     c3 = imm(3)
-    s12 = frame_pos(13, 1)
+    s12 = frame_pos(13, INT)
     remap_frame_layout(assembler, [ebx, c3,  s12],
                                   [s12, eax, ebx], edi)
     assert assembler.ops == [('mov', c3, eax),

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_recompilation.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_recompilation.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_recompilation.py	Tue Aug 10 20:07:15 2010
@@ -1,6 +1,5 @@
-
-from pypy.jit.backend.x86.runner import CPU
 from pypy.jit.backend.x86.test.test_regalloc import BaseTestRegalloc
+from pypy.jit.backend.x86.arch import IS_X86_32, IS_X86_64
 
 class TestRecompilation(BaseTestRegalloc):
     def test_compile_bridge_not_deeper(self):
@@ -51,7 +50,9 @@
         descr = loop.operations[2].descr
         new = descr._x86_bridge_frame_depth
         assert descr._x86_bridge_param_depth == 0        
-        assert new > previous
+        # XXX: Maybe add enough ops to force stack on 64-bit as well?
+        if IS_X86_32:
+            assert new > previous
         self.cpu.set_future_value_int(0, 0)
         fail = self.run(loop)
         assert fail.identifier == 2
@@ -111,7 +112,9 @@
         guard_op = loop.operations[5]
         loop_frame_depth = loop.token._x86_frame_depth
         assert loop.token._x86_param_depth == 0
-        assert guard_op.descr._x86_bridge_frame_depth > loop_frame_depth
+        # XXX: Maybe add enough ops to force stack on 64-bit as well?
+        if IS_X86_32:
+            assert guard_op.descr._x86_bridge_frame_depth > loop_frame_depth
         assert guard_op.descr._x86_bridge_param_depth == 0
         self.cpu.set_future_value_int(0, 0)
         self.cpu.set_future_value_int(1, 0)

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regalloc.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regalloc.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regalloc.py	Tue Aug 10 20:07:15 2010
@@ -7,15 +7,17 @@
      BoxPtr, ConstPtr, LoopToken, BasicFailDescr
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.backend.llsupport.descr import GcCache
-from pypy.jit.backend.x86.runner import CPU
-from pypy.jit.backend.x86.regalloc import RegAlloc, WORD, X86RegisterManager,\
+from pypy.jit.backend.detect_cpu import getcpuclass
+from pypy.jit.backend.x86.regalloc import RegAlloc, X86RegisterManager,\
      FloatConstants
+from pypy.jit.backend.x86.arch import IS_X86_32, IS_X86_64
 from pypy.jit.metainterp.test.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rpython.lltypesystem import rclass, rstr
-from pypy.jit.backend.x86.ri386 import *
+from pypy.jit.backend.x86.rx86 import *
 
+CPU = getcpuclass()
 class MockGcDescr(GcCache):
     def get_funcptr_for_new(self):
         return 123
@@ -92,13 +94,20 @@
     def f2(x, y):
         return x*y
 
+    def f10(*args):
+        assert len(args) == 10
+        return sum(args)
+
     F1PTR = lltype.Ptr(lltype.FuncType([lltype.Signed], lltype.Signed))
     F2PTR = lltype.Ptr(lltype.FuncType([lltype.Signed]*2, lltype.Signed))
+    F10PTR = lltype.Ptr(lltype.FuncType([lltype.Signed]*10, lltype.Signed))
     f1ptr = llhelper(F1PTR, f1)
     f2ptr = llhelper(F2PTR, f2)
+    f10ptr = llhelper(F10PTR, f10)
 
     f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT)
     f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT)
+    f10_calldescr = cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT)
 
     namespace = locals().copy()
     type_system = 'lltype'
@@ -541,6 +550,12 @@
         assert self.getints(9) == [0, 1, 1, 1, 1, 1, 1, 1, 1]
 
 class TestRegAllocCallAndStackDepth(BaseTestRegalloc):
+    def expected_param_depth(self, num_args):
+        # Assumes the arguments are all non-float
+        if IS_X86_32:
+            return num_args
+        elif IS_X86_64:
+            return max(num_args - 6, 0)
 
     def test_one_call(self):
         ops = '''
@@ -550,7 +565,7 @@
         '''
         loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9, 9])
         assert self.getints(11) == [5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9]
-        assert loop.token._x86_param_depth == 1
+        assert loop.token._x86_param_depth == self.expected_param_depth(1)
 
     def test_two_calls(self):
         ops = '''
@@ -561,8 +576,21 @@
         '''
         loop = self.interpret(ops, [4, 7, 9, 9 ,9, 9, 9, 9, 9, 9, 9])
         assert self.getints(11) == [5*7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9]
-        assert loop.token._x86_param_depth == 2
-        
+        assert loop.token._x86_param_depth == self.expected_param_depth(2)
+
+    def test_call_many_arguments(self):
+        # NB: The first and last arguments in the call are constants. This
+        # is primarily for x86-64, to ensure that loading a constant to an
+        # argument register or to the stack works correctly
+        ops = '''
+        [i0, i1, i2, i3, i4, i5, i6, i7]
+        i8 = call(ConstClass(f10ptr), 1, i0, i1, i2, i3, i4, i5, i6, i7, 10, descr=f10_calldescr)
+        finish(i8)
+        '''
+        loop = self.interpret(ops, [2, 3, 4, 5, 6, 7, 8, 9])
+        assert self.getint(0) == 55
+        assert loop.token._x86_param_depth == self.expected_param_depth(10)
+
     def test_bridge_calls_1(self):
         ops = '''
         [i0, i1]
@@ -579,7 +607,7 @@
         '''
         bridge = self.attach_bridge(ops, loop, -2)
 
-        assert loop.operations[-2].descr._x86_bridge_param_depth == 2
+        assert loop.operations[-2].descr._x86_bridge_param_depth == self.expected_param_depth(2)
 
         self.cpu.set_future_value_int(0, 4)
         self.cpu.set_future_value_int(1, 7)        
@@ -602,7 +630,7 @@
         '''
         bridge = self.attach_bridge(ops, loop, -2)
 
-        assert loop.operations[-2].descr._x86_bridge_param_depth == 2        
+        assert loop.operations[-2].descr._x86_bridge_param_depth == self.expected_param_depth(2)
 
         self.cpu.set_future_value_int(0, 4)
         self.cpu.set_future_value_int(1, 7)        
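
expected_param_depth above follows from the SysV AMD64 calling
convention: the first six integer arguments travel in registers (rdi,
rsi, rdx, rcx, r8, r9) and only the rest need stack slots, whereas on
32-bit x86 every argument is a stack slot.  A small restatement (the
register names are the standard ABI order, not taken from this diff):

    ARG_REGS_64 = ['rdi', 'rsi', 'rdx', 'rcx', 'r8', 'r9']

    def stack_slots(num_args, is_64bit):
        if not is_64bit:
            return num_args
        return max(num_args - len(ARG_REGS_64), 0)

    assert stack_slots(10, False) == 10   # test_call_many_arguments, 32-bit
    assert stack_slots(10, True) == 4     # only four of ten spill on 64-bit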

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regalloc2.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regalloc2.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_regalloc2.py	Tue Aug 10 20:07:15 2010
@@ -2,7 +2,9 @@
 from pypy.jit.metainterp.history import ResOperation, BoxInt, ConstInt,\
      BoxPtr, ConstPtr, BasicFailDescr, LoopToken
 from pypy.jit.metainterp.resoperation import rop
-from pypy.jit.backend.x86.runner import CPU
+from pypy.jit.backend.detect_cpu import getcpuclass
+from pypy.jit.backend.x86.arch import WORD
+CPU = getcpuclass()
 
 def test_bug_rshift():
     v1 = BoxInt()
@@ -281,5 +283,8 @@
     assert cpu.get_latest_value_int(16) == -57344
     assert cpu.get_latest_value_int(17) == 1
     assert cpu.get_latest_value_int(18) == -1
-    assert cpu.get_latest_value_int(19) == -2147483648
+    if WORD == 4:
+        assert cpu.get_latest_value_int(19) == -2147483648
+    elif WORD == 8:
+        assert cpu.get_latest_value_int(19) == 19327352832
     assert cpu.get_latest_value_int(20) == -49
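
The WORD-dependent assertion above is consistent with a result of
9 << 31 truncated to the machine word: the low 32 bits of 19327352832
are 0x80000000, which reads back as -2147483648 in a signed 32-bit
word.  A quick check (this intmask is a local sketch, not
pypy.rlib.rarithmetic.intmask, and the operands are inferred from the
two constants rather than from the loop, which the diff does not show):

    def intmask(n, word):
        bits = word * 8
        n &= (1 << bits) - 1       # truncate to the machine word
        if n >= 1 << (bits - 1):   # reinterpret as signed
            n -= 1 << bits
        return n

    assert intmask(9 << 31, 4) == -2147483648   # WORD == 4
    assert intmask(9 << 31, 8) == 19327352832   # WORD == 8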

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_runner.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_runner.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_runner.py	Tue Aug 10 20:07:15 2010
@@ -1,10 +1,11 @@
 import py
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi, rstr, rclass
+from pypy.rpython.annlowlevel import llhelper
 from pypy.jit.metainterp.history import ResOperation, LoopToken
-from pypy.jit.metainterp.history import (BoxInt, BoxPtr, ConstInt, ConstPtr,
-                                         Box, BasicFailDescr)
-from pypy.jit.backend.x86.runner import CPU
-from pypy.jit.backend.x86.regalloc import WORD
+from pypy.jit.metainterp.history import (BoxInt, BoxPtr, ConstInt, ConstFloat,
+                                         ConstPtr, Box, BoxFloat, BasicFailDescr)
+from pypy.jit.backend.detect_cpu import getcpuclass
+from pypy.jit.backend.x86.arch import WORD
 from pypy.jit.backend.llsupport import symbolic
 from pypy.jit.metainterp.resoperation import rop
 from pypy.jit.metainterp.executor import execute
@@ -15,6 +16,8 @@
 import sys
 import os
 
+CPU = getcpuclass()
+
 class FakeStats(object):
     pass
 
@@ -59,7 +62,7 @@
         assert u.chars[3] == u'd'
 
     @staticmethod
-    def _resbuf(res, item_tp=ctypes.c_int):
+    def _resbuf(res, item_tp=ctypes.c_long):
         return ctypes.cast(res.value._obj.intval, ctypes.POINTER(item_tp))
 
     def test_allocations(self):
@@ -74,8 +77,11 @@
             return ctypes.cast(buf, ctypes.c_void_p).value
         func = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int)(f)
         addr = ctypes.cast(func, ctypes.c_void_p).value
+        # ctypes produces an unsigned value. We need it to be signed for,
+        # e.g., relative addressing to work properly.
+        addr = rffi.cast(lltype.Signed, addr)
         
-        self.cpu.assembler.make_sure_mc_exists()
+        self.cpu.assembler.setup()
         self.cpu.assembler.malloc_func_addr = addr
         ofs = symbolic.get_field_token(rstr.STR, 'chars', False)[0]
 
@@ -360,7 +366,9 @@
         self.cpu.compile_bridge(faildescr1, [i1b], bridge)        
         name, address, size = agent.functions[1]
         assert name == "Bridge # 0: bye"
-        assert address == loopaddress + loopsize
+        # Would be exactly ==, but there are some guard failure recovery
+        # stubs in-between
+        assert address >= loopaddress + loopsize
         assert size >= 10 # randomish number
 
         self.cpu.set_future_value_int(0, 2)
@@ -369,6 +377,19 @@
         res = self.cpu.get_latest_value_int(0)
         assert res == 20
 
+    def test_call_with_const_floats(self):
+        def func(f1, f2):
+            return f1 + f2
+
+        FUNC = self.FuncType([lltype.Float, lltype.Float], lltype.Float)
+        FPTR = self.Ptr(FUNC)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        func_ptr = llhelper(FPTR, func)
+        funcbox = self.get_funcbox(self.cpu, func_ptr)
+        res = self.execute_operation(rop.CALL, [funcbox, ConstFloat(1.5), ConstFloat(2.5)], 'float', descr=calldescr)
+        assert res.value == 4.0
+
+
 class TestX86OverflowMC(TestX86):
 
     def setup_method(self, meth):
@@ -386,7 +407,7 @@
         ops.append(ResOperation(rop.FINISH, [v], None,
                                 descr=BasicFailDescr()))
         looptoken = LoopToken()
-        self.cpu.assembler.make_sure_mc_exists()
+        self.cpu.assembler.setup()
         old_mc_mc = self.cpu.assembler.mc._mc
         self.cpu.compile_loop([base_v], ops, looptoken)
         assert self.cpu.assembler.mc._mc != old_mc_mc   # overflowed
@@ -394,6 +415,63 @@
         self.cpu.execute_token(looptoken)
         assert self.cpu.get_latest_value_int(0) == 1024
 
+    def test_overflow_guard_float_cmp(self):
+        # The float comparisons on x86 tend to use small relative jumps,
+        # which may run into trouble if they fall on the edge of a
+        # MachineCodeBlock change.
+        a = BoxFloat(1.0)
+        b = BoxFloat(2.0)
+        failed = BoxInt(41)
+        finished = BoxInt(42)
+
+        # We select guards that will always succeed, so that execution will
+        # continue through the entire set of comparisions
+        ops_to_test = (
+            (rop.FLOAT_LT, [a, b], rop.GUARD_TRUE),
+            (rop.FLOAT_LT, [b, a], rop.GUARD_FALSE),
+
+            (rop.FLOAT_LE, [a, a], rop.GUARD_TRUE),
+            (rop.FLOAT_LE, [a, b], rop.GUARD_TRUE),
+            (rop.FLOAT_LE, [b, a], rop.GUARD_FALSE),
+
+            (rop.FLOAT_EQ, [a, a], rop.GUARD_TRUE),
+            (rop.FLOAT_EQ, [a, b], rop.GUARD_FALSE),
+
+            (rop.FLOAT_NE, [a, b], rop.GUARD_TRUE),
+            (rop.FLOAT_NE, [a, a], rop.GUARD_FALSE),
+
+            (rop.FLOAT_GT, [b, a], rop.GUARD_TRUE),
+            (rop.FLOAT_GT, [a, b], rop.GUARD_FALSE),
+
+            (rop.FLOAT_GE, [a, a], rop.GUARD_TRUE),
+            (rop.FLOAT_GE, [b, a], rop.GUARD_TRUE),
+            (rop.FLOAT_GE, [a, b], rop.GUARD_FALSE),
+        )
+
+        for float_op, args, guard_op in ops_to_test:
+            ops = []
+
+            for i in range(200):
+                cmp_result = BoxInt()
+                ops.append(ResOperation(float_op, args, cmp_result))
+                ops.append(ResOperation(guard_op, [cmp_result], None, descr=BasicFailDescr()))
+                ops[-1].fail_args = [failed]
+
+            ops.append(ResOperation(rop.FINISH, [finished], None, descr=BasicFailDescr()))
+
+            looptoken = LoopToken()
+            self.cpu.compile_loop([a, b, failed, finished], ops, looptoken)
+            self.cpu.set_future_value_float(0, a.value)
+            self.cpu.set_future_value_float(1, b.value)
+            self.cpu.set_future_value_int(2, failed.value)
+            self.cpu.set_future_value_int(3, finished.value)
+            self.cpu.execute_token(looptoken)
+
+            # Really just a sanity check. We're actually interested in
+            # whether the test segfaults.
+            assert self.cpu.get_latest_value_int(0) == finished.value
+
+
 class TestDebuggingAssembler(object):
     def setup_method(self, meth):
         self.pypylog = os.environ.get('PYPYLOG', None)
@@ -420,8 +498,9 @@
         self.cpu.set_future_value_int(0, 0)
         self.cpu.execute_token(ops.token)
         # check debugging info
-        assert self.cpu.assembler.loop_names == ["xyz"]
-        assert self.cpu.assembler.loop_run_counter.getitem(0) == 10
+        name, struct = self.cpu.assembler.loop_run_counters[0]
+        assert name == 'xyz'
+        assert struct.i == 10
         self.cpu.finish_once()
         lines = py.path.local(self.logfile + ".count").readlines()
         assert lines[0] == 'xyz:10\n'

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_symbolic_x86.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_symbolic_x86.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_symbolic_x86.py	Tue Aug 10 20:07:15 2010
@@ -1,6 +1,7 @@
 import py
 from pypy.jit.backend.llsupport.symbolic import *
 from pypy.rpython.lltypesystem import lltype, rffi
+from pypy.jit.backend.x86.arch import WORD
 
 # This test file is here and not in llsupport/test/ because it checks
 # that we get correct numbers for a 32-bit machine.
@@ -19,32 +20,32 @@
     ofs_z, size_z = get_field_token(S, 'z', False)
     # ofs_x might be 0 or not, depending on how we count the headers
     # but the rest should be as expected for a 386 machine
-    assert size_x == size_y == size_z == 4
+    assert size_x == size_y == size_z == WORD
     assert ofs_x >= 0
-    assert ofs_y == ofs_x + 4
-    assert ofs_z == ofs_x + 8
+    assert ofs_y == ofs_x + WORD
+    assert ofs_z == ofs_x + (WORD*2)
 
 def test_struct_size():
     ofs_z, size_z = get_field_token(S, 'z', False)
     totalsize = get_size(S, False)
-    assert totalsize == ofs_z + 4
+    assert totalsize == ofs_z + WORD
 
 def test_primitive_size():
-    assert get_size(lltype.Signed, False) == 4
+    assert get_size(lltype.Signed, False) == WORD
     assert get_size(lltype.Char, False) == 1
-    assert get_size(lltype.Ptr(S), False) == 4
+    assert get_size(lltype.Ptr(S), False) == WORD
 
 def test_array_token():
     A = lltype.GcArray(lltype.Char)
     basesize, itemsize, ofs_length = get_array_token(A, False)
-    assert basesize >= 4    # at least the 'length', maybe some gc headers
+    assert basesize >= WORD    # at least the 'length', maybe some gc headers
     assert itemsize == 1
-    assert ofs_length == basesize - 4
+    assert ofs_length == basesize - WORD
     A = lltype.GcArray(lltype.Signed)
     basesize, itemsize, ofs_length = get_array_token(A, False)
-    assert basesize >= 4    # at least the 'length', maybe some gc headers
-    assert itemsize == 4
-    assert ofs_length == basesize - 4
+    assert basesize >= WORD    # at least the 'length', maybe some gc headers
+    assert itemsize == WORD
+    assert ofs_length == basesize - WORD
 
 def test_varsized_struct_size():
     S1 = lltype.GcStruct('S1', ('parent', S),
@@ -54,9 +55,9 @@
     ofs_extra, size_extra = get_field_token(S1, 'extra', False)
     basesize, itemsize, ofs_length = get_array_token(S1, False)
     assert size_parent == ofs_extra
-    assert size_extra == 4
-    assert ofs_length == ofs_extra + 4
-    assert basesize == ofs_length + 4
+    assert size_extra == WORD
+    assert ofs_length == ofs_extra + WORD
+    assert basesize == ofs_length + WORD
     assert itemsize == 1
 
 def test_string():

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_zll_random.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_zll_random.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_zll_random.py	Tue Aug 10 20:07:15 2010
@@ -1,9 +1,11 @@
 from pypy.jit.backend.test.test_random import check_random_function, Random
 from pypy.jit.backend.test.test_ll_random import LLtypeOperationBuilder
-from pypy.jit.backend.x86.runner import CPU386
+from pypy.jit.backend.detect_cpu import getcpuclass
+
+CPU = getcpuclass()
 
 def test_stress():
-    cpu = CPU386(None, None)
+    cpu = CPU(None, None)
     r = Random()
     for i in range(1000):
         check_random_function(cpu, LLtypeOperationBuilder, r, i, 1000)

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_zrpy_gc.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_zrpy_gc.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_zrpy_gc.py	Tue Aug 10 20:07:15 2010
@@ -17,6 +17,8 @@
 from pypy.jit.backend.llsupport.gc import GcRefList, GcRootMap_asmgcc
 from pypy.jit.backend.llsupport.gc import GcLLDescr_framework
 from pypy.tool.udir import udir
+from pypy.jit.backend.x86.arch import IS_X86_64
+import py.test
 
 class X(object):
     def __init__(self, x=0):
@@ -126,6 +128,10 @@
 
 class TestCompileHybrid(object):
     def setup_class(cls):
+        if IS_X86_64:
+            # No hybrid GC on 64-bit for the time being
+            py.test.skip()
+
         funcs = []
         name_to_func = {}
         for fullname in dir(cls):

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_ztranslation.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_ztranslation.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/test/test_ztranslation.py	Tue Aug 10 20:07:15 2010
@@ -3,13 +3,14 @@
 from pypy.rlib.jit import JitDriver, OPTIMIZER_FULL, unroll_parameters
 from pypy.rlib.jit import PARAMETERS, dont_look_inside
 from pypy.jit.metainterp.jitprof import Profiler
-from pypy.jit.backend.x86.runner import CPU386
+from pypy.jit.backend.detect_cpu import getcpuclass
 from pypy.jit.backend.test.support import CCompiledMixin
 from pypy.jit.codewriter.policy import StopAtXPolicy
 from pypy.translator.translator import TranslationContext
+from pypy.jit.backend.x86.arch import IS_X86_32, IS_X86_64
 
 class TestTranslationX86(CCompiledMixin):
-    CPUClass = CPU386
+    CPUClass = getcpuclass()
 
     def _check_cbuilder(self, cbuilder):
         # We assume here that we have sse2.  If not, the CPUClass
@@ -114,7 +115,7 @@
 
 
 class TestTranslationRemoveTypePtrX86(CCompiledMixin):
-    CPUClass = CPU386
+    CPUClass = getcpuclass()
 
     def _get_TranslationContext(self):
         t = TranslationContext()
@@ -125,6 +126,10 @@
         return t
 
     def test_external_exception_handling_translates(self):
+        # FIXME
+        if IS_X86_64:
+            import py.test; py.test.skip()
+
         jitdriver = JitDriver(greens = [], reds = ['n', 'total'])
 
         class ImDone(Exception):

Modified: pypy/branch/fast-ctypes/pypy/jit/backend/x86/tool/viewcode.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/backend/x86/tool/viewcode.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/backend/x86/tool/viewcode.py	Tue Aug 10 20:07:15 2010
@@ -31,16 +31,23 @@
 if sys.platform == "win32":
     XXX   # lots more in Psyco
 
-def machine_code_dump(data, originaddr):
-    # the disassembler to use. 'objdump' writes GNU-style instructions.
-    # 'ndisasm' would use Intel syntax, but you need to fix the output parsing.
-    objdump = ('objdump -M intel -b binary -m i386 '
+def machine_code_dump(data, originaddr, backend_name):
+    objdump_backend_option = {
+        'x86': 'i386',
+        'x86_64': 'x86-64',
+        'i386': 'i386',
+    }
+    objdump = ('objdump -M intel,%(backend)s -b binary -m i386 '
                '--adjust-vma=%(origin)d -D %(file)s')
     #
     f = open(tmpfile, 'wb')
     f.write(data)
     f.close()
-    g = os.popen(objdump % {'file': tmpfile, 'origin': originaddr}, 'r')
+    g = os.popen(objdump % {
+        'file': tmpfile,
+        'origin': originaddr,
+        'backend': objdump_backend_option[backend_name],
+    }, 'r')
     result = g.readlines()
     g.close()
     return result[6:]   # drop some objdump cruft
@@ -126,7 +133,7 @@
 
     def disassemble(self):
         if not hasattr(self, 'text'):
-            lines = machine_code_dump(self.data, self.addr)
+            lines = machine_code_dump(self.data, self.addr, self.world.backend_name)
             # instead of adding symbol names in the dumps we could
             # also make the 0xNNNNNNNN addresses be red and show the
             # symbol name when the mouse is over them
@@ -171,10 +178,13 @@
         self.jumps = {}
         self.symbols = {}
         self.logentries = {}
+        self.backend_name = None
 
     def parse(self, f, textonly=True):
         for line in f:
-            if line.startswith('CODE_DUMP '):
+            if line.startswith('BACKEND '):
+                self.backend_name = line.split(' ')[1].strip()
+            elif line.startswith('CODE_DUMP '):
                 pieces = line.split()
                 assert pieces[1].startswith('@')
                 assert pieces[2].startswith('+')
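
For reference, the viewcode change above works in two halves: the parser picks the backend name out of a "BACKEND <name>" line in the JIT log, and machine_code_dump() maps that name onto objdump's -M option. A minimal sketch of the two halves together, assuming a log in that format (the CODE_DUMP line is only illustrative, and backend_from_log is a simplified stand-in for World.parse()):

    objdump_backend_option = {'x86': 'i386', 'x86_64': 'x86-64', 'i386': 'i386'}

    def backend_from_log(lines):
        for line in lines:
            if line.startswith('BACKEND '):
                return line.split(' ')[1].strip()
        return None

    log = ['BACKEND x86_64\n', 'CODE_DUMP @7f00 +0 90\n']
    backend = backend_from_log(log)
    print 'objdump -M intel,%s -b binary -m i386 ...' % objdump_backend_option[backend]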

Modified: pypy/branch/fast-ctypes/pypy/jit/metainterp/optimizeopt.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/metainterp/optimizeopt.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/metainterp/optimizeopt.py	Tue Aug 10 20:07:15 2010
@@ -1,7 +1,7 @@
 from pypy.jit.metainterp.history import Box, BoxInt, LoopToken, BoxFloat,\
      ConstFloat
 from pypy.jit.metainterp.history import Const, ConstInt, ConstPtr, ConstObj, REF
-from pypy.jit.metainterp.resoperation import rop, ResOperation
+from pypy.jit.metainterp.resoperation import rop, ResOperation, opboolinvers, opboolreflex
 from pypy.jit.metainterp import jitprof
 from pypy.jit.metainterp.executor import execute_nonspec
 from pypy.jit.metainterp.specnode import SpecNode, NotSpecNode, ConstantSpecNode
@@ -612,12 +612,59 @@
                 assert oldop.opnum == op.opnum
                 self.make_equal_to(op.result, self.getvalue(oldop.result))
                 return
+            elif self.find_rewriteable_bool(op, args):
+                return
             else:
                 self.pure_operations[args] = op
 
         # otherwise, the operation remains
         self.emit_operation(op)
 
+
+    def try_boolinvers(self, op, targs):
+        oldop = self.pure_operations.get(targs, None)
+        if oldop is not None and oldop.descr is op.descr:
+            value = self.getvalue(oldop.result)
+            if value.is_constant():
+                if value.box is CONST_1:
+                    self.make_constant(op.result, CONST_0)
+                    return True
+                elif value.box is CONST_0:
+                    self.make_constant(op.result, CONST_1)
+                    return True
+        return False
+
+    
+    def find_rewriteable_bool(self, op, args):
+        try:
+            oldopnum = opboolinvers[op.opnum]
+            targs = [args[0], args[1], ConstInt(oldopnum)]
+            if self.try_boolinvers(op, targs):
+                return True
+        except KeyError:
+            pass
+
+        try:
+            oldopnum = opboolreflex[op.opnum]
+            targs = [args[1], args[0], ConstInt(oldopnum)]
+            oldop = self.pure_operations.get(targs, None)
+            if oldop is not None and oldop.descr is op.descr:
+                self.make_equal_to(op.result, self.getvalue(oldop.result))
+                return True
+        except KeyError:
+            pass
+
+        try:
+            oldopnum = opboolinvers[opboolreflex[op.opnum]]
+            targs = [args[1], args[0], ConstInt(oldopnum)]
+            if self.try_boolinvers(op, targs):
+                return True
+        except KeyError:
+            pass
+
+        return False
+
+        
     def optimize_JUMP(self, op):
         orgop = self.loop.operations[-1]
         exitargs = []
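
The rule find_rewriteable_bool() implements: a pure comparison need not be emitted if pure_operations already holds its inverse (int_ge after int_lt), its reflexed form (int_gt(b, a) after int_lt(a, b)), or the inverse of the reflexed form; the known result is reused, negated where needed. A toy sketch of the same rule outside the metainterp (plain strings stand in for the rop opnums; this is not the real API):

    known = {('lt', 'a', 'b'): True}    # a previously computed pure op

    invers = {'lt': 'ge', 'ge': 'lt', 'gt': 'le', 'le': 'gt',
              'eq': 'ne', 'ne': 'eq'}
    reflex = {'lt': 'gt', 'ge': 'le', 'gt': 'lt', 'le': 'ge',
              'eq': 'eq', 'ne': 'ne'}

    def lookup(op, x, y):
        if (op, x, y) in known:                   # same operation again
            return known[op, x, y]
        if (invers[op], x, y) in known:           # inverse already known
            return not known[invers[op], x, y]
        if (reflex[op], y, x) in known:           # reflexed form known
            return known[reflex[op], y, x]
        if (invers[reflex[op]], y, x) in known:   # inverse of reflexed form
            return not known[invers[reflex[op]], y, x]
        return None                               # really must be emitted

    print lookup('ge', 'a', 'b')    # False, derived from the known 'lt'
    print lookup('gt', 'b', 'a')    # True, derived by reflexivity
    print lookup('le', 'b', 'a')    # False, inverse of the reflexed form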

Modified: pypy/branch/fast-ctypes/pypy/jit/metainterp/resoperation.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/metainterp/resoperation.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/metainterp/resoperation.py	Tue Aug 10 20:07:15 2010
@@ -274,3 +274,51 @@
 
 setup(__name__ == '__main__')   # print out the table when run directly
 del _oplist
+
+opboolinvers = {
+    rop.INT_EQ: rop.INT_NE,
+    rop.INT_NE: rop.INT_EQ,
+    rop.INT_LT: rop.INT_GE,
+    rop.INT_GE: rop.INT_LT,
+    rop.INT_GT: rop.INT_LE,
+    rop.INT_LE: rop.INT_GT,
+
+    rop.UINT_LT: rop.UINT_GE,
+    rop.UINT_GE: rop.UINT_LT,
+    rop.UINT_GT: rop.UINT_LE,
+    rop.UINT_LE: rop.UINT_GT,
+
+    rop.FLOAT_EQ: rop.FLOAT_NE,
+    rop.FLOAT_NE: rop.FLOAT_EQ,
+    rop.FLOAT_LT: rop.FLOAT_GE,
+    rop.FLOAT_GE: rop.FLOAT_LT,
+    rop.FLOAT_GT: rop.FLOAT_LE,
+    rop.FLOAT_LE: rop.FLOAT_GT,
+
+    rop.PTR_EQ: rop.PTR_NE,
+    rop.PTR_NE: rop.PTR_EQ,
+    }
+
+opboolreflex = {
+    rop.INT_EQ: rop.INT_EQ,
+    rop.INT_NE: rop.INT_NE,
+    rop.INT_LT: rop.INT_GT,
+    rop.INT_GE: rop.INT_LE,
+    rop.INT_GT: rop.INT_LT,
+    rop.INT_LE: rop.INT_GE,
+
+    rop.UINT_LT: rop.UINT_GT,
+    rop.UINT_GE: rop.UINT_LE,
+    rop.UINT_GT: rop.UINT_LT,
+    rop.UINT_LE: rop.UINT_GE,
+
+    rop.FLOAT_EQ: rop.FLOAT_EQ,
+    rop.FLOAT_NE: rop.FLOAT_NE,
+    rop.FLOAT_LT: rop.FLOAT_GT,
+    rop.FLOAT_GE: rop.FLOAT_LE,
+    rop.FLOAT_GT: rop.FLOAT_LT,
+    rop.FLOAT_LE: rop.FLOAT_GE,
+
+    rop.PTR_EQ: rop.PTR_EQ,
+    rop.PTR_NE: rop.PTR_NE,
+    }
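
The two tables encode invariants that hold for every entry: op(a, b) == (not opboolinvers[op](a, b)) and op(a, b) == opboolreflex[op](b, a). The new tests in test_executor.py verify exactly this through the executor; a quick hedged check of the same invariants, with plain Python comparisons standing in for the opnums:

    import operator

    invers = {operator.lt: operator.ge, operator.le: operator.gt,
              operator.eq: operator.ne}
    reflex = {operator.lt: operator.gt, operator.le: operator.ge,
              operator.eq: operator.eq}

    for op in invers:
        for a in (1, 2, 3):
            for b in (1, 2, 3):
                assert op(a, b) == (not invers[op](a, b))   # inversion
                assert op(a, b) == reflex[op](b, a)         # reflexivity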

Modified: pypy/branch/fast-ctypes/pypy/jit/metainterp/test/test_executor.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/metainterp/test/test_executor.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/metainterp/test/test_executor.py	Tue Aug 10 20:07:15 2010
@@ -4,14 +4,14 @@
 from pypy.rpython.lltypesystem import lltype, llmemory
 from pypy.jit.metainterp.executor import execute
 from pypy.jit.metainterp.executor import execute_varargs, execute_nonspec
-from pypy.jit.metainterp.resoperation import rop
+from pypy.jit.metainterp.resoperation import rop, opboolinvers, opboolreflex, opname
 from pypy.jit.metainterp.history import BoxInt, ConstInt
 from pypy.jit.metainterp.history import BoxPtr, ConstPtr
 from pypy.jit.metainterp.history import BoxFloat, ConstFloat
 from pypy.jit.metainterp.history import AbstractDescr, Box
 from pypy.jit.metainterp import history
 from pypy.jit.backend.model import AbstractCPU
-
+from pypy.rpython.lltypesystem import rffi
 
 class FakeDescr(AbstractDescr):
     pass
@@ -312,3 +312,40 @@
             assert box.getint() == retvalue
         else:
             assert 0, "rettype is %r" % (rettype,)
+
+def make_args_for_op(op, a, b):
+    n=opname[op]
+    if n[0:3] == 'INT' or n[0:4] == 'UINT':
+        arg1 = ConstInt(a)
+        arg2 = ConstInt(b)
+    elif n[0:5] == 'FLOAT':
+        arg1 = ConstFloat(float(a))
+        arg2 = ConstFloat(float(b))
+    elif n[0:3] == 'PTR':
+        arg1 = ConstPtr(rffi.cast(llmemory.GCREF, a))
+        arg2 = ConstPtr(rffi.cast(llmemory.GCREF, b))
+    else:
+        raise NotImplementedError(
+            "Don't know how to make args for " + n)
+    return arg1, arg2
+
+
+def test_opboolinvers():
+    cpu = FakeCPU()
+    for op1, op2 in opboolinvers.items():
+        for a in (1,2,3):
+            for b in (1,2,3):
+                arg1, arg2 = make_args_for_op(op1, a, b)
+                box1 = execute(cpu, None, op1, None, arg1, arg2)
+                box2 = execute(cpu, None, op2, None, arg1, arg2)
+                assert box1.value == (not box2.value)
+
+def test_opboolreflex():
+    cpu = FakeCPU()
+    for op1, op2 in opboolreflex.items():
+        for a in (1,2,3):
+            for b in (1,2,3):
+                arg1, arg2 = make_args_for_op(op1, a, b)
+                box1 = execute(cpu, None, op1, None, arg1, arg2)
+                box2 = execute(cpu, None, op2, None, arg2, arg1)
+                assert box1.value == box2.value

Modified: pypy/branch/fast-ctypes/pypy/jit/metainterp/test/test_optimizeopt.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/metainterp/test/test_optimizeopt.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/metainterp/test/test_optimizeopt.py	Tue Aug 10 20:07:15 2010
@@ -364,6 +364,74 @@
         """
         self.optimize_loop(ops, 'Not', expected)
 
+    def test_constant_boolrewrite_lt(self):
+        ops = """
+        [i0]
+        i1 = int_lt(i0, 0)
+        guard_true(i1) []
+        i2 = int_ge(i0, 0)
+        guard_false(i2) []
+        jump(i0)
+        """
+        expected = """
+        [i0]
+        i1 = int_lt(i0, 0)
+        guard_true(i1) []
+        jump(i0)
+        """
+        self.optimize_loop(ops, 'Not', expected)
+
+    def test_constant_boolrewrite_gt(self):
+        ops = """
+        [i0]
+        i1 = int_gt(i0, 0)
+        guard_true(i1) []
+        i2 = int_le(i0, 0)
+        guard_false(i2) []
+        jump(i0)
+        """
+        expected = """
+        [i0]
+        i1 = int_gt(i0, 0)
+        guard_true(i1) []
+        jump(i0)
+        """
+        self.optimize_loop(ops, 'Not', expected)
+
+    def test_constant_boolrewrite_reflex(self):
+        ops = """
+        [i0]
+        i1 = int_gt(i0, 0)
+        guard_true(i1) []
+        i2 = int_lt(0, i0)
+        guard_true(i2) []
+        jump(i0)
+        """
+        expected = """
+        [i0]
+        i1 = int_gt(i0, 0)
+        guard_true(i1) []
+        jump(i0)
+        """
+        self.optimize_loop(ops, 'Not', expected)
+
+    def test_constant_boolrewrite_reflex_invers(self):
+        ops = """
+        [i0]
+        i1 = int_gt(i0, 0)
+        guard_true(i1) []
+        i2 = int_ge(0, i0)
+        guard_false(i2) []
+        jump(i0)
+        """
+        expected = """
+        [i0]
+        i1 = int_gt(i0, 0)
+        guard_true(i1) []
+        jump(i0)
+        """
+        self.optimize_loop(ops, 'Not', expected)
+
     def test_remove_consecutive_guard_value_constfold(self):
         ops = """
         []
@@ -411,7 +479,6 @@
         self.optimize_loop(ops, 'Not', expected)
 
     def test_int_is_true_1(self):
-        py.test.skip("XXX implement me")
         ops = """
         [i0]
         i1 = int_is_true(i0)
@@ -806,16 +873,10 @@
         guard_nonnull(p0) []
         i7 = ptr_ne(p0, p1)
         guard_true(i7) []
-        i8 = ptr_eq(p0, p1)
-        guard_false(i8) []
         i9 = ptr_ne(p0, p2)
         guard_true(i9) []
-        i10 = ptr_eq(p0, p2)
-        guard_false(i10) []
         i11 = ptr_ne(p2, p1)
         guard_true(i11) []
-        i12 = ptr_eq(p2, p1)
-        guard_false(i12) []
         jump(p0, p1, p2)
         """
         self.optimize_loop(ops, 'Not, Not, Not', expected2)

Modified: pypy/branch/fast-ctypes/pypy/jit/tl/pypyjit.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/tl/pypyjit.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/tl/pypyjit.py	Tue Aug 10 20:07:15 2010
@@ -37,6 +37,7 @@
 set_opt_level(config, level='jit')
 config.objspace.allworkingmodules = False
 config.objspace.usemodules.pypyjit = True
+config.objspace.usemodules.array = True
 config.objspace.usemodules._weakref = False
 config.objspace.usemodules._sre = False
 set_pypy_opt_level(config, level='jit')

Modified: pypy/branch/fast-ctypes/pypy/jit/tl/pypyjit_demo.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/jit/tl/pypyjit_demo.py	(original)
+++ pypy/branch/fast-ctypes/pypy/jit/tl/pypyjit_demo.py	Tue Aug 10 20:07:15 2010
@@ -1,38 +1,64 @@
-base = object
+## base = object
 
-class Number(base):
-    __slots__ = ('val', )
-    def __init__(self, val=0):
-        self.val = val
-
-    def __add__(self, other):
-        if not isinstance(other, int):
-            other = other.val
-        return Number(val=self.val + other)
+## class Number(base):
+##     __slots__ = ('val', )
+##     def __init__(self, val=0):
+##         self.val = val
+
+##     def __add__(self, other):
+##         if not isinstance(other, int):
+##             other = other.val
+##         return Number(val=self.val + other)
             
-    def __cmp__(self, other):
-        val = self.val
-        if not isinstance(other, int):
-            other = other.val
-        return cmp(val, other)
-
-    def __nonzero__(self):
-        return bool(self.val)
-
-def g(x, inc=2):
-    return x + inc
-
-def f(n, x, inc):
-    while x < n:
-        x = g(x, inc=1)
-    return x
-
-import time
-#t1 = time.time()
-#f(10000000, Number(), 1)
-#t2 = time.time()
-#print t2 - t1
-t1 = time.time()
-f(10000000, 0, 1)
-t2 = time.time()
-print t2 - t1
+##     def __cmp__(self, other):
+##         val = self.val
+##         if not isinstance(other, int):
+##             other = other.val
+##         return cmp(val, other)
+
+##     def __nonzero__(self):
+##         return bool(self.val)
+
+## def g(x, inc=2):
+##     return x + inc
+
+## def f(n, x, inc):
+##     while x < n:
+##         x = g(x, inc=1)
+##     return x
+
+## import time
+## #t1 = time.time()
+## #f(10000000, Number(), 1)
+## #t2 = time.time()
+## #print t2 - t1
+## t1 = time.time()
+## f(10000000, 0, 1)
+## t2 = time.time()
+## print t2 - t1
+
+try:
+    from array import array
+    def f(img):
+        i=0
+        sa=0
+        while i < img.__len__():
+            sa+=img[i]
+            i+=1
+        return sa
+
+    img=array('h',(1,2,3,4))
+    print f(img)
+except Exception, e:
+    print "Exception: ", type(e)
+    print e
+    
+## def f():
+##     a=7
+##     i=0
+##     while i<4:
+##         if  i<0: break
+##         if  i<0: break
+##         i+=1
+
+## f()

Modified: pypy/branch/fast-ctypes/pypy/module/_demo/demo.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/module/_demo/demo.py	(original)
+++ pypy/branch/fast-ctypes/pypy/module/_demo/demo.py	Tue Aug 10 20:07:15 2010
@@ -4,11 +4,14 @@
 from pypy.interpreter.typedef import TypeDef, GetSetProperty
 from pypy.rpython.lltypesystem import rffi, lltype
 from pypy.rpython.tool import rffi_platform
+from pypy.translator.tool.cbuild import ExternalCompilationInfo
 import sys, math
 
 time_t = rffi_platform.getsimpletype('time_t', '#include <time.h>', rffi.LONG)
 
-time = rffi.llexternal('time', [rffi.VOIDP], time_t, includes=['time.h'])
+eci = ExternalCompilationInfo(includes=['time.h'])
+time = rffi.llexternal('time', [int], time_t,
+                       compilation_info=eci)
 
 def get(space, name):
     w_module = space.getbuiltinmodule('_demo')
@@ -20,10 +23,10 @@
         w_DemoError = get(space, 'DemoError')
         msg = "repetition count must be > 0"
         raise OperationError(w_DemoError, space.wrap(msg))
-    starttime = time(None)
+    starttime = time(0)
     for i in range(repetitions):
         space.call_function(w_callable)
-    endtime = time(None)
+    endtime = time(0)
     return space.wrap(endtime - starttime)
 measuretime.unwrap_spec = [ObjSpace, int, W_Root]
 
@@ -62,11 +65,16 @@
         self.x = space.int_w(w_value)
 
 def mytype_new(space, w_subtype, x):
+    if x == 3:
+        return space.wrap(MySubType(space, x))
     return space.wrap(W_MyType(space, x))
 mytype_new.unwrap_spec = [ObjSpace, W_Root, int]
 
 getset_x = GetSetProperty(W_MyType.fget_x, W_MyType.fset_x, cls=W_MyType)
 
+class MySubType(W_MyType):
+    pass
+
 W_MyType.typedef = TypeDef('MyType',
     __new__ = interp2app(mytype_new),
     x = getset_x,

Modified: pypy/branch/fast-ctypes/pypy/module/_stackless/interp_coroutine.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/module/_stackless/interp_coroutine.py	(original)
+++ pypy/branch/fast-ctypes/pypy/module/_stackless/interp_coroutine.py	Tue Aug 10 20:07:15 2010
@@ -265,10 +265,14 @@
             instr += 1
             oparg = ord(code[instr]) | ord(code[instr + 1]) << 8
             nargs = oparg & 0xff
+            nkwds = (oparg >> 8) & 0xff
             if space.config.objspace.opcodes.CALL_METHOD and opcode == map['CALL_METHOD']:
-                chain = resume_state_create(chain, 'CALL_METHOD', frame,
-                                            nargs)
-            elif opcode == map['CALL_FUNCTION'] and (oparg >> 8) & 0xff == 0:
+                if nkwds == 0:     # only positional arguments
+                    chain = resume_state_create(chain, 'CALL_METHOD', frame,
+                                                nargs)
+                else:              # includes keyword arguments
+                    chain = resume_state_create(chain, 'CALL_METHOD_KW', frame)
+            elif opcode == map['CALL_FUNCTION'] and nkwds == 0:
                 # Only positional arguments
                 # case1: ("CALL_FUNCTION", f, nargs, returns=w_result)
                 chain = resume_state_create(chain, 'CALL_FUNCTION', frame,
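
For reference, the oparg layout decoded above: CALL_FUNCTION and CALL_METHOD pack two byte-sized counts into one operand, the positional-argument count in the low byte and the keyword-argument count in the byte above it. A minimal illustration (the call in the comment is hypothetical):

    oparg = (2 << 8) | 3           # e.g. obj.meth(a, b, c, x=1, y=2)
    nargs = oparg & 0xff           # 3 positional arguments
    nkwds = (oparg >> 8) & 0xff    # 2 keyword arguments
    assert (nargs, nkwds) == (3, 2)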

Modified: pypy/branch/fast-ctypes/pypy/module/pypyjit/policy.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/module/pypyjit/policy.py	(original)
+++ pypy/branch/fast-ctypes/pypy/module/pypyjit/policy.py	Tue Aug 10 20:07:15 2010
@@ -11,7 +11,7 @@
         if '.' in modname:
             modname, _ = modname.split('.', 1)
         if modname in ['pypyjit', 'signal', 'micronumpy', 'math', 'exceptions',
-                       'imp', 'sys']:
+                       'imp', 'sys', 'array']:
             return True
         return False
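
The whitelist above matches on the top-level package, so every submodule of array is now traced as well. A small sketch of the reduction (the submodule name is hypothetical):

    def look_inside(modname):
        if '.' in modname:
            modname, _ = modname.split('.', 1)
        return modname in ['pypyjit', 'signal', 'micronumpy', 'math',
                           'exceptions', 'imp', 'sys', 'array']

    assert look_inside('array.interp_array')   # hypothetical submodule name
    assert not look_inside('socket')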
 

Modified: pypy/branch/fast-ctypes/pypy/module/pypyjit/test/test_pypy_c.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/module/pypyjit/test/test_pypy_c.py	(original)
+++ pypy/branch/fast-ctypes/pypy/module/pypyjit/test/test_pypy_c.py	Tue Aug 10 20:07:15 2010
@@ -615,6 +615,237 @@
             return total
         ''', 170, ([], 4999450000L))
 
+    def test_boolrewrite_invers(self):
+        for a, b, res, ops in (('2000', '2000', 20001000, 51),
+                               ( '500',  '500', 15001500, 81),
+                               ( '300',  '600', 16001700, 83),
+                               (   'a',    'b', 16001700, 89),
+                               (   'a',    'a', 13001700, 85)):
+
+            self.run_source('''
+            def main():
+                sa = 0
+                a = 300
+                b = 600
+                for i in range(1000):
+                    if i < %s: sa += 1
+                    else: sa += 2
+                    if i >= %s: sa += 10000
+                    else: sa += 20000
+                return sa
+            '''%(a, b), ops, ([], res))
+
+    def test_boolrewrite_reflex(self):
+        for a, b, res, ops in (('2000', '2000', 10001000, 51),
+                               ( '500',  '500', 15001500, 81),
+                               ( '300',  '600', 14001700, 83),
+                               (   'a',    'b', 14001700, 89),
+                               (   'a',    'a', 17001700, 85)):
+
+            self.run_source('''
+            def main():
+                sa = 0
+                a = 300
+                b = 600
+                for i in range(1000):
+                    if i < %s: sa += 1
+                    else: sa += 2
+                    if %s > i: sa += 10000
+                    else: sa += 20000
+                return sa
+            '''%(a, b), ops, ([], res))
+
+
+    def test_boolrewrite_correct_invers(self):
+        def opval(i, op, a):
+            if eval('%d %s %d' % (i, op, a)): return 1
+            return 2
+
+        ops = ('<', '>', '<=', '>=', '==', '!=')        
+        for op1 in ops:
+            for op2 in ops:
+                for a,b in ((500, 500), (300, 600)):
+                    res = 0
+                    res += opval(a-1, op1, a) * (a)
+                    res += opval(  a, op1, a) 
+                    res += opval(a+1, op1, a) * (1000 - a - 1)
+                    res += opval(b-1, op2, b) * 10000 * (b)
+                    res += opval(  b, op2, b) * 10000 
+                    res += opval(b+1, op2, b) * 10000 * (1000 - b - 1)
+
+                    self.run_source('''
+                    def main():
+                        sa = 0
+                        for i in range(1000):
+                            if i %s %d: sa += 1
+                            else: sa += 2
+                            if i %s %d: sa += 10000
+                            else: sa += 20000
+                        return sa
+                    '''%(op1, a, op2, b), 83, ([], res))
+
+                    self.run_source('''
+                    def main():
+                        sa = 0
+                        i = 0.0
+                        while i < 250.0:
+                            if i %s %f: sa += 1
+                            else: sa += 2
+                            if i %s %f: sa += 10000
+                            else: sa += 20000
+                            i += 0.25
+                        return sa
+                    '''%(op1, float(a)/4.0, op2, float(b)/4.0), 109, ([], res))
+                    
+
+    def test_boolrewrite_correct_reflex(self):
+        def opval(i, op, a):
+            if eval('%d %s %d' % (i, op, a)): return 1
+            return 2
+
+        ops = ('<', '>', '<=', '>=', '==', '!=')        
+        for op1 in ops:
+            for op2 in ops:
+                for a,b in ((500, 500), (300, 600)):
+                    res = 0
+                    res += opval(a-1, op1, a) * (a)
+                    res += opval(  a, op1, a) 
+                    res += opval(a+1, op1, a) * (1000 - a - 1)
+                    res += opval(b, op2, b-1) * 10000 * (b)
+                    res += opval(b, op2,   b) * 10000
+                    res += opval(b, op2, b+1) * 10000 * (1000 - b - 1)
+
+                    self.run_source('''
+                    def main():
+                        sa = 0
+                        for i in range(1000):
+                            if i %s %d: sa += 1
+                            else: sa += 2
+                            if %d %s i: sa += 10000
+                            else: sa += 20000
+                        return sa
+                    '''%(op1, a, b, op2), 83, ([], res))
+
+                    self.run_source('''
+                    def main():
+                        sa = 0
+                        i = 0.0
+                        while i < 250.0:
+                            if i %s %f: sa += 1
+                            else: sa += 2
+                            if %f %s i: sa += 10000
+                            else: sa += 20000
+                            i += 0.25
+                        return sa
+                    '''%(op1, float(a)/4.0, float(b)/4.0, op2), 109, ([], res))
+
+    def test_boolrewrite_ptr(self):
+        compares = ('a == b', 'b == a', 'a != b', 'b != a', 'a == c', 'c != b')
+        for e1 in compares:
+            for e2 in compares:
+                a, b, c = 1, 2, 3
+                if eval(e1): res = 752 * 1 
+                else: res = 752 * 2 
+                if eval(e2): res += 752 * 10000 
+                else: res += 752 * 20000 
+                a = b
+                if eval(e1): res += 248 * 1
+                else: res += 248 * 2
+                if eval(e2): res += 248 * 10000
+                else: res += 248 * 20000
+
+
+                if 'c' in e1 or 'c' in e2:
+                    n = 337
+                else:
+                    n = 215
+
+                self.run_source('''
+                class tst:
+                    pass
+                def main():
+                    a = tst()
+                    b = tst()
+                    c = tst()
+                    sa = 0
+                    for i in range(1000):
+                        if %s: sa += 1
+                        else: sa += 2
+                        if %s: sa += 10000
+                        else: sa += 20000
+                        if i > 750: a = b
+                    return sa
+                '''%(e1, e2), n, ([], res))
+
+    def test_array_sum(self):
+        for tc, maxops in zip('bhilBHILfd', (38,) * 6 + (40, 40, 41, 38)):
+            res = 19352859
+            if tc in 'IL':
+                res = long(res)
+            elif tc in 'fd':
+                res = float(res)
+            
+            self.run_source('''
+            from array import array
+
+            def main():
+                img = array("%s", range(127) * 5) * 484
+                l, i = 0, 0
+                while i < 640 * 480:
+                    l += img[i]
+                    i += 1
+                return l
+            ''' % tc, maxops, ([], res))
+
+    def test_array_sum_char(self):
+        self.run_source('''
+            from array import array
+
+            def main():
+                img = array("c", "Hello") * 130 * 480
+                l, i = 0, 0
+                while i < 640 * 480:
+                    l += ord(img[i])
+                    i += 1
+                return l
+            ''', 60, ([], 30720000))
+
+    def test_array_sum_unicode(self):
+        self.run_source('''
+            from array import array
+
+            def main():
+                img = array("u", u"Hello") * 130 * 480
+                l, i = 0, 0
+                while i < 640 * 480:
+                    if img[i] == u"l":
+                        l += 1
+                    i += 1
+                return l
+            ''', 65, ([], 122880))
+
+    def test_array_intimg(self):
+        for tc, maxops in zip('ilILd', (67, 67, 69, 69, 61)):
+            res = 73574560
+            if tc in 'IL':
+                res = long(res)
+            elif tc in 'fd':
+                res = float(res)
+            
+            self.run_source('''
+            from array import array
+
+            def main(tc):
+                img = array(tc, range(3)) * (350 * 480)
+                intimg = array(tc, (0,)) * (640 * 480)
+                l, i = 0, 640
+                while i < 640 * 480:
+                    l = l + img[i]
+                    intimg[i] = (intimg[i-640] + l) 
+                    i += 1
+                return intimg[i - 1]
+            ''', maxops, ([tc], res))
+
 class AppTestJIT(PyPyCJITTests):
     def setup_class(cls):
         if not option.runappdirect:
@@ -637,6 +868,7 @@
         cls.counter = 0
         cls.pypy_c = option.pypy_c
 
+
 def has_info(pypy_c, option):
     g = os.popen('"%s" --info' % pypy_c, 'r')
     lines = g.readlines()
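
Where the expected values in test_boolrewrite_invers above come from, taking its first two rows as worked examples: with a = b = '2000' every i in range(1000) satisfies i < 2000 (sa += 1) and fails i >= 2000 (sa += 20000); with a = b = '500' each branch flips halfway through:

    print 1000 * 1 + 1000 * 20000                          # 20001000
    print 500 * 1 + 500 * 2 + 500 * 10000 + 500 * 20000    # 15001500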

Modified: pypy/branch/fast-ctypes/pypy/module/signal/interp_signal.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/module/signal/interp_signal.py	(original)
+++ pypy/branch/fast-ctypes/pypy/module/signal/interp_signal.py	Tue Aug 10 20:07:15 2010
@@ -7,6 +7,7 @@
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
 import py
 from pypy.tool import autopath
+from pypy.rlib import jit
 
 def setup():
     for key, value in cpy_signal.__dict__.items():
@@ -159,10 +160,12 @@
     return space.wrap(SIG_DFL)
 getsignal.unwrap_spec = [ObjSpace, int]
 
+ at jit.dont_look_inside
 def alarm(space, timeout):
     return space.wrap(c_alarm(timeout))
 alarm.unwrap_spec = [ObjSpace, int]
 
+ at jit.dont_look_inside
 def pause(space):
     c_pause()
     return space.w_None
@@ -173,6 +176,7 @@
         raise OperationError(space.w_ValueError,
                              space.wrap("signal number out of range"))
 
+ at jit.dont_look_inside
 def signal(space, signum, w_handler):
     """
     signal(sig, action) -> action
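
A note on the three decorators added above: jit.dont_look_inside marks a function as opaque to the tracer, so the JIT compiles a residual call instead of tracing into it; that is the right treatment for alarm(), pause() and signal(), which poke at OS-level signal state. A minimal hedged sketch of the pattern (the function is a stand-in, and the import assumes a PyPy checkout):

    from pypy.rlib import jit

    @jit.dont_look_inside
    def touch_os_signal_state():    # stand-in for alarm()/pause()/signal()
        pass                        # called, never inlined, from JITted code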

Modified: pypy/branch/fast-ctypes/pypy/objspace/std/callmethod.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/objspace/std/callmethod.py	(original)
+++ pypy/branch/fast-ctypes/pypy/objspace/std/callmethod.py	Tue Aug 10 20:07:15 2010
@@ -12,7 +12,7 @@
 
 from pypy.interpreter import function
 from pypy.objspace.descroperation import object_getattribute
-from pypy.rlib import rstack # for resume points
+from pypy.rlib import jit, rstack # for resume points
 
 # This module exports two extra methods for StdObjSpaceFrame implementing
 # the LOOKUP_METHOD and CALL_METHOD opcodes in an efficient way, as well
@@ -56,16 +56,41 @@
     f.pushvalue(w_value)
     f.pushvalue(None)
 
-def CALL_METHOD(f, nargs, *ignored):
-    # 'nargs' is the argument count excluding the implicit 'self'
-    w_self = f.peekvalue(nargs)
-    w_callable = f.peekvalue(nargs + 1)
-    n = nargs + (w_self is not None)
-    try:
-        w_result = f.space.call_valuestack(w_callable, n, f)
-        rstack.resume_point("CALL_METHOD", f, nargs, returns=w_result)
-    finally:
-        f.dropvalues(nargs + 2)
+ at jit.unroll_safe
+def CALL_METHOD(f, oparg, *ignored):
+    # 'oparg' packs the positional and keyword argument counts, excluding the implicit 'self'
+    n_args = oparg & 0xff
+    n_kwargs = (oparg >> 8) & 0xff
+    w_self = f.peekvalue(n_args + (2 * n_kwargs))
+    n = n_args + (w_self is not None)
+    
+    if not n_kwargs:
+        w_callable = f.peekvalue(n_args + (2 * n_kwargs) + 1)
+        try:
+            w_result = f.space.call_valuestack(w_callable, n, f)
+            rstack.resume_point("CALL_METHOD", f, n_args, returns=w_result)
+        finally:
+            f.dropvalues(n_args + 2)
+    else:
+        keywords = [None] * n_kwargs
+        keywords_w = [None] * n_kwargs
+        while True:
+            n_kwargs -= 1
+            if n_kwargs < 0:
+                break
+            w_value = f.popvalue()
+            w_key = f.popvalue()
+            key = f.space.str_w(w_key)
+            keywords[n_kwargs] = key
+            keywords_w[n_kwargs] = w_value
+    
+        arguments = f.popvalues(n)    # includes w_self if it is not None
+        args = f.argument_factory(arguments, keywords, keywords_w, None, None)
+        if w_self is None:
+            f.popvalue()    # removes w_self, which is None
+        w_callable = f.popvalue()
+        w_result = f.space.call_args(w_callable, args)
+        rstack.resume_point("CALL_METHOD_KW", f, returns=w_result)
     f.pushvalue(w_result)
 
 

Modified: pypy/branch/fast-ctypes/pypy/objspace/std/itertype.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/objspace/std/itertype.py	(original)
+++ pypy/branch/fast-ctypes/pypy/objspace/std/itertype.py	Tue Aug 10 20:07:15 2010
@@ -1,5 +1,6 @@
 from pypy.interpreter import gateway
 from pypy.objspace.std.stdtypedef import StdTypeDef
+from pypy.interpreter.error import OperationError
 
 # ____________________________________________________________
 
@@ -8,6 +9,11 @@
     XXX to do: remove this __reduce__ method and do
     a registration with copy_reg, instead.
     """
+
+    # CPython does not support pickling iterators
+    msg = 'Pickling for iterators disabled as CPython does not support it'
+    raise OperationError(space.w_TypeError, space.wrap(msg))
+
     from pypy.objspace.std.iterobject import W_AbstractSeqIterObject
     assert isinstance(w_self, W_AbstractSeqIterObject)
     from pypy.interpreter.mixedmodule import MixedModule
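
This mirrors CPython 2.x, where pickling a plain sequence iterator already fails; a quick interactive check (the exact message varies by iterator type):

    import pickle
    try:
        pickle.dumps(iter([1, 2, 3]))
    except TypeError, e:
        print e     # e.g. "can't pickle listiterator objects"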

Modified: pypy/branch/fast-ctypes/pypy/objspace/std/model.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/objspace/std/model.py	(original)
+++ pypy/branch/fast-ctypes/pypy/objspace/std/model.py	Tue Aug 10 20:07:15 2010
@@ -88,6 +88,7 @@
         import pypy.objspace.std.default # register a few catch-all multimethods
 
         import pypy.objspace.std.marshal_impl # install marshal multimethods
+        import pypy.module.array
 
         # the set of implementation types
         self.typeorder = {
@@ -140,6 +141,8 @@
 
         # check if we missed implementations
         for implcls in _registered_implementations:
+            if hasattr(implcls, 'register'):
+                implcls.register(self.typeorder)
             assert (implcls in self.typeorder or
                     implcls in self.imported_but_not_registered), (
                 "please add %r in StdTypeModel.typeorder" % (implcls,))

Modified: pypy/branch/fast-ctypes/pypy/objspace/std/test/test_callmethod.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/objspace/std/test/test_callmethod.py	(original)
+++ pypy/branch/fast-ctypes/pypy/objspace/std/test/test_callmethod.py	Tue Aug 10 20:07:15 2010
@@ -106,6 +106,15 @@
             else:
                 raise Exception("did not raise?")
         """
+    
+    def test_kwargs(self):
+        exec """if 1:
+            class C(object):
+                def f(self, a):
+                    return a + 2
+            
+            assert C().f(a=3) == 5
+        """
 
 
 class AppTestCallMethodWithGetattributeShortcut(AppTestCallMethod):

Modified: pypy/branch/fast-ctypes/pypy/rlib/objectmodel.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/rlib/objectmodel.py	(original)
+++ pypy/branch/fast-ctypes/pypy/rlib/objectmodel.py	Tue Aug 10 20:07:15 2010
@@ -82,6 +82,17 @@
         
 specialize = _Specialize()
 
+def enforceargs(*args):
+    """ Decorate a function with forcing of RPython-level types on arguments.
+    None means no enforcing.
+
+    XXX shouldn't we also add asserts in function body?
+    """
+    def decorator(f):
+        f._annenforceargs_ = args
+        return f
+    return decorator
+
 # ____________________________________________________________
 
 class Symbolic(object):
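
A usage sketch for the new decorator: it only records the expected RPython-level types on the function object, and it is the annotator that later consumes _annenforceargs_ (the function here is hypothetical, modelled on the rstr.py use further down):

    from pypy.rlib.objectmodel import enforceargs

    @enforceargs(None, None, int, int, int)
    def copy_chunk(src, dst, srcstart, dststart, length):
        pass

    assert copy_chunk._annenforceargs_ == (None, None, int, int, int)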

Modified: pypy/branch/fast-ctypes/pypy/rlib/rmmap.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/rlib/rmmap.py	(original)
+++ pypy/branch/fast-ctypes/pypy/rlib/rmmap.py	Tue Aug 10 20:07:15 2010
@@ -646,8 +646,14 @@
         hintp = rffi.cast(PTR, hint.pos)
         res = c_mmap_safe(hintp, map_size, prot, flags, -1, 0)
         if res == rffi.cast(PTR, -1):
-            raise MemoryError
-        hint.pos += map_size
+            # some systems (some versions of OS/X?) complain if they
+            # are passed a non-zero address.  Try again.
+            hintp = rffi.cast(PTR, 0)
+            res = c_mmap_safe(hintp, map_size, prot, flags, -1, 0)
+            if res == rffi.cast(PTR, -1):
+                raise MemoryError
+        else:
+            hint.pos += map_size
         return res
     alloc._annenforceargs_ = (int,)
 

Modified: pypy/branch/fast-ctypes/pypy/rlib/test/test_objectmodel.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/rlib/test/test_objectmodel.py	(original)
+++ pypy/branch/fast-ctypes/pypy/rlib/test/test_objectmodel.py	Tue Aug 10 20:07:15 2010
@@ -404,6 +404,13 @@
 
     assert f._annspecialcase_ == 'specialize:arg(1)'
 
+def test_enforceargs_decorator():
+    @enforceargs(int, str, None)
+    def f(a, b, c):
+        pass
+
+    assert f._annenforceargs_ == (int, str, None)
+
 def getgraph(f, argtypes):
     from pypy.translator.translator import TranslationContext, graphof
     from pypy.translator.backendopt.all import backend_optimizations

Modified: pypy/branch/fast-ctypes/pypy/rpython/lltypesystem/rstr.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/rpython/lltypesystem/rstr.py	(original)
+++ pypy/branch/fast-ctypes/pypy/rpython/lltypesystem/rstr.py	Tue Aug 10 20:07:15 2010
@@ -2,7 +2,7 @@
 from pypy.tool.pairtype import pairtype
 from pypy.rpython.error import TyperError
 from pypy.rlib.objectmodel import malloc_zero_filled, we_are_translated
-from pypy.rlib.objectmodel import _hash_string
+from pypy.rlib.objectmodel import _hash_string, enforceargs
 from pypy.rlib.debug import ll_assert
 from pypy.rlib.jit import purefunction
 from pypy.rpython.robject import PyObjRepr, pyobj_repr
@@ -56,6 +56,7 @@
                 llmemory.itemoffsetof(TP.chars, 0) +
                 llmemory.sizeof(CHAR_TP) * item)
 
+    @enforceargs(None, None, int, int, int)
     def copy_string_contents(src, dst, srcstart, dststart, length):
         assert srcstart >= 0
         assert dststart >= 0

Modified: pypy/branch/fast-ctypes/pypy/rpython/rstr.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/rpython/rstr.py	(original)
+++ pypy/branch/fast-ctypes/pypy/rpython/rstr.py	Tue Aug 10 20:07:15 2010
@@ -288,8 +288,8 @@
         if not hop.args_s[1].is_constant():
             raise TyperError("encoding must be constant")
         encoding = hop.args_s[1].const
-        if encoding == "ascii":
-            expect = self.lowleveltype   # can be a UniChar
+        if encoding == "ascii" and self.lowleveltype == UniChar:
+            expect = UniChar             # only for unichar.encode('ascii')
         else:
             expect = self.repr           # must be a regular unicode string
         v_self = hop.inputarg(expect, 0)

Modified: pypy/branch/fast-ctypes/pypy/rpython/tool/rfficache.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/rpython/tool/rfficache.py	(original)
+++ pypy/branch/fast-ctypes/pypy/rpython/tool/rfficache.py	Tue Aug 10 20:07:15 2010
@@ -29,7 +29,7 @@
     }
     ''' % (include_string, add_source, str(question)))
     c_file = udir.join("gcctest.c")
-    c_file.write(c_source)
+    c_file.write(str(c_source) + '\n')
     eci = ExternalCompilationInfo()
     return build_executable_cache([c_file], eci)
 

Modified: pypy/branch/fast-ctypes/pypy/translator/c/genc.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/translator/c/genc.py	(original)
+++ pypy/branch/fast-ctypes/pypy/translator/c/genc.py	Tue Aug 10 20:07:15 2010
@@ -1,7 +1,6 @@
 import autopath
 import py
 import sys, os
-from pypy.translator.c.node import PyObjectNode, FuncNode
 from pypy.translator.c.database import LowLevelDatabase
 from pypy.translator.c.extfunc import pre_include_code_lines
 from pypy.translator.llsupport.wrapper import new_wrapper
@@ -196,7 +195,7 @@
 
         all = []
         for node in self.db.globalcontainers():
-            eci = getattr(node, 'compilation_info', None)
+            eci = node.compilation_info()
             if eci:
                 all.append(eci)
         self.merge_eci(*all)
@@ -222,7 +221,7 @@
         graphs = db.all_graphs()
         db.gctransformer.prepare_inline_helpers(graphs)
         for node in db.containerlist:
-            if isinstance(node, FuncNode):
+            if hasattr(node, 'funcgens'):
                 for funcgen in node.funcgens:
                     funcgen.patch_graph(copy_graph=False)
         return db

Modified: pypy/branch/fast-ctypes/pypy/translator/c/node.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/translator/c/node.py	(original)
+++ pypy/branch/fast-ctypes/pypy/translator/c/node.py	Tue Aug 10 20:07:15 2010
@@ -466,15 +466,15 @@
 
 
 class ContainerNode(object):
-    if USESLOTS:
-        __slots__ = """db T obj 
+    if USESLOTS:      # keep the number of slots down!
+        __slots__ = """db obj 
                        typename implementationtypename
-                        name ptrname compilation_info
+                        name ptrname
                         globalcontainer""".split()
+    eci_name = '_compilation_info'
 
     def __init__(self, db, T, obj):
         self.db = db
-        self.T = T
         self.obj = obj
         #self.dependencies = {}
         self.typename = db.gettype(T)  #, who_asks=self)
@@ -489,16 +489,22 @@
         else:
             self.globalcontainer = False
             parentnode = db.getcontainernode(parent)
-            defnode = db.gettypedefnode(parentnode.T)
+            defnode = db.gettypedefnode(parentnode.getTYPE())
             self.name = defnode.access_expr(parentnode.name, parentindex)
         if self.typename != self.implementationtypename:
             if db.gettypedefnode(T).extra_union_for_varlength:
                 self.name += '.b'
-        self.compilation_info = getattr(obj, '_compilation_info', None)
         self.ptrname = '(&%s)' % self.name
 
+    def getTYPE(self):
+        return typeOf(self.obj)
+
     def is_thread_local(self):
-        return hasattr(self.T, "_hints") and self.T._hints.get('thread_local')
+        T = self.getTYPE()
+        return hasattr(T, "_hints") and T._hints.get('thread_local')
+
+    def compilation_info(self):
+        return getattr(self.obj, self.eci_name, None)
 
     def get_declaration(self):
         if self.name[-2:] == '.b':
@@ -546,27 +552,31 @@
         __slots__ = ()
 
     def basename(self):
-        return self.T._name
+        T = self.getTYPE()
+        return T._name
 
     def enum_dependencies(self):
-        for name in self.T._names:
+        T = self.getTYPE()
+        for name in T._names:
             yield getattr(self.obj, name)
 
     def getlength(self):
-        if self.T._arrayfld is None:
+        T = self.getTYPE()
+        if T._arrayfld is None:
             return 1
         else:
-            array = getattr(self.obj, self.T._arrayfld)
+            array = getattr(self.obj, T._arrayfld)
             return len(array.items)
 
     def initializationexpr(self, decoration=''):
+        T = self.getTYPE()
         is_empty = True
         yield '{'
-        defnode = self.db.gettypedefnode(self.T)
+        defnode = self.db.gettypedefnode(T)
 
         data = []
 
-        if needs_gcheader(self.T):
+        if needs_gcheader(T):
             gc_init = self.db.gcpolicy.struct_gcheader_initdata(self)
             data.append(('gcheader', gc_init))
 
@@ -578,16 +588,16 @@
         # '.fieldname = value'.  But here we don't know which of the
         # fields need initialization, so XXX we pick the first one
         # arbitrarily.
-        if hasattr(self.T, "_hints") and self.T._hints.get('union'):
+        if hasattr(T, "_hints") and T._hints.get('union'):
             data = data[0:1]
 
-        if 'get_padding_drop' in self.T._hints:
+        if 'get_padding_drop' in T._hints:
             d = {}
             for name, _ in data:
-                T = defnode.c_struct_field_type(name)
-                typename = self.db.gettype(T)
+                T1 = defnode.c_struct_field_type(name)
+                typename = self.db.gettype(T1)
                 d[name] = cdecl(typename, '')
-            padding_drop = self.T._hints['get_padding_drop'](d)
+            padding_drop = T._hints['get_padding_drop'](d)
         else:
             padding_drop = []
 
@@ -617,9 +627,10 @@
         return 'struct _hashT_%s @' % self.name
 
     def forward_declaration(self):
+        T = self.getTYPE()
         assert self.typename == self.implementationtypename  # no array part
         hash_typename = self.get_hash_typename()
-        hash_offset = self.db.gctransformer.get_hash_offset(self.T)
+        hash_offset = self.db.gctransformer.get_hash_offset(T)
         yield '%s {' % cdecl(hash_typename, '')
         yield '\tunion {'
         yield '\t\t%s;' % cdecl(self.implementationtypename, 'head')
@@ -671,22 +682,23 @@
         return len(self.obj.items)
 
     def initializationexpr(self, decoration=''):
-        defnode = self.db.gettypedefnode(self.T)
+        T = self.getTYPE()
+        defnode = self.db.gettypedefnode(T)
         yield '{'
-        if needs_gcheader(self.T):
+        if needs_gcheader(T):
             gc_init = self.db.gcpolicy.array_gcheader_initdata(self)
             lines = generic_initializationexpr(self.db, gc_init, 'gcheader',
                                                '%sgcheader' % (decoration,))
             for line in lines:
                 yield line
-        if self.T._hints.get('nolength', False):
+        if T._hints.get('nolength', False):
             length = ''
         else:
             length = '%d, ' % len(self.obj.items)
-        if self.T.OF is Void or len(self.obj.items) == 0:
+        if T.OF is Void or len(self.obj.items) == 0:
             yield '\t%s' % length.rstrip(', ')
             yield '}'
-        elif self.T.OF == Char:
+        elif T.OF == Char:
             if len(self.obj.items) and self.obj.items[0] is None:
                 s = ''.join([self.obj.getitem(i) for i in range(len(self.obj.items))])
             else:
@@ -694,7 +706,7 @@
             yield '\t%s%s' % (length, c_char_array_constant(s))
             yield '}'
         else:
-            barebone = barebonearray(self.T)
+            barebone = barebonearray(T)
             if not barebone:
                 yield '\t%s{' % length
             for j in range(len(self.obj.items)):
@@ -722,7 +734,8 @@
             self.ptrname = self.name
 
     def basename(self):
-        return self.T._name
+        T = self.getTYPE()
+        return T._name
 
     def enum_dependencies(self):
         for i in range(self.obj.getlength()):
@@ -732,11 +745,12 @@
         return 1    # not variable-sized!
 
     def initializationexpr(self, decoration=''):
+        T = self.getTYPE()
         assert self.typename == self.implementationtypename  # not var-sized
         is_empty = True
         yield '{'
         # _names == ['item0', 'item1', ...]
-        for j, name in enumerate(self.T._names):
+        for j, name in enumerate(T._names):
             value = getattr(self.obj, name)
             lines = generic_initializationexpr(self.db, value,
                                                '%s[%d]' % (self.name, j),
@@ -777,6 +791,7 @@
 
 class FuncNode(ContainerNode):
     nodekind = 'func'
+    eci_name = 'compilation_info'
     # there are not so many nodes of this kind; slots should not
     # be necessary
 
@@ -794,7 +809,6 @@
         else:
             self.name = (forcename or
                          db.namespace.uniquename('g_' + self.basename()))
-        self.compilation_info = getattr(obj, 'compilation_info', None)
         self.make_funcgens()
         #self.dependencies = {}
         self.typename = db.gettype(T)  #, who_asks=self)
@@ -939,18 +953,20 @@
         return []
 
     def initializationexpr(self, decoration=''):
-        yield 'RPyOpaque_INITEXPR_%s' % (self.T.tag,)
+        T = self.getTYPE()
+        yield 'RPyOpaque_INITEXPR_%s' % (T.tag,)
 
     def startupcode(self):
+        T = self.getTYPE()
         args = [self.ptrname]
         # XXX how to make this code more generic?
-        if self.T.tag == 'ThreadLock':
+        if T.tag == 'ThreadLock':
             lock = self.obj.externalobj
             if lock.locked():
                 args.append('1')
             else:
                 args.append('0')
-        yield 'RPyOpaque_SETUP_%s(%s);' % (self.T.tag, ', '.join(args))
+        yield 'RPyOpaque_SETUP_%s(%s);' % (T.tag, ', '.join(args))
 
 
 def opaquenode_factory(db, T, obj):

Modified: pypy/branch/fast-ctypes/pypy/translator/platform/posix.py
==============================================================================
--- pypy/branch/fast-ctypes/pypy/translator/platform/posix.py	(original)
+++ pypy/branch/fast-ctypes/pypy/translator/platform/posix.py	Tue Aug 10 20:07:15 2010
@@ -14,7 +14,10 @@
 
     def __init__(self, cc=None):
         if cc is None:
-            cc = 'gcc'
+            try:
+                cc = os.environ['CC']
+            except KeyError:
+                cc = 'gcc'
         self.cc = cc
 
     def _libs(self, libraries):
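
With this change the compiler used for posix builds can be picked per-invocation via the environment; the try/except above is equivalent to the one-liner below (the command in the comment is a hypothetical example):

    import os
    # e.g.:  CC=clang python translate.py ...
    cc = os.environ.get('CC', 'gcc')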


