[pypy-commit] pypy jit-simplify-backendintf: In-progress. Three complicated and long functions are gone :-)

arigo noreply at buildbot.pypy.org
Sun Dec 11 23:42:40 CET 2011


Author: Armin Rigo <arigo at tunes.org>
Branch: jit-simplify-backendintf
Changeset: r50402:a4f5ba4d79b4
Date: 2011-12-11 23:42 +0100
http://bitbucket.org/pypy/pypy/changeset/a4f5ba4d79b4/

Log:	In-progress. Three complicated and long functions are gone :-)

diff --git a/pypy/jit/backend/llsupport/regalloc.py b/pypy/jit/backend/llsupport/regalloc.py
--- a/pypy/jit/backend/llsupport/regalloc.py
+++ b/pypy/jit/backend/llsupport/regalloc.py
@@ -69,6 +69,8 @@
         self.bindings[box] = loc
         #
         index = self.get_loc_index(loc)
+        if index < 0:
+            return
         endindex = index + self.frame_size(box.type)
         while len(self.used) < endindex:
             self.used.append(False)
@@ -91,6 +93,8 @@
         #
         size = self.frame_size(box.type)
         baseindex = self.get_loc_index(loc)
+        if baseindex < 0:
+            return
         for i in range(size):
             index = baseindex + i
             assert 0 <= index < len(self.used)
@@ -98,7 +102,8 @@
 
     def try_to_reuse_location(self, box, loc):
         index = self.get_loc_index(loc)
-        assert index >= 0
+        if index < 0:
+            return False
         size = self.frame_size(box.type)
         for i in range(size):
             while (index + i) >= len(self.used):
@@ -158,7 +163,7 @@
         if not we_are_translated() and self.box_types is not None:
             assert isinstance(v, TempBox) or v.type in self.box_types
 
-    def possibly_free_var(self, v):
+    def possibly_free_var(self, v, _hint_dont_reuse_quickly=False):
         """ If v is stored in a register and v is not used beyond the
             current position, then free it.  Must be called at some
             point for all variables that might be in registers.
@@ -168,7 +173,10 @@
             return
         if v not in self.longevity or self.longevity[v][1] <= self.position:
             if v in self.reg_bindings:
-                self.free_regs.append(self.reg_bindings[v])
+                if _hint_dont_reuse_quickly:
+                    self.free_regs.insert(0, self.reg_bindings[v])
+                else:
+                    self.free_regs.append(self.reg_bindings[v])
                 del self.reg_bindings[v]
             if self.frame_manager is not None:
                 self.frame_manager.mark_as_free(v)
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -421,10 +421,8 @@
 
     def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
         '''adds the following attributes to looptoken:
-               _x86_loop_code       (an integer giving an address)
-               _x86_bootstrap_code  (an integer giving an address)
-               _x86_direct_bootstrap_code  ( "    "     "    "   )
-               _x86_arglocs
+               _x86_function_addr   (address of the generated func, as an int)
+               _x86_loop_code       (debug: addr of the start of the ResOps)
                _x86_debug_checksum
         '''
         # XXX this function is too longish and contains some code
@@ -445,12 +443,11 @@
             operations = self._inject_debugging_code(looptoken, operations)
 
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
-        arglocs, operations = regalloc.prepare_loop(inputargs, operations,
-                                                    looptoken, clt.allgcrefs)
-        looptoken._x86_arglocs = arglocs
-
-        bootstrappos = self.mc.get_relative_pos()
-        stackadjustpos = self._assemble_bootstrap_code(inputargs, arglocs)
+        #
+        self._call_header_with_stack_check()
+        stackadjustpos = self._patchable_stackadjust()
+        operations = regalloc.prepare_loop(inputargs, operations,
+                                           looptoken, clt.allgcrefs)
         looppos = self.mc.get_relative_pos()
         looptoken._x86_loop_code = looppos
         clt.frame_depth = -1     # temporarily
@@ -458,19 +455,17 @@
         frame_depth, param_depth = self._assemble(regalloc, operations)
         clt.frame_depth = frame_depth
         clt.param_depth = param_depth
-
-        directbootstrappos = self.mc.get_relative_pos()
-        self._assemble_bootstrap_direct_call(arglocs, looppos,
-                                             frame_depth+param_depth)
+        #
+        size_excluding_failure_stuff = self.mc.get_relative_pos()
         self.write_pending_failure_recoveries()
-        fullsize = self.mc.get_relative_pos()
+        full_size = self.mc.get_relative_pos()
         #
         rawstart = self.materialize_loop(looptoken)
         debug_start("jit-backend-addr")
         debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
             looptoken.number, loopname,
             rawstart + looppos,
-            rawstart + directbootstrappos,
+            rawstart + size_excluding_failure_stuff,
             rawstart))
         debug_stop("jit-backend-addr")
         self._patch_stackadjust(rawstart + stackadjustpos,
@@ -481,18 +476,17 @@
         if not we_are_translated():
             # used only by looptoken.dump() -- useful in tests
             looptoken._x86_rawstart = rawstart
-            looptoken._x86_fullsize = fullsize
+            looptoken._x86_fullsize = full_size
             looptoken._x86_ops_offset = ops_offset
+        looptoken._x86_function_addr = rawstart
 
-        looptoken._x86_bootstrap_code = rawstart + bootstrappos
-        looptoken._x86_direct_bootstrap_code = rawstart + directbootstrappos
         self.fixup_target_tokens(rawstart)
         self.teardown()
         # oprofile support
         if self.cpu.profile_agent is not None:
             name = "Loop # %s: %s" % (looptoken.number, loopname)
             self.cpu.profile_agent.native_code_written(name,
-                                                       rawstart, fullsize)
+                                                       rawstart, full_size)
         return ops_offset
 
     def assemble_bridge(self, faildescr, inputargs, operations,
@@ -802,98 +796,6 @@
             self.mc.MOV_ri(ebx.value, rst)           # MOV ebx, rootstacktop
             self.mc.SUB_mi8((ebx.value, 0), 2*WORD)  # SUB [ebx], 2*WORD
 
-    def _assemble_bootstrap_direct_call(self, arglocs, jmppos, stackdepth):
-        if IS_X86_64:
-            return self._assemble_bootstrap_direct_call_64(arglocs, jmppos, stackdepth)
-        # XXX pushing ebx esi and edi is a bit pointless, since we store
-        #     all regsiters anyway, for the case of guard_not_forced
-        # XXX this can be improved greatly. Right now it'll behave like
-        #     a normal call
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header_with_stack_check()
-        self.mc.LEA_rb(esp.value, self._get_offset_of_ebp_from_esp(stackdepth))
-        offset = 2 * WORD
-        tmp = eax
-        xmmtmp = xmm0
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is not None:
-                if isinstance(loc, RegLoc):
-                    assert not loc.is_xmm
-                    self.mc.MOV_rb(loc.value, offset)
-                else:
-                    self.mc.MOV_rb(tmp.value, offset)
-                    self.mc.MOV(loc, tmp)
-                offset += WORD
-            loc = floatlocs[i]
-            if loc is not None:
-                if isinstance(loc, RegLoc):
-                    assert loc.is_xmm
-                    self.mc.MOVSD_xb(loc.value, offset)
-                else:
-                    self.mc.MOVSD_xb(xmmtmp.value, offset)
-                    assert isinstance(loc, StackLoc)
-                    self.mc.MOVSD_bx(loc.value, xmmtmp.value)
-                offset += 2 * WORD
-        endpos = self.mc.get_relative_pos() + 5
-        self.mc.JMP_l(jmppos - endpos)
-        assert endpos == self.mc.get_relative_pos()
-
-    def _assemble_bootstrap_direct_call_64(self, arglocs, jmppos, stackdepth):
-        # XXX: Very similar to _emit_call_64
-
-        src_locs = []
-        dst_locs = []
-        xmm_src_locs = []
-        xmm_dst_locs = []
-        get_from_stack = []
-
-        # In reverse order for use with pop()
-        unused_gpr = [r9, r8, ecx, edx, esi, edi]
-        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
-
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header_with_stack_check()
-        self.mc.LEA_rb(esp.value, self._get_offset_of_ebp_from_esp(stackdepth))
-
-        # The lists are padded with Nones
-        assert len(nonfloatlocs) == len(floatlocs)
-
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is not None:
-                if len(unused_gpr) > 0:
-                    src_locs.append(unused_gpr.pop())
-                    dst_locs.append(loc)
-                else:
-                    get_from_stack.append((loc, False))
-
-            floc = floatlocs[i]
-            if floc is not None:
-                if len(unused_xmm) > 0:
-                    xmm_src_locs.append(unused_xmm.pop())
-                    xmm_dst_locs.append(floc)
-                else:
-                    get_from_stack.append((floc, True))
-
-        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs, X86_64_XMM_SCRATCH_REG)
-
-        for i in range(len(get_from_stack)):
-            loc, is_xmm = get_from_stack[i]
-            if is_xmm:
-                self.mc.MOVSD_xb(X86_64_XMM_SCRATCH_REG.value, (2 + i) * WORD)
-                self.mc.MOVSD(loc, X86_64_XMM_SCRATCH_REG)
-            else:
-                self.mc.MOV_rb(X86_64_SCRATCH_REG.value, (2 + i) * WORD)
-                # XXX: We're assuming that "loc" won't require regloc to
-                # clobber the scratch register
-                self.mc.MOV(loc, X86_64_SCRATCH_REG)
-
-        endpos = self.mc.get_relative_pos() + 5
-        self.mc.JMP_l(jmppos - endpos)
-        assert endpos == self.mc.get_relative_pos()
-
     def redirect_call_assembler(self, oldlooptoken, newlooptoken):
         # some minimal sanity checking
         oldnonfloatlocs, oldfloatlocs = oldlooptoken._x86_arglocs
@@ -909,45 +811,6 @@
         mc.JMP(imm(target))
         mc.copy_to_raw_memory(oldadr)
 
-    def _assemble_bootstrap_code(self, inputargs, arglocs):
-        nonfloatlocs, floatlocs = arglocs
-        self._call_header()
-        stackadjustpos = self._patchable_stackadjust()
-        tmp = eax
-        xmmtmp = xmm0
-        self.mc.begin_reuse_scratch_register()
-        for i in range(len(nonfloatlocs)):
-            loc = nonfloatlocs[i]
-            if loc is None:
-                continue
-            if isinstance(loc, RegLoc):
-                target = loc
-            else:
-                target = tmp
-            if inputargs[i].type == REF:
-                adr = self.fail_boxes_ptr.get_addr_for_num(i)
-                self.mc.MOV(target, heap(adr))
-                self.mc.MOV(heap(adr), imm0)
-            else:
-                adr = self.fail_boxes_int.get_addr_for_num(i)
-                self.mc.MOV(target, heap(adr))
-            if target is not loc:
-                assert isinstance(loc, StackLoc)
-                self.mc.MOV_br(loc.value, target.value)
-        for i in range(len(floatlocs)):
-            loc = floatlocs[i]
-            if loc is None:
-                continue
-            adr = self.fail_boxes_float.get_addr_for_num(i)
-            if isinstance(loc, RegLoc):
-                self.mc.MOVSD(loc, heap(adr))
-            else:
-                self.mc.MOVSD(xmmtmp, heap(adr))
-                assert isinstance(loc, StackLoc)
-                self.mc.MOVSD_bx(loc.value, xmmtmp.value)
-        self.mc.end_reuse_scratch_register()
-        return stackadjustpos
-
     def dump(self, text):
         if not self.verbose:
             return
@@ -2104,9 +1967,9 @@
         # returns in eax the fail_index
 
         # now we return from the complete frame, which starts from
-        # _assemble_bootstrap_code().  The LEA in _call_footer below throws
-        # away most of the frame, including all the PUSHes that we did just
-        # above.
+        # _call_header_with_stack_check().  The LEA in _call_footer below
+        # throws away most of the frame, including all the PUSHes that we
+        # did just above.
 
         self._call_footer()
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -28,7 +28,7 @@
 class X86RegisterManager(RegisterManager):
 
     box_types = [INT, REF]
-    all_regs = [eax, ecx, edx, ebx, esi, edi]
+    all_regs = [ecx, eax, edx, ebx, esi, edi]
     no_lower_byte_regs = [esi, edi]
     save_around_call_regs = [eax, edx, ecx]
     frame_reg = ebp
@@ -60,7 +60,7 @@
 
 class X86_64_RegisterManager(X86RegisterManager):
     # r11 omitted because it's used as scratch
-    all_regs = [eax, ecx, edx, ebx, esi, edi, r8, r9, r10, r12, r13, r14, r15]
+    all_regs = [ecx, eax, edx, ebx, esi, edi, r8, r9, r10, r12, r13, r14, r15]
     no_lower_byte_regs = []
     save_around_call_regs = [eax, ecx, edx, esi, edi, r8, r9, r10]
 
@@ -173,22 +173,26 @@
         operations = cpu.gc_ll_descr.rewrite_assembler(cpu, operations,
                                                        allgcrefs)
         # compute longevity of variables
-        longevity, useful = self._compute_vars_longevity(inputargs, operations)
+        longevity = self._compute_vars_longevity(inputargs, operations)
         self.longevity = longevity
         self.rm = gpr_reg_mgr_cls(longevity,
                                   frame_manager = self.fm,
                                   assembler = self.assembler)
         self.xrm = xmm_reg_mgr_cls(longevity, frame_manager = self.fm,
                                    assembler = self.assembler)
-        return operations, useful
+        return operations
 
     def prepare_loop(self, inputargs, operations, looptoken, allgcrefs):
-        operations, useful = self._prepare(inputargs, operations, allgcrefs)
-        return self._process_inputargs(inputargs, useful), operations
+        operations = self._prepare(inputargs, operations, allgcrefs)
+        self._set_initial_bindings(inputargs)
+        # note: we need to make a copy of inputargs because possibly_free_vars
+        # is also used on op args, which is a non-resizable list
+        self.possibly_free_vars(list(inputargs))
+        return operations
 
     def prepare_bridge(self, prev_depths, inputargs, arglocs, operations,
                        allgcrefs):
-        operations, _ = self._prepare(inputargs, operations, allgcrefs)
+        operations = self._prepare(inputargs, operations, allgcrefs)
         self._update_bindings(arglocs, inputargs)
         self.param_depth = prev_depths[1]
         return operations
@@ -196,46 +200,30 @@
     def reserve_param(self, n):
         self.param_depth = max(self.param_depth, n)
 
-    def _process_inputargs(self, inputargs, useful):
-        # XXX we can sort out here by longevity if we need something
-        # more optimal
-        floatlocs = [None] * len(inputargs)
-        nonfloatlocs = [None] * len(inputargs)
-        # Don't use all_regs[0] for passing arguments around a loop.
-        # Must be kept in sync with consider_jump().
-        # XXX this should probably go to llsupport/regalloc.py
-        xmmtmp = self.xrm.free_regs.pop(0)
-        tmpreg = self.rm.free_regs.pop(0)
-        assert tmpreg == X86RegisterManager.all_regs[0]
-        assert xmmtmp == X86XMMRegisterManager.all_regs[0]
+    def _set_initial_bindings(self, inputargs):
+        if IS_X86_64:
+            return self._set_initial_bindings_64(inputargs)
+        #                   ...
+        # stack layout:     arg2
+        #                   arg1
+        #                   arg0
+        #                   return address
+        #                   saved ebp        <-- ebp points here
+        #                   ...
+        cur_frame_pos = - 1 - FRAME_FIXED_SIZE
+        assert get_ebp_ofs(cur_frame_pos-1) == 2*WORD
+        assert get_ebp_ofs(cur_frame_pos-2) == 3*WORD
+        #
         for i in range(len(inputargs)):
-            arg = inputargs[i]
-            assert not isinstance(arg, Const)
-            reg = None
-            if self.longevity[arg][1] > -1 and arg in useful:
-                if arg.type == FLOAT:
-                    # xxx is it really a good idea?  at the first CALL they
-                    # will all be flushed anyway
-                    reg = self.xrm.try_allocate_reg(arg)
-                else:
-                    reg = self.rm.try_allocate_reg(arg)
-            if reg:
-                loc = reg
+            box = inputargs[i]
+            assert isinstance(box, Box)
+            #
+            if box.type == FLOAT:
+                cur_frame_pos -= 2
             else:
-                loc = self.fm.loc(arg)
-            if arg.type == FLOAT:
-                floatlocs[i] = loc
-            else:
-                nonfloatlocs[i] = loc
-            # otherwise we have it saved on stack, so no worry
-        self.rm.free_regs.insert(0, tmpreg)
-        self.xrm.free_regs.insert(0, xmmtmp)
-        assert tmpreg not in nonfloatlocs
-        assert xmmtmp not in floatlocs
-        # note: we need to make a copy of inputargs because possibly_free_vars
-        # is also used on op args, which is a non-resizable list
-        self.possibly_free_vars(list(inputargs))
-        return nonfloatlocs, floatlocs
+                cur_frame_pos -= 1
+            loc = self.fm.frame_pos(cur_frame_pos, box.type)
+            self.fm.set_binding(box, loc)
 
     def possibly_free_var(self, var):
         if var.type == FLOAT:
@@ -458,7 +446,7 @@
         # only to guard operations or to jump or to finish
         produced = {}
         last_used = {}
-        useful = {}
+        #useful = {}
         for i in range(len(operations)-1, -1, -1):
             op = operations[i]
             if op.result:
@@ -469,8 +457,8 @@
             opnum = op.getopnum()
             for j in range(op.numargs()):
                 arg = op.getarg(j)
-                if opnum != rop.JUMP and opnum != rop.FINISH:
-                    useful[arg] = None
+                #if opnum != rop.JUMP and opnum != rop.FINISH:
+                #    useful[arg] = None
                 if isinstance(arg, Box) and arg not in last_used:
                     last_used[arg] = i
             if op.is_guard():
@@ -496,7 +484,7 @@
                 longevity[arg] = (0, last_used[arg])
                 del last_used[arg]
         assert len(last_used) == 0
-        return longevity, useful
+        return longevity#, useful
 
     def loc(self, v):
         if v is None: # xxx kludgy
@@ -1451,12 +1439,12 @@
         tmpreg = X86RegisterManager.all_regs[0]
         tmpvar = TempBox()
         self.rm.force_allocate_reg(tmpvar, selected_reg=tmpreg)
-        self.rm.possibly_free_var(tmpvar)
+        self.rm.possibly_free_var(tmpvar, _hint_dont_reuse_quickly=True)
         #
         xmmtmp = X86XMMRegisterManager.all_regs[0]
         tmpvar = TempBox()
         self.xrm.force_allocate_reg(tmpvar, selected_reg=xmmtmp)
-        self.xrm.possibly_free_var(tmpvar)
+        self.xrm.possibly_free_var(tmpvar, _hint_dont_reuse_quickly=True)
         #
         # we need to make sure that no variable is stored in ebp
         for arg in inputargs:
diff --git a/pypy/jit/backend/x86/regloc.py b/pypy/jit/backend/x86/regloc.py
--- a/pypy/jit/backend/x86/regloc.py
+++ b/pypy/jit/backend/x86/regloc.py
@@ -44,7 +44,6 @@
     _location_code = 'b'
 
     def __init__(self, position, ebp_offset, num_words, type):
-        assert ebp_offset < 0   # so no confusion with RegLoc.value
         self.position = position
         self.value = ebp_offset
         self.width = num_words * WORD
diff --git a/pypy/jit/backend/x86/runner.py b/pypy/jit/backend/x86/runner.py
--- a/pypy/jit/backend/x86/runner.py
+++ b/pypy/jit/backend/x86/runner.py
@@ -117,7 +117,7 @@
         FUNCPTR = lltype.Ptr(lltype.FuncType(ARGS, lltype.Signed))
         #
         def execute_token(executable_token, *args):
-            addr = executable_token._x86_direct_bootstrap_code
+            addr = executable_token._x86_function_addr
             func = rffi.cast(FUNCPTR, addr)
             #llop.debug_print(lltype.Void, ">>>> Entering", addr)
             prev_interpreter = None   # help flow space
diff --git a/pypy/jit/backend/x86/test/test_runner.py b/pypy/jit/backend/x86/test/test_runner.py
--- a/pypy/jit/backend/x86/test/test_runner.py
+++ b/pypy/jit/backend/x86/test/test_runner.py
@@ -328,9 +328,8 @@
                     inputargs = [i for i in (a, b) if isinstance(i, Box)]
                     looptoken = JitCellToken()
                     self.cpu.compile_loop(inputargs, ops, looptoken)
-                    for i, box in enumerate(inputargs):
-                        self.cpu.set_future_value_int(i, box.value)
-                    self.cpu.execute_token(looptoken)
+                    inputvalues = [box.value for box in inputargs]
+                    self.cpu.execute_token(looptoken, *inputvalues)
                     result = self.cpu.get_latest_value_int(0)
                     expected = execute(self.cpu, None, op, None, a, b).value
                     if guard == rop.GUARD_FALSE:
@@ -396,8 +395,7 @@
         assert address >= loopaddress + loopsize
         assert size >= 10 # randomish number
 
-        self.cpu.set_future_value_int(0, 2)
-        fail = self.cpu.execute_token(looptoken)
+        fail = self.cpu.execute_token(looptoken, 2)
         assert fail.identifier == 2
         res = self.cpu.get_latest_value_int(0)
         assert res == 20
@@ -503,9 +501,7 @@
             looptoken = JitCellToken()
             self.cpu.compile_loop([i1, i2], ops, looptoken)
 
-            self.cpu.set_future_value_int(0, 123450)
-            self.cpu.set_future_value_int(1, 123408)
-            fail = self.cpu.execute_token(looptoken)
+            fail = self.cpu.execute_token(looptoken, 123450, 123408)
             assert fail.identifier == 0
             assert self.cpu.get_latest_value_int(0) == 42
             assert self.cpu.get_latest_value_int(1) == 42
@@ -537,8 +533,7 @@
             self.cpu.assembler.set_debug(True)
             looptoken = JitCellToken()
             self.cpu.compile_loop(ops.inputargs, ops.operations, looptoken)
-            self.cpu.set_future_value_int(0, 0)
-            self.cpu.execute_token(looptoken)
+            self.cpu.execute_token(looptoken, 0)
             # check debugging info
             struct = self.cpu.assembler.loop_run_counters[0]
             assert struct.i == 10
@@ -561,7 +556,6 @@
         self.cpu.assembler.set_debug(True)
         looptoken = JitCellToken()
         self.cpu.compile_loop(ops.inputargs, ops.operations, looptoken)
-        self.cpu.set_future_value_int(0, 0)
-        self.cpu.execute_token(looptoken)
+        self.cpu.execute_token(looptoken, 0)
         assert looptoken._x86_debug_checksum == sum([op.getopnum()
                                                      for op in ops.operations])


More information about the pypy-commit mailing list