[pypy-commit] pypy default: implement malloc_cond_varsize, malloc_slowpath_varsize for the different types and enable can_inline_varsize_malloc on ARM
bivab
noreply at buildbot.pypy.org
Sat Apr 20 08:42:08 CEST 2013
Author: David Schneider <david.schneider at picle.org>
Branch:
Changeset: r63517:d6520ed985b4
Date: 2013-04-20 00:49 +0200
http://bitbucket.org/pypy/pypy/changeset/d6520ed985b4/
Log: implement malloc_cond_varsize, malloc_slowpath_varsize for the
different types and enable can_inline_varsize_malloc on ARM
diff --git a/rpython/jit/backend/arm/assembler.py b/rpython/jit/backend/arm/assembler.py
--- a/rpython/jit/backend/arm/assembler.py
+++ b/rpython/jit/backend/arm/assembler.py
@@ -3,6 +3,7 @@
import os
from rpython.jit.backend.arm import conditions as c, registers as r
+from rpython.jit.backend.arm import shift
from rpython.jit.backend.arm.arch import (WORD, DOUBLE_WORD, FUNC_ALIGN,
JITFRAME_FIXED_SIZE)
from rpython.jit.backend.arm.codebuilder import InstrBuilder, OverwritingBuilder
@@ -12,8 +13,9 @@
CoreRegisterManager, check_imm_arg, VFPRegisterManager,
operations as regalloc_operations,
operations_with_guard as regalloc_operations_with_guard)
-from rpython.jit.backend.llsupport import jitframe
+from rpython.jit.backend.llsupport import jitframe, rewrite
from rpython.jit.backend.llsupport.assembler import DEBUG_COUNTER, debug_bridge, BaseAssembler
+from rpython.jit.backend.llsupport.regalloc import get_scale, valid_addressing_size
from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from rpython.jit.backend.model import CompiledLoopToken
from rpython.jit.codewriter.effectinfo import EffectInfo
@@ -250,27 +252,59 @@
self.wb_slowpath[withcards + 2 * withfloats] = rawstart
def _build_malloc_slowpath(self, kind):
- if kind != 'fixed':
- return 0
+ """ While arriving on slowpath, we have a gcpattern on stack 0.
+ The arguments are passed in r0 and r1, as follows:
+
+ kind == 'fixed': nursery_head in r0 and the size in r1 - r0.
+
+ kind == 'str/unicode': length of the string to allocate in r1.
+
+ kind == 'var': length to allocate in r1, tid in r0,
+ and itemsize on the stack.
+
+ This function must preserve all registers apart from r0 and r1.
+ """
+ assert kind in ['fixed', 'str', 'unicode', 'var']
mc = InstrBuilder(self.cpu.arch_version)
+ #
self._push_all_regs_to_jitframe(mc, [r.r0, r.r1], self.cpu.supports_floats)
+ #
+ if kind == 'fixed':
+ addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
+ elif kind == 'str':
+ addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_str')
+ elif kind == 'unicode':
+ addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_unicode')
+ else:
+ addr = self.cpu.gc_ll_descr.get_malloc_slowpath_array_addr()
+ if kind == 'fixed':
+ # stack layout: [gcmap]
+ # At this point we know that the values we need to compute the size
+ # are stored in r0 and r1.
+ mc.SUB_rr(r.r0.value, r.r1.value, r.r0.value) # compute the size we want
+
+ if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
+ mc.MOV_rr(r.r1.value, r.fp.value)
+ elif kind == 'str' or kind == 'unicode':
+ # stack layout: [gcmap]
+ mc.MOV_rr(r.r0.value, r.r1.value)
+ else: # var
+ # stack layout: [gcmap][itemsize]...
+ # tid is in r0
+ # length is in r1
+ mc.MOV_rr(r.r2.value, r.r1.value)
+ mc.MOV_rr(r.r1.value, r.r0.value)
+ mc.POP([r.r0.value]) # load itemsize
+ # store the gc pattern
+ mc.POP([r.r4.value])
ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
- # store the gc pattern
- mc.POP([r.r2.value])
- self.store_reg(mc, r.r2, r.fp, ofs)
+ self.store_reg(mc, r.r4, r.fp, ofs)
+ #
# We need to push two registers here because we are going to make a
# call and therefore the stack needs to be 8-byte aligned
mc.PUSH([r.ip.value, r.lr.value])
- # At this point we know that the values we need to compute the size
- # are stored in r0 and r1.
- mc.SUB_rr(r.r0.value, r.r1.value, r.r0.value) # compute the size we want
-
- if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
- mc.MOV_rr(r.r1.value, r.fp.value)
-
- addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
+ #
mc.BL(addr)
-
#
# If the slowpath malloc failed, we raise a MemoryError that
# always interrupts the current loop, as a "good enough"
@@ -290,8 +324,9 @@
# return
mc.POP([r.ip.value, r.pc.value])
+ #
rawstart = mc.materialize(self.cpu.asmmemmgr, [])
- self.malloc_slowpath = rawstart
+ return rawstart
def _reload_frame_if_necessary(self, mc):
gcrootmap = self.cpu.gc_ll_descr.gcrootmap
@@ -1170,21 +1205,17 @@
else:
raise AssertionError('Trying to pop to an invalid location')
- def malloc_cond(self, nursery_free_adr, nursery_top_adr, sizeloc, gcmap):
- if sizeloc.is_imm(): # must be correctly aligned
- assert sizeloc.value & (WORD-1) == 0
+ def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap):
+ assert size & (WORD-1) == 0
self.mc.gen_load_int(r.r0.value, nursery_free_adr)
self.mc.LDR_ri(r.r0.value, r.r0.value)
- if sizeloc.is_imm():
- if check_imm_arg(sizeloc.value):
- self.mc.ADD_ri(r.r1.value, r.r0.value, sizeloc.value)
- else:
- self.mc.gen_load_int(r.r1.value, sizeloc.value)
- self.mc.ADD_rr(r.r1.value, r.r0.value, r.r1.value)
+ if check_imm_arg(size):
+ self.mc.ADD_ri(r.r1.value, r.r0.value, size)
else:
- self.mc.ADD_rr(r.r1.value, r.r0.value, sizeloc.value)
+ self.mc.gen_load_int(r.r1.value, size)
+ self.mc.ADD_rr(r.r1.value, r.r0.value, r.r1.value)
self.mc.gen_load_int(r.ip.value, nursery_top_adr)
self.mc.LDR_ri(r.ip.value, r.ip.value)
@@ -1205,6 +1236,128 @@
self.mc.gen_load_int(r.ip.value, nursery_free_adr)
self.mc.STR_ri(r.r1.value, r.ip.value)
+ def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr,
+ sizeloc, gcmap):
+ if sizeloc is r.r0:
+ self.mc.MOV_rr(r.r1.value, r.r0.value)
+ sizeloc = r.r1
+ self.mc.gen_load_int(r.r0.value, nursery_free_adr)
+ self.mc.LDR_ri(r.r0.value, r.r0.value)
+ #
+ self.mc.ADD_rr(r.r1.value, r.r0.value, sizeloc.value)
+ #
+ self.mc.gen_load_int(r.ip.value, nursery_top_adr)
+ self.mc.LDR_ri(r.ip.value, r.ip.value)
+
+ self.mc.CMP_rr(r.r1.value, r.ip.value)
+ #
+ self.push_gcmap(self.mc, gcmap, push=True, cond=c.HI)
+
+ self.mc.BL(self.malloc_slowpath, c=c.HI)
+
+ self.mc.gen_load_int(r.ip.value, nursery_free_adr)
+ self.mc.STR_ri(r.r1.value, r.ip.value)
+
+ def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr,
+ lengthloc, itemsize, maxlength, gcmap,
+ arraydescr):
+ from rpython.jit.backend.llsupport.descr import ArrayDescr
+ assert isinstance(arraydescr, ArrayDescr)
+
+ # lengthloc is the length of the array, which we must not modify!
+ assert lengthloc is not r.r0 and lengthloc is not r.r1
+ if lengthloc.is_reg():
+ varsizeloc = lengthloc
+ else:
+ assert lengthloc.is_stack()
+ self.regalloc_mov(lengthloc, r.r1)
+ varsizeloc = r.r1
+ #
+ if check_imm_arg(maxlength):
+ self.mc.CMP_ri(varsizeloc.value, maxlength)
+ else:
+ self.mc.gen_load_int(r.ip.value, maxlength)
+ self.mc.CMP_rr(varsizeloc.value, r.ip.value)
+ jmp_adr0 = self.mc.currpos() # jump to (large)
+ self.mc.BKPT()
+ #
+ self.mc.gen_load_int(r.r0.value, nursery_free_adr)
+ self.mc.LDR_ri(r.r0.value, r.r0.value)
+
+
+ if valid_addressing_size(itemsize):
+ shiftsize = get_scale(itemsize)
+ else:
+ shiftsize = self._mul_const_scaled(self.mc, r.lr, varsizeloc,
+ itemsize)
+ varsizeloc = r.lr
+ # now varsizeloc is a register != r0. The size of
+ # the variable part of the array is (varsizeloc << shiftsize)
+ assert arraydescr.basesize >= self.gc_minimal_size_in_nursery
+ constsize = arraydescr.basesize + self.gc_size_of_header
+ force_realignment = (itemsize % WORD) != 0
+ if force_realignment:
+ constsize += WORD - 1
+ self.mc.gen_load_int(r.ip.value, constsize)
+ # constsize + (varsizeloc << shiftsize)
+ self.mc.ADD_rr(r.r1.value, r.ip.value, varsizeloc.value,
+ imm=shiftsize, shifttype=shift.LSL)
+ self.mc.ADD_rr(r.r1.value, r.r1.value, r.r0.value)
+ if force_realignment:
+ self.mc.MVN_ri(r.ip.value, imm=(WORD - 1))
+ self.mc.AND_rr(r.r1.value, r.r1.value, r.ip.value)
+ # now r1 contains the total size in bytes, rounded up to a multiple
+ # of WORD, plus nursery_free_adr
+ #
+ self.mc.gen_load_int(r.ip.value, nursery_top_adr)
+ self.mc.LDR_ri(r.ip.value, r.ip.value)
+
+ self.mc.CMP_rr(r.r1.value, r.ip.value)
+ jmp_adr1 = self.mc.currpos() # jump to (after-call)
+ self.mc.BKPT()
+ #
+ # (large)
+ currpos = self.mc.currpos()
+ pmc = OverwritingBuilder(self.mc, jmp_adr0, WORD)
+ pmc.B_offs(currpos, c.GT)
+ #
+ # save the gcmap
+ self.push_gcmap(self.mc, gcmap, push=True)
+ #
+ if kind == rewrite.FLAG_ARRAY:
+ self.mc.gen_load_int(r.r0.value, arraydescr.tid)
+ self.regalloc_mov(lengthloc, r.r1)
+ self.regalloc_push(imm(itemsize))
+ addr = self.malloc_slowpath_varsize
+ else:
+ if kind == rewrite.FLAG_STR:
+ addr = self.malloc_slowpath_str
+ else:
+ assert kind == rewrite.FLAG_UNICODE
+ addr = self.malloc_slowpath_unicode
+ self.regalloc_mov(lengthloc, r.r1)
+ self.mc.BL(addr)
+ #
+ jmp_location = self.mc.currpos() # jump to (done)
+ self.mc.BKPT()
+ # (after-call)
+ currpos = self.mc.currpos()
+ pmc = OverwritingBuilder(self.mc, jmp_adr1, WORD)
+ pmc.B_offs(currpos, c.LS)
+ #
+ # write down the tid, but not if it's the result of the CALL
+ self.mc.gen_load_int(r.ip.value, arraydescr.tid)
+ self.mc.STR_ri(r.ip.value, r.r0.value)
+
+ # while we're at it, this line is not needed if we've done the CALL
+ self.mc.gen_load_int(r.ip.value, nursery_free_adr)
+ self.mc.STR_ri(r.r1.value, r.ip.value)
+ # (done)
+ # skip instructions after call
+ currpos = self.mc.currpos()
+ pmc = OverwritingBuilder(self.mc, jmp_location, WORD)
+ pmc.B_offs(currpos)
+
def push_gcmap(self, mc, gcmap, push=False, store=False, cond=c.AL):
ptr = rffi.cast(lltype.Signed, gcmap)
if push:
@@ -1222,6 +1375,32 @@
mc.gen_load_int(r.ip.value, 0)
self.store_reg(mc, r.ip, r.fp, ofs)
+ def _mul_const_scaled(self, mc, targetreg, sourcereg, itemsize):
+ """Produce one operation to do roughly
+ targetreg = sourcereg * itemsize
+ except that the targetreg may still need shifting by 0,1,2,3.
+ """
+ if (itemsize & 7) == 0:
+ shiftsize = 3
+ elif (itemsize & 3) == 0:
+ shiftsize = 2
+ elif (itemsize & 1) == 0:
+ shiftsize = 1
+ else:
+ shiftsize = 0
+ itemsize >>= shiftsize
+ #
+ if valid_addressing_size(itemsize - 1):
+ self.mc.ADD_rr(targetreg.value, sourcereg.value, sourcereg.value,
+ imm=get_scale(itemsize - 1), shifttype=shift.LSL)
+ elif valid_addressing_size(itemsize):
+ self.mc.LSL_ri(targetreg.value, sourcereg.value,
+ get_scale(itemsize))
+ else:
+ mc.gen_load_int(targetreg.value, itemsize.value)
+ mc.MUL(targetreg.value, sourcereg.value, targetreg.value)
+ #
+ return shiftsize
def not_implemented(msg):
os.write(2, '[ARM/asm] %s\n' % msg)
diff --git a/rpython/jit/backend/arm/opassembler.py b/rpython/jit/backend/arm/opassembler.py
--- a/rpython/jit/backend/arm/opassembler.py
+++ b/rpython/jit/backend/arm/opassembler.py
@@ -191,7 +191,7 @@
return fcond
def _emit_guard(self, op, arglocs, fcond, save_exc,
- is_guard_not_invalidated=False,
+ is_guard_not_invalidated=False,
is_guard_not_forced=False):
assert isinstance(save_exc, bool)
assert isinstance(fcond, int)
@@ -297,7 +297,7 @@
return self._emit_guard(op, locs, fcond, save_exc=False,
is_guard_not_invalidated=True)
- def emit_op_label(self, op, arglocs, regalloc, fcond):
+ def emit_op_label(self, op, arglocs, regalloc, fcond):
self._check_frame_depth_debug(self.mc)
return fcond
@@ -1334,23 +1334,6 @@
self._alignment_check()
return fcond
- def emit_op_call_malloc_nursery(self, op, arglocs, regalloc, fcond):
- # registers r0 and r1 are allocated for this call
- assert len(arglocs) == 1
- sizeloc = arglocs[0]
- gc_ll_descr = self.cpu.gc_ll_descr
- gcmap = regalloc.get_gcmap([r.r0, r.r1])
- self.malloc_cond(
- gc_ll_descr.get_nursery_free_addr(),
- gc_ll_descr.get_nursery_top_addr(),
- sizeloc,
- gcmap
- )
- self._alignment_check()
- return fcond
- emit_op_call_malloc_nursery_varsize_frame = emit_op_call_malloc_nursery
-
-
def _alignment_check(self):
if not self.debug:
return
@@ -1436,7 +1419,7 @@
self.mc.VCVT_f64_f32(r.vfp_ip.value, arg.value)
self.mc.VMOV_rc(res.value, r.ip.value, r.vfp_ip.value)
return fcond
-
+
def emit_op_cast_singlefloat_to_float(self, op, arglocs, regalloc, fcond):
arg, res = arglocs
assert res.is_vfp_reg()
diff --git a/rpython/jit/backend/arm/regalloc.py b/rpython/jit/backend/arm/regalloc.py
--- a/rpython/jit/backend/arm/regalloc.py
+++ b/rpython/jit/backend/arm/regalloc.py
@@ -2,7 +2,8 @@
from rpython.rlib import rgc
from rpython.rlib.debug import debug_print, debug_start, debug_stop
from rpython.jit.backend.llsupport.regalloc import FrameManager, \
- RegisterManager, TempBox, compute_vars_longevity, BaseRegalloc
+ RegisterManager, TempBox, compute_vars_longevity, BaseRegalloc, \
+ get_scale
from rpython.jit.backend.arm import registers as r
from rpython.jit.backend.arm import locations
from rpython.jit.backend.arm.locations import imm, get_fp_offset
@@ -1011,20 +1012,72 @@
self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
t = TempInt()
self.rm.force_allocate_reg(t, selected_reg=r.r1)
+
+ sizeloc = size_box.getint()
+ gc_ll_descr = self.cpu.gc_ll_descr
+ gcmap = self.get_gcmap([r.r0, r.r1])
self.possibly_free_var(t)
- return [imm(size)]
+ self.assembler.malloc_cond(
+ gc_ll_descr.get_nursery_free_addr(),
+ gc_ll_descr.get_nursery_top_addr(),
+ sizeloc,
+ gcmap
+ )
+ self.assembler._alignment_check()
def prepare_op_call_malloc_nursery_varsize_frame(self, op, fcond):
size_box = op.getarg(0)
- assert isinstance(size_box, BoxInt)
-
+ assert isinstance(size_box, BoxInt) # we cannot have a const here!
+ # sizeloc must be in a register, but we can free it now
+ # (we take care explicitly of conflicts with r0 or r1)
+ sizeloc = self.rm.make_sure_var_in_reg(size_box)
+ self.rm.possibly_free_var(size_box)
+ #
self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
+ #
t = TempInt()
self.rm.force_allocate_reg(t, selected_reg=r.r1)
- argloc = self.make_sure_var_in_reg(size_box,
- forbidden_vars=[op.result, t])
+ #
+ gcmap = self.get_gcmap([r.r0, r.r1])
self.possibly_free_var(t)
- return [argloc]
+ #
+ gc_ll_descr = self.assembler.cpu.gc_ll_descr
+ self.assembler.malloc_cond_varsize_frame(
+ gc_ll_descr.get_nursery_free_addr(),
+ gc_ll_descr.get_nursery_top_addr(),
+ sizeloc,
+ gcmap
+ )
+ self.assembler._alignment_check()
+
+ def prepare_op_call_malloc_nursery_varsize(self, op, fcond):
+ gc_ll_descr = self.assembler.cpu.gc_ll_descr
+ if not hasattr(gc_ll_descr, 'max_size_of_young_obj'):
+ raise Exception("unreachable code")
+ # for boehm, this function should never be called
+ arraydescr = op.getdescr()
+ length_box = op.getarg(2)
+ assert isinstance(length_box, BoxInt) # we cannot have a const here!
+ # the result will be in r0
+ self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
+ # we need r1 as a temporary
+ tmp_box = TempBox()
+ self.rm.force_allocate_reg(tmp_box, selected_reg=r.r1)
+ gcmap = self.get_gcmap([r.r0, r.r1]) # allocate the gcmap *before*
+ self.rm.possibly_free_var(tmp_box)
+ # length_box always survives: it's typically also present in the
+ # next operation that will copy it inside the new array. It's
+ # fine to load it from the stack too, as long as it's != r0, r1.
+ lengthloc = self.rm.loc(length_box)
+ self.rm.possibly_free_var(length_box)
+ #
+ itemsize = op.getarg(1).getint()
+ maxlength = (gc_ll_descr.max_size_of_young_obj - WORD * 2) / itemsize
+ self.assembler.malloc_cond_varsize(
+ op.getarg(0).getint(),
+ gc_ll_descr.get_nursery_free_addr(),
+ gc_ll_descr.get_nursery_top_addr(),
+ lengthloc, itemsize, maxlength, gcmap, arraydescr)
prepare_op_debug_merge_point = void
prepare_op_jit_debug = void
@@ -1211,13 +1264,6 @@
operations_with_guard = [notimplemented_with_guard] * (rop._LAST + 1)
-def get_scale(size):
- scale = 0
- while (1 << scale) < size:
- scale += 1
- assert (1 << scale) == size
- return scale
-
for key, value in rop.__dict__.items():
key = key.lower()
if key.startswith('_'):
diff --git a/rpython/jit/backend/arm/runner.py b/rpython/jit/backend/arm/runner.py
--- a/rpython/jit/backend/arm/runner.py
+++ b/rpython/jit/backend/arm/runner.py
@@ -19,6 +19,7 @@
supports_longlong = False # XXX requires an implementation of
# read_timestamp that works in user mode
supports_singlefloats = not detect_hardfloat()
+ can_inline_varsize_malloc = True
from rpython.jit.backend.arm.arch import JITFRAME_FIXED_SIZE
all_reg_indexes = range(len(all_regs))
More information about the pypy-commit
mailing list