[pypy-commit] pypy vecopt: float32/64 addition of vectors now uses packed vector load x86
plan_rich
noreply at buildbot.pypy.org
Thu May 14 11:34:25 CEST 2015
Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r77319:dcbabaa3d2d9
Date: 2015-05-14 11:21 +0200
http://bitbucket.org/pypy/pypy/changeset/dcbabaa3d2d9/
Log: float32/64 addition of vectors now uses packed vector load x86 +
constant/variable expansion
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -18,6 +18,9 @@
if not self.CPUClass.vector_extension:
py.test.skip("needs vector extension to run (for now)")
+ def assert_float_equal(self, f1, f2, delta=0.0001):
+ assert abs(f1-f2) < delta
+
def setup_class(cls):
default = """
a = [1,2,3,4]
@@ -55,12 +58,19 @@
w_res = i.getitem(s)
if isinstance(w_res, boxes.W_Float64Box):
return w_res.value
+ if isinstance(w_res, boxes.W_Float32Box):
+ return float(w_res.value)
elif isinstance(w_res, boxes.W_Int64Box):
return float(w_res.value)
+ elif isinstance(w_res, boxes.W_Int32Box):
+ return float(int(w_res.value))
+ elif isinstance(w_res, boxes.W_Int16Box):
+ return float(int(w_res.value))
elif isinstance(w_res, boxes.W_LongBox):
return float(w_res.value)
elif isinstance(w_res, boxes.W_BoolBox):
return float(w_res.value)
+ print "ERROR: did not implement return type for interpreter"
raise TypeError(w_res)
if self.graph is None:
@@ -80,51 +90,44 @@
retval = self.interp.eval_graph(self.graph, [i])
return retval
- def define_add_float():
+ def define_float32_add():
return """
a = |30|
b = a + a
- b -> 3
+ b -> 15
"""
+ def test_float32_add(self):
+ result = self.run("float32_add")
+ self.assert_float_equal(result, 15.0 + 15.0)
- def define_add_float32():
+ def define_float_add():
return """
a = astype(|30|, float32)
b = a + a
- b -> 3
+ b -> 17
"""
+ def test_float_add(self):
+ result = self.run("float_add")
+ self.assert_float_equal(result, 17.0 + 17.0)
- def test_add_float(self):
- result = self.run("add_float")
- assert result == 3 + 3
+ def define_float32_add_const():
+ return """
+ a = astype(|30|, float32)
+ b = a + 77.345
+ b -> 29
+ """
+ def test_float32_add_const(self):
+ result = self.run("float32_add_const")
+ self.assert_float_equal(result, 29.0 + 77.345)
- def test_add_float32(self):
- result = self.run("add_float32")
- assert result == 3.0 + 3.0
-
- def define_add_float32_const():
+ def define_float_add_const():
return """
- a = astype(|30|, float32) + 3.0
+ a = |30| + 25.5
a -> 29
"""
-
- def define_add_float_const():
- return """
- a = astype(|30|, float32) + 3.0
- a -> 29
- """
-
- def test_add_float_const(self):
- result = self.run("add_float_const")
- assert result == 29.0 + 3.0
- self.check_trace_count(1)
- def test_add_float22_const(self):
- result = self.run("add_float_const")
- assert result == 29.0 + 3.0
- self.check_trace_count(1)
- result = self.run("add_float32_const")
- assert result == 29.0 + 3.0
- self.check_trace_count(1)
+ def test_float_add_const(self):
+ result = self.run("float_add_const")
+ self.assert_float_equal(result, 29.0 + 25.5)
def define_pow():
return """
diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -148,6 +148,7 @@
class ArrayDescr(AbstractDescr):
def __init__(self, A):
self.A = self.OUTERA = A
+ self.concrete_type = '\x00'
if isinstance(A, lltype.Struct):
self.A = A._flds[A._arrayfld]
diff --git a/rpython/jit/backend/llsupport/descr.py b/rpython/jit/backend/llsupport/descr.py
--- a/rpython/jit/backend/llsupport/descr.py
+++ b/rpython/jit/backend/llsupport/descr.py
@@ -2,7 +2,7 @@
from rpython.rtyper.lltypesystem import lltype, rffi, llmemory
from rpython.rtyper.lltypesystem.lloperation import llop
from rpython.jit.backend.llsupport import symbolic, support
-from rpython.jit.metainterp.history import AbstractDescr, getkind
+from rpython.jit.metainterp.history import AbstractDescr, getkind, FLOAT, INT
from rpython.jit.metainterp import history
from rpython.jit.codewriter import heaptracker, longlong
from rpython.jit.codewriter.longlong import is_longlong
@@ -192,7 +192,7 @@
lendescr = None
flag = '\x00'
vinfo = None
- loaded_float = False
+ concrete_type = '\x00'
def __init__(self, basesize, itemsize, lendescr, flag):
self.basesize = basesize
@@ -261,10 +261,11 @@
lendescr = get_field_arraylen_descr(gccache, ARRAY_OR_STRUCT)
flag = get_type_flag(ARRAY_INSIDE.OF)
arraydescr = ArrayDescr(basesize, itemsize, lendescr, flag)
- if ARRAY_INSIDE.OF is lltype.SingleFloat:
- # it would be optimal to set the flag as FLOAT_TYPE
- # but it is not possible???
- arraydescr.loaded_float = True
+ if ARRAY_INSIDE.OF is lltype.SingleFloat or \
+ ARRAY_INSIDE.OF is lltype.Float:
+ # it would be better to set the flag to FLOAT_TYPE for single
+ # floats as well, but doing so leads to problems
+ arraydescr.concrete_type = FLOAT
if ARRAY_OR_STRUCT._gckind == 'gc':
gccache.init_array_descr(ARRAY_OR_STRUCT, arraydescr)
cache[ARRAY_OR_STRUCT] = arraydescr
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -2474,9 +2474,9 @@
self.mc.MOVDQU(resloc, src_addr)
else:
if itemsize == 4:
- self.mc.MOVSS(resloc, src_addr)
+ self.mc.MOVUPS(resloc, src_addr)
elif itemsize == 8:
- self.mc.MOVSD(resloc, src_addr)
+ self.mc.MOVUPD(resloc, src_addr)
def genop_discard_vec_setarrayitem_raw(self, op, arglocs):
# considers item scale (raw_store does not)
@@ -2500,9 +2500,9 @@
self.mc.MOVDQU(dest_loc, value_loc)
else:
if itemsize == 4:
- self.mc.MOVSS(dest_loc, value_loc)
+ self.mc.MOVUPS(dest_loc, value_loc)
elif itemsize == 8:
- self.mc.MOVSD(dest_loc, value_loc)
+ self.mc.MOVUPD(dest_loc, value_loc)
def genop_vec_int_add(self, op, arglocs, resloc):
loc0, loc1, itemsize_loc = arglocs
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -645,6 +645,8 @@
MOVAPD = _binaryop('MOVAPD')
MOVDQA = _binaryop('MOVDQA')
MOVDQU = _binaryop('MOVDQU')
+ MOVUPS = _binaryop('MOVUPS')
+ MOVUPD = _binaryop('MOVUPD')
ADDSD = _binaryop('ADDSD')
ADDPD = _binaryop('ADDPD')
SUBSD = _binaryop('SUBSD')
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -726,9 +726,6 @@
MOVD32_xs = xmminsn('\x66', rex_nw, '\x0F\x6E', register(1, 8), stack_sp(2))
PSRAD_xi = xmminsn('\x66', rex_nw, '\x0F\x72', register(1), '\xE0', immediate(2, 'b'))
- MOVUPS_mx = xmminsn(rex_nw, '\x0F\x11', register(2, 8), mem_reg_plus_const(1))
- MOVUPS_jx = xmminsn(rex_nw, '\x0F\x11', register(2, 8), abs_(1))
- MOVUPS_ax = xmminsn(rex_nw, '\x0F\x11', register(2, 8), mem_reg_plus_scaled_reg_plus_const(1))
MOVSS_xx = xmminsn('\xF3', rex_nw, '\x0F\x10', register(1,8), register(2), '\xC0')
@@ -906,14 +903,14 @@
define_modrm_modes('MOVAPD_*x', ['\x66', rex_nw, '\x0F\x29', register(2,8)],
regtype='XMM')
-define_modrm_modes('MOVDQA_x*', ['\x66', rex_nw, '\x0F\x6F', register(1, 8)],
- regtype='XMM')
-define_modrm_modes('MOVDQA_*x', ['\x66', rex_nw, '\x0F\x7F', register(2, 8)],
- regtype='XMM')
-define_modrm_modes('MOVDQU_x*', ['\xF3', rex_nw, '\x0F\x6F', register(1, 8)],
- regtype='XMM')
-define_modrm_modes('MOVDQU_*x', ['\xF3', rex_nw, '\x0F\x7F', register(2, 8)],
- regtype='XMM')
+define_modrm_modes('MOVDQA_x*', ['\x66', rex_nw, '\x0F\x6F', register(1, 8)], regtype='XMM')
+define_modrm_modes('MOVDQA_*x', ['\x66', rex_nw, '\x0F\x7F', register(2, 8)], regtype='XMM')
+define_modrm_modes('MOVDQU_x*', ['\xF3', rex_nw, '\x0F\x6F', register(1, 8)], regtype='XMM')
+define_modrm_modes('MOVDQU_*x', ['\xF3', rex_nw, '\x0F\x7F', register(2, 8)], regtype='XMM')
+define_modrm_modes('MOVUPS_x*', [ rex_nw, '\x0F\x10', register(1, 8)], regtype='XMM')
+define_modrm_modes('MOVUPS_*x', [ rex_nw, '\x0F\x11', register(2, 8)], regtype='XMM')
+define_modrm_modes('MOVUPD_x*', ['\x66', rex_nw, '\x0F\x10', register(1, 8)], regtype='XMM')
+define_modrm_modes('MOVUPD_*x', ['\x66', rex_nw, '\x0F\x11', register(2, 8)], regtype='XMM')
define_modrm_modes('SQRTSD_x*', ['\xF2', rex_nw, '\x0F\x51', register(1,8)], regtype='XMM')
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1192,6 +1192,7 @@
v224 = vec_float_add(v219, v222, 2)
v225 = vec_cast_float_to_singlefloat(v223, 2)
v226 = vec_cast_float_to_singlefloat(v224, 2)
+ vec_box_pack(v225, v226, 2, 2)
vec_raw_store(p2, i4, v225, 4, descr=singlefloatarraydescr)
jump(p0, p1, p2, i210, i189)
"""
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -19,12 +19,6 @@
def __str__(self):
return 'NotAVectorizeableLoop()'
-def dprint(*args):
- if not we_are_translated():
- for arg in args:
- print arg,
- print
-
def debug_print_operations(loop):
if not we_are_translated():
print('--- loop instr numbered ---')
@@ -48,14 +42,13 @@
inline_short_preamble, start_state, False)
orig_ops = loop.operations
try:
- debug_print_operations(loop)
+ jitdriver_sd.profiler.count(Counters.OPT_VECTORIZE_TRY)
opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, optimizations)
opt.propagate_all_forward()
- debug_print_operations(loop)
+ jitdriver_sd.profiler.count(Counters.OPT_VECTORIZED)
except NotAVectorizeableLoop:
+ # vectorization is not possible, propagate only normal optimizations
loop.operations = orig_ops
- # vectorization is not possible, propagate only normal optimizations
- pass
class VectorizingOptimizer(Optimizer):
""" Try to unroll the loop and find instructions to group """
@@ -371,7 +364,6 @@
i += 1
if len_before == len(self.packset.packs):
break
- print self.packset.packs
def schedule(self):
self.guard_early_exit = -1
@@ -393,37 +385,28 @@
self.clear_newoperations()
def unpack_from_vector(self, op, sched_data):
- box_to_vbox = sched_data.box_to_vbox
+ args = op.getarglist()
for i, arg in enumerate(op.getarglist()):
if isinstance(arg, Box):
- arg = sched_data.unpack_rename(arg)
- op.setarg(i, arg)
- (j, vbox) = box_to_vbox.get(arg, (-1, None))
- if vbox:
- arg_cloned = arg.clonebox()
- cj = ConstInt(j)
- ci = ConstInt(1)
- unpack_op = ResOperation(rop.VEC_BOX_UNPACK, [vbox, cj, ci], arg_cloned)
- self.emit_operation(unpack_op)
- sched_data.rename_unpacked(arg, arg_cloned)
- op.setarg(i, arg_cloned)
+ self._unpack_from_vector(args, i, arg, sched_data)
if op.is_guard():
fail_args = op.getfailargs()
for i, arg in enumerate(fail_args):
if arg and isinstance(arg, Box):
- arg = sched_data.unpack_rename(arg)
- fail_args[i] = arg
- (j, vbox) = box_to_vbox.get(arg, (-1, None))
- if vbox:
- arg_cloned = arg.clonebox()
- cj = ConstInt(j)
- ci = ConstInt(vbox.item_count)
- unpack_op = ResOperation(rop.VEC_BOX_UNPACK, [vbox, cj, ci], arg_cloned)
- self.emit_operation(unpack_op)
- sched_data.rename_unpacked(arg, arg_cloned)
- fail_args[i] = arg_cloned
+ self._unpack_from_vector(fail_args, i, arg, sched_data)
-
+ def _unpack_from_vector(self, args, i, arg, sched_data):
+ arg = sched_data.unpack_rename(arg)
+ args[i] = arg
+ (j, vbox) = sched_data.box_to_vbox.get(arg, (-1, None))
+ if vbox:
+ arg_cloned = arg.clonebox()
+ cj = ConstInt(j)
+ ci = ConstInt(1)
+ unpack_op = ResOperation(rop.VEC_BOX_UNPACK, [vbox, cj, ci], arg_cloned)
+ self.emit_operation(unpack_op)
+ sched_data.rename_unpacked(arg, arg_cloned)
+ args[i] = arg_cloned
def analyse_index_calculations(self):
if len(self.loop.operations) <= 1 or self.early_exit_idx == -1:
@@ -517,15 +500,6 @@
self.loop.operations = self._newoperations[:]
- def check_adjacent_at_runtime(self, mem_a, mem_b):
- ivar_a = mem_a.index_var
- ivar_b = mem_b.index_var
- if ivar_a.mods:
- print "guard(", ivar_a.mods[1], " is adjacent)"
- if ivar_b.mods:
- print "guard(", ivar_b.mods[1], " is adjacent)"
- pass
-
def must_unpack_result_to_exec(op, target_op):
# TODO either move to resop or util
if op.getoperation().vector != -1:
@@ -575,11 +549,14 @@
@staticmethod
def by_descr(descr):
_t = INT
- if descr.is_array_of_floats() or descr.loaded_float:
+ if descr.is_array_of_floats() or descr.concrete_type == FLOAT:
_t = FLOAT
pt = PackType(_t, descr.get_item_size_in_bytes(), descr.is_item_signed())
return pt
+ def is_valid(self):
+ return self.type != PackType.UNKNOWN_TYPE and self.size > 0
+
def record_vbox(self, vbox):
if self.type == PackType.UNKNOWN_TYPE:
self.type = vbox.type
@@ -657,9 +634,8 @@
self.pack = pack
# properties that hold for the pack are:
# isomorphism (see func above)
-
if pack.ptype is None:
- self.propagete_ptype()
+ self.propagate_ptype()
self.preamble_ops = []
if pack.is_overloaded(self.vec_reg_size):
@@ -699,7 +675,7 @@
self.preamble_ops.append(vop)
- def propagete_ptype(self):
+ def propagate_ptype(self):
op0 = self.pack.operations[0].getoperation()
packargs = ROP_ARG_RES_VECTOR.get(op0.vector, None)
if packargs is None:
@@ -708,22 +684,16 @@
ptype = packargs.getpacktype()
for i,arg in enumerate(args):
if packargs.vector_arg(i):
- vbox = self.get_vbox_for(arg)
+ _, vbox = self.box_to_vbox.get(arg, (-1, None))
if vbox is not None:
ptype.record_vbox(vbox)
else:
- ptype.size = arg
- raise NotImplementedError
+ # vbox of a variable/constant is not present here
+ pass
+ if not we_are_translated():
+ assert ptype.is_valid()
self.pack.ptype = ptype
-
- def get_vbox_for(self, arg):
- try:
- _, vbox = self.box_to_vbox[arg]
- return vbox
- except KeyError:
- return None
-
def vector_result(self, vop, packargs):
ops = self.pack.operations
result = vop.result
@@ -743,11 +713,12 @@
i += 1
def box_vector(self, ptype):
+ """ TODO remove this? """
return BoxVector(ptype.type, self.pack_ops, ptype.size, ptype.signed)
def vector_arg(self, vop, argidx, expand):
ops = self.pack.operations
- vbox = self.get_vbox_for(vop.getarg(argidx))
+ _, vbox = self.box_to_vbox.get(vop.getarg(argidx), (-1, None))
if not vbox:
if expand:
vbox = self.expand_box_to_vector_box(vop, argidx)
@@ -759,24 +730,31 @@
packed = vbox.item_count
if packed < packable:
args = [op.getoperation().getarg(argidx) for op in ops]
- self.package(vbox, packed, args)
+ self.package(vbox, packed, args, packable)
vop.setarg(argidx, vbox)
return vbox
- def package(self, tgt_box, index, args):
+ def package(self, tgt_box, index, args, packable):
+ """ If there are two vector boxes:
+ v1 = [<empty>,<empty>,X,Y]
+ v2 = [A,B,<empty>,<empty>]
+ this function creates a box pack instruction to merge them to:
+ v1/2 = [A,B,X,Y]
+ """
arg_count = len(args)
i = index
- while i < arg_count:
+ while i < arg_count and tgt_box.item_count < packable:
arg = args[i]
pos, src_box = self.box_to_vbox.get(arg, (-1, None))
- if pos != 0:
+ if pos == -1:
i += 1
continue
op = ResOperation(rop.VEC_BOX_PACK,
[tgt_box, src_box, ConstInt(i),
ConstInt(src_box.item_count)], None)
self.preamble_ops.append(op)
- i += 1
+ tgt_box.item_count += src_box.item_count
+ i += src_box.item_count
def expand_box_to_vector_box(self, vop, argidx):
arg = vop.getarg(argidx)
@@ -792,7 +770,6 @@
i += 1
vbox = BoxVector(arg.type, self.pack_ops)
- print "creating vectorbox", vbox, "of type", arg.type
if all_same_box:
expand_op = ResOperation(rop.VEC_EXPAND, [arg, ConstInt(self.pack_ops)], vbox)
self.preamble_ops.append(expand_op)
diff --git a/rpython/rlib/jit.py b/rpython/rlib/jit.py
--- a/rpython/rlib/jit.py
+++ b/rpython/rlib/jit.py
@@ -1147,6 +1147,8 @@
OPT_OPS
OPT_GUARDS
OPT_FORCINGS
+ OPT_VECTORIZE_TRY
+ OPT_VECTORIZED
ABORT_TOO_LONG
ABORT_BRIDGE
ABORT_BAD_LOOP
More information about the pypy-commit
mailing list