[pypy-commit] pypy vecopt: finished the unpacking for float32/64 and int32/64. added x86 packed mul operations (don't know if we ever can use them for int64)
plan_rich
noreply at buildbot.pypy.org
Tue May 19 16:59:46 CEST 2015
Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r77392:199b27a762f8
Date: 2015-05-19 16:14 +0200
http://bitbucket.org/pypy/pypy/changeset/199b27a762f8/
Log: Finished the unpacking for float32/64 and int32/64. Added x86 packed
mul operations (don't know if we can ever use them for int64). Typed
the vector box arguments (including count); they are now able to
automatically unpack/pack instructions if they are not in place.
Rewrote most of the float unpack/pack (as mentioned earlier), now using
insertps for float32.
diff --git a/pypy/module/micronumpy/compile.py b/pypy/module/micronumpy/compile.py
--- a/pypy/module/micronumpy/compile.py
+++ b/pypy/module/micronumpy/compile.py
@@ -209,7 +209,9 @@
return self.wrap(1)
def mul(self, w_obj1, w_obj2):
- return self.wrap(1)
+ assert isinstance(w_obj1, boxes.W_GenericBox)
+ assert isinstance(w_obj2, boxes.W_GenericBox)
+ return w_obj1.descr_mul(self, w_obj2)
def pow(self, w_obj1, w_obj2, _):
return self.wrap(1)
@@ -324,7 +326,7 @@
return W_TypeObject(w_obj.typedef.name)
def call_function(self, tp, w_dtype, *args):
- if tp is self.w_float
+ if tp is self.w_float:
if isinstance(w_dtype, boxes.W_Float64Box):
return FloatObject(float(w_dtype.value))
if isinstance(w_dtype, boxes.W_Float32Box):
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -200,28 +200,83 @@
return """
a = astype(|30|, int32)
b = a + 1i
+ d = astype(|30|, int32)
+ c = d + 2.0
x1 = b -> 7
x2 = b -> 8
- x3 = b -> 9
- x4 = b -> 10
- r = x1 + x2 + x3 + x4
- r
+ x3 = c -> 11
+ x4 = c -> 12
+ x1 + x2 + x3 + x4
"""
- #return """
- #a = astype(|30|, int32)
- #b = a + 1i
- #c = a + 2.0
- #x1 = b -> 7
- #x2 = b -> 8
- #x3 = c -> 11
- #x4 = c -> 12
- #x1 + x2 + x3 + x4
- #"""
def test_int32_add_const(self):
result = self.run("int32_add_const")
- assert int(result) == 7+1+8+1+9+1+10+1
- self.check_vectorized(1, 1)
+ assert int(result) == 7+1+8+1+11+2+12+2
+ self.check_vectorized(2, 2)
+ def define_int_mul_array():
+ return """
+ a = astype(|30|, int)
+ b = astype(|30|, int)
+ c = a * b
+ x1 = c -> 7
+ x2 = c -> 8
+ x3 = c -> 11
+ x4 = c -> 12
+ x1 + x2 + x3 + x4
+ """
+ def test_int_mul_array(self):
+ py.test.skip("how to multiply quad word integers?")
+ result = self.run("int_mul_array")
+ assert int(result) == 7*7+8*8+11*11+12*12
+ self.check_vectorized(2, 2)
+
+ def define_float_mul_array():
+ return """
+ a = astype(|30|, float)
+ b = astype(|30|, float)
+ c = a * b
+ x1 = c -> 7
+ x2 = c -> 8
+ x3 = c -> 11
+ x4 = c -> 12
+ x1 + x2 + x3 + x4
+ """
+ def test_float_mul_array(self):
+ result = self.run("float_mul_array")
+ assert int(result) == 7*7+8*8+11*11+12*12
+ self.check_vectorized(2, 2)
+
+ def define_int32_mul_array():
+ return """
+ a = astype(|30|, int32)
+ b = astype(|30|, int32)
+ c = a * b
+ x1 = c -> 7
+ x2 = c -> 8
+ x3 = c -> 11
+ x4 = c -> 12
+ x1 + x2 + x3 + x4
+ """
+ def test_int32_mul_array(self):
+ result = self.run("int32_mul_array")
+ assert int(result) == 7*7+8*8+11*11+12*12
+ self.check_vectorized(2, 2)
+
+ def define_float32_mul_array():
+ return """
+ a = astype(|30|, float32)
+ b = astype(|30|, float32)
+ c = a * b
+ x1 = c -> 7
+ x2 = c -> 8
+ x3 = c -> 11
+ x4 = c -> 12
+ x1 + x2 + x3 + x4
+ """
+ def test_float32_mul_array(self):
+ result = self.run("float32_mul_array")
+ assert int(result) == 7*7+8*8+11*11+12*12
+ self.check_vectorized(2, 2)
def define_pow():
return """
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -2510,6 +2510,18 @@
elif itemsize == 8:
self.mc.MOVUPD(dest_loc, value_loc)
+ def genop_vec_int_mul(self, op, arglocs, resloc):
+ loc0, loc1, itemsize_loc = arglocs
+ itemsize = itemsize_loc.value
+ if itemsize == 2:
+ self.mc.PMULLW(loc0, loc1)
+ elif itemsize == 4:
+ self.mc.PMULLD(loc0, loc1)
+ elif itemsize == 8:
+ self.mc.PMULDQ(loc0, loc1)
+ else:
+ raise NotImplementedError("did not implement integer mul")
+
def genop_vec_int_add(self, op, arglocs, resloc):
loc0, loc1, itemsize_loc = arglocs
itemsize = itemsize_loc.value
@@ -2553,10 +2565,10 @@
srcloc, sizeloc, tosizeloc = arglocs
size = sizeloc.value
tosize = tosizeloc.value
+ if size == tosize:
+ return # already the right size
if size == 4 and tosize == 8:
scratch = X86_64_SCRATCH_REG.value
- print resloc, "[0] <- int64(", srcloc, "[0])"
- print resloc, "[1] <- int64(", srcloc, "[1])"
self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
self.mc.PINSRQ_xri(resloc.value, scratch, 1)
self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
@@ -2564,16 +2576,11 @@
elif size == 8 and tosize == 4:
# is there a better sequence to move them?
scratch = X86_64_SCRATCH_REG.value
- #print resloc, "[0] <- int32(", srcloc, "[0])"
- #66 48 0f 7e c0 movq %xmm0,%rax
- print resloc, "[1] <- int32(", srcloc, "[1])"
- #self.mc.MOVDQ(scratch, srcloc)
- #self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
- #self.mc.PINSRD_xri(resloc.value, scratch, 0)
- #self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
- #self.mc.PINSRD_xri(resloc.value, scratch, 1)
+ self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
+ self.mc.PINSRD_xri(resloc.value, scratch, 0)
+ self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
+ self.mc.PINSRD_xri(resloc.value, scratch, 1)
else:
- py.test.set_trace()
raise NotImplementedError("sign ext missing")
def genop_vec_float_expand(self, op, arglocs, resloc):
@@ -2584,52 +2591,24 @@
elif count == 2:
self.mc.MOVDDUP(resloc, loc0)
- def _shuffle_by_index(self, src_loc, tmp_loc, item_type, size, index, count):
- if index == 0 and count == 1:
- return src_loc
- select = 0
- if item_type == FLOAT:
- if size == 4:
- self.mc.MOVUPS(tmp_loc, src_loc) # TODO could be aligned if xx
- i = 0
- while i < count:
- select |= (index+i<<(i*2))
- i += 1
- self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select)
- return tmp_loc
- else:
- raise NotImplementedError("shuffle by index for float64 not impl")
- else:
- raise NotImplementedError("shuffle by index for non floats")
-
- def genop_vec_float_pack(self, op, arglocs, resloc):
- resultloc, fromloc, tmploc = arglocs
- result = op.result
- indexarg = op.getarg(2)
- countarg = op.getarg(2)
- assert isinstance(result, BoxVector)
- assert isinstance(indexarg, ConstInt)
- assert isinstance(countarg, ConstInt)
- index = indexarg.value
- count = countarg.value
- size = result.item_size
- if size == 4:
- if count == 1:
- raise NotImplementedError("pack: float single pack")
- elif count == 2:
- select = (1 << 2) # move 0 -> 0, 1 -> 1 for toloc
- if index == 0:
- # move 0 -> 2, 1 -> 3 for fromloc
- self.mc.SHUFPS_xxi(resultloc.value, fromloc.value, select | (1 << 2))
- elif index == 2:
- # move 0 -> 2, 1 -> 3 for fromloc
- self.mc.SHUFPS_xxi(resultloc.value, fromloc.value, select | (1 << 6))
- else:
- raise NotImplementedError("pack: only index in {0,2} supported")
- else:
- raise NotImplementedError("pack: count 3 for single float pack not supported")
- elif size == 8:
- raise NotImplementedError("pack: float double pack")
+ # TODO remove
+ #def _shuffle_by_index(self, src_loc, tmp_loc, item_type, size, index, count):
+ # if index == 0 and count == 1:
+ # return src_loc
+ # select = 0
+ # if item_type == FLOAT:
+ # if size == 4:
+ # self.mc.MOVUPS(tmp_loc, src_loc) # TODO could be aligned if xx
+ # i = 0
+ # while i < count:
+ # select |= (index+i<<(i*2))
+ # i += 1
+ # self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select)
+ # return tmp_loc
+ # else:
+ # raise NotImplementedError("shuffle by index for float64 not impl")
+ # else:
+ # raise NotImplementedError("shuffle by index for non floats")
def genop_vec_int_pack(self, op, arglocs, resloc):
resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
@@ -2640,7 +2619,6 @@
si = srcidx
ri = residx
k = count
- print resultloc,"[", residx, "] <- ",sourceloc,"[",srcidx,"] count", count
while k > 0:
if size == 8:
if resultloc.is_xmm:
@@ -2672,23 +2650,86 @@
genop_vec_int_unpack = genop_vec_int_pack
- def genop_vec_float_unpack(self, op, arglocs, resloc):
- loc0, tmploc, indexloc, countloc = arglocs
+ def genop_vec_float_pack(self, op, arglocs, resultloc):
+ resloc, srcloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
count = countloc.value
- index = indexloc.value
- box = op.getarg(0)
- assert isinstance(box, BoxVector)
- item_type = box.item_type
- size = box.item_size
+ residx = residxloc.value
+ srcidx = srcidxloc.value
+ size = sizeloc.value
if size == 4:
- tmploc = self._shuffle_by_index(loc0, tmploc, item_type, size, index, count)
- self.mc.MOVD32_rx(resloc.value, tmploc.value)
+ si = srcidx
+ ri = residx
+ k = count
+ while k > 0:
+ if resloc.is_xmm:
+ src = srcloc.value
+ if not srcloc.is_xmm:
+ # if source is a normal register (unpack)
+ assert count == 1
+ assert si == 0
+ self.mc.MOVSD(X86_64_XMM_SCRATCH_REG, srcloc)
+ src = X86_64_XMM_SCRATCH_REG.value
+ select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
+ self.mc.INSERTPS_xxi(resloc.value, src, select)
+ else:
+ self.mc.PEXTRD_rxi(resloc.value, srcloc.value, si)
+ si += 1
+ ri += 1
+ k -= 1
elif size == 8:
- pass
- #if index == 1:
- # self.mc.SHUFPD_xxi(resloc, loc0, 0|(1<<2))
- #else:
- # self.mc.UNPCKHPD(resloc, loc0)
+ assert resloc.is_xmm
+ if srcloc.is_xmm:
+ if srcidx == 0:
+ if residx == 0:
+ # r = (s[0], r[1])
+ self.mc.MOVSD(resloc, srcloc)
+ else:
+ assert residx == 1
+ # r = (r[0], s[0])
+ self.mc.UNPCKLPD(resloc, srcloc)
+ else:
+ assert srcidx == 1
+ if residx == 0:
+ source = resloc.value
+ if resloc.value != srcloc.value:
+ self.mc.MOVUPD(resloc, srcloc)
+ # r = (s[1], r[0])
+ self.mc.SHUFPD_xxi(resloc.value, source, 1)
+ else:
+ assert residx == 1
+ # r = (r[0], s[1])
+ self.mc.SHUFPD_xxi(resloc.value, srcloc.value, 2)
+
+ genop_vec_float_unpack = genop_vec_float_pack
+ #(self, op, arglocs, resloc):
+ # resultloc, fromloc, tmploc = arglocs
+ # result = op.result
+ # indexarg = op.getarg(2)
+ # countarg = op.getarg(2)
+ # assert isinstance(result, BoxVector)
+ # assert isinstance(indexarg, ConstInt)
+ # assert isinstance(countarg, ConstInt)
+ # index = indexarg.value
+ # count = countarg.value
+ # size = result.item_size
+ # if size == 4:
+ # if count == 1:
+ # raise NotImplementedError("pack: float single pack")
+ # elif count == 2:
+ # select = (1 << 2) # move 0 -> 0, 1 -> 1 for toloc
+ # if index == 0:
+ # # move 0 -> 2, 1 -> 3 for fromloc
+ # self.mc.SHUFPS_xxi(resultloc.value, fromloc.value, select | (1 << 2))
+ # elif index == 2:
+ # # move 0 -> 2, 1 -> 3 for fromloc
+ # self.mc.SHUFPS_xxi(resultloc.value, fromloc.value, select | (1 << 6))
+ # else:
+ # raise NotImplementedError("pack: only index in {0,2} supported")
+ # else:
+ # raise NotImplementedError("pack: count 3 for single float pack not supported")
+ # elif size == 8:
+ # raise NotImplementedError("pack: float double pack")
+
def genop_vec_cast_float_to_singlefloat(self, op, arglocs, resloc):
@@ -2702,15 +2743,15 @@
def genop_vec_cast_singlefloat_to_float(self, op, arglocs, resloc):
loc0, tmploc, indexloc = arglocs
- index = indexloc.value
- if index == 0:
- self.mc.CVTPS2PD(resloc, loc0)
- else:
- assert index == 2
- self.mc.MOVUPS(tmploc, loc0) # TODO could be aligned if xx
- select = (2<<0)|(3<<2) # move pos 2->0,3->1
- self.mc.SHUFPS_xxi(tmploc.value, tmploc.value, select)
- self.mc.CVTPS2PD(resloc, tmploc) # expand
+ self.mc.CVTPS2PD(resloc, arglocs[0])
+ #index = indexloc.value
+ #if index == 0:
+ #else:
+ # assert index == 2
+ # self.mc.MOVUPS(tmploc, loc0) # TODO could be aligned if xx
+ # select = (2<<0)|(3<<2) # move pos 2->0,3->1
+ # self.mc.SHUFPS_xxi(tmploc.value, tmploc.value, select)
+ # self.mc.CVTPS2PD(resloc, tmploc) # expand
# ________________________________________
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1535,28 +1535,6 @@
consider_vec_float_eq = consider_vec_logic
del consider_vec_logic
- def consider_vec_float_pack(self, op):
- args = op.getarglist()
- loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
- result = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
- tmpxvar = TempBox()
- tmploc = self.xrm.force_allocate_reg(tmpxvar)
- self.xrm.possibly_free_var(tmpxvar)
- self.perform(op, [result, loc1, tmploc], result)
-
- def consider_vec_float_unpack(self, op):
- count = op.getarg(2)
- index = op.getarg(1)
- assert isinstance(count, ConstInt)
- assert isinstance(index, ConstInt)
- args = op.getarglist()
- loc0 = self.xrm.make_sure_var_in_reg(op.getarg(0), args)
- result = self.force_allocate_reg(op.result, args)
- tmpxvar = TempBox()
- tmploc = self.xrm.force_allocate_reg(tmpxvar, args)
- self.xrm.possibly_free_var(tmpxvar)
- self.perform(op, [loc0, tmploc, imm(index.value), imm(count.value)], result)
-
def consider_vec_int_pack(self, op):
index = op.getarg(2)
count = op.getarg(3)
@@ -1572,6 +1550,8 @@
arglocs = [resloc, srcloc, imm(index.value), imm(0), imm(count.value), imm(size)]
self.perform(op, arglocs, resloc)
+ consider_vec_float_pack = consider_vec_int_pack
+
def consider_vec_int_unpack(self, op):
index = op.getarg(1)
count = op.getarg(2)
@@ -1579,14 +1559,23 @@
assert isinstance(count, ConstInt)
args = op.getarglist()
srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
- resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ if isinstance(op.result, BoxVector):
+ resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ assert isinstance(op.result, BoxVector)
+ size = op.result.item_size
+ else:
+ # unpack into iX box
+ resloc = self.force_allocate_reg(op.result, args)
+ arg = op.getarg(0)
+ assert isinstance(arg, BoxVector)
+ size = arg.item_size
residx = 0
- assert isinstance(op.result, BoxVector)
args = op.getarglist()
- size = op.result.item_size
arglocs = [resloc, srcloc, imm(residx), imm(index.value), imm(count.value), imm(size)]
self.perform(op, arglocs, resloc)
+ consider_vec_float_unpack = consider_vec_int_unpack
+
def consider_vec_float_expand(self, op):
args = op.getarglist()
srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -681,10 +681,16 @@
PADDD = _binaryop('PADDD')
PADDW = _binaryop('PADDW')
PADDB = _binaryop('PADDB')
+
PSUBQ = _binaryop('PSUBQ')
PSUBD = _binaryop('PSUBD')
PSUBW = _binaryop('PSUBW')
PSUBQ = _binaryop('PSUBQ')
+
+ PMULDQ = _binaryop('PMULDQ')
+ PMULLD = _binaryop('PMULLD')
+ PMULLW = _binaryop('PMULLW')
+
PAND = _binaryop('PAND')
POR = _binaryop('POR')
PXOR = _binaryop('PXOR')
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -740,6 +740,7 @@
UNPCKHPS_xx = xmminsn( rex_nw, '\x0F\x15', register(1, 8), register(2), '\xC0')
MOVDDUP_xx = xmminsn('\xF2', rex_nw, '\x0F\x12', register(1, 8), register(2), '\xC0')
SHUFPS_xxi = xmminsn(rex_nw, '\x0F\xC6', register(1,8), register(2), '\xC0', immediate(3, 'b'))
+ SHUFPD_xxi = xmminsn('\x66', rex_nw, '\x0F\xC6', register(1,8), register(2), '\xC0', immediate(3, 'b'))
PSHUFD_xxi = xmminsn('\x66', rex_nw, '\x0F\x70', register(1,8), register(2), '\xC0', immediate(3, 'b'))
@@ -748,10 +749,13 @@
PEXTRD_rxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x16', register(2,8), register(1), '\xC0', immediate(3, 'b'))
PEXTRW_rxi = xmminsn('\x66', rex_nw, '\x0F\xC4', register(2,8), register(1), '\xC0', immediate(3, 'b'))
PEXTRB_rxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x14', register(2,8), register(1), '\xC0', immediate(3, 'b'))
+ EXTRACTPS_rxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x17', register(2,8), register(1), '\xC0', immediate(3, 'b'))
+
PINSRQ_xri = xmminsn('\x66', rex_w, '\x0F\x3A\x22', register(1,8), register(2), '\xC0', immediate(3, 'b'))
PINSRD_xri = xmminsn('\x66', rex_nw, '\x0F\x3A\x22', register(1,8), register(2), '\xC0', immediate(3, 'b'))
PINSRW_xri = xmminsn('\x66', rex_nw, '\x0F\xC5', register(1,8), register(2), '\xC0', immediate(3, 'b'))
PINSRB_xri = xmminsn('\x66', rex_nw, '\x0F\x3A\x20', register(1,8), register(2), '\xC0', immediate(3, 'b'))
+ INSERTPS_xxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x21', register(1,8), register(2), '\xC0', immediate(3, 'b'))
# ------------------------------------------------------------
@@ -971,10 +975,16 @@
define_pxmm_insn('PADDD_x*', '\xFE')
define_pxmm_insn('PADDW_x*', '\xFD')
define_pxmm_insn('PADDB_x*', '\xFC')
+
define_pxmm_insn('PSUBQ_x*', '\xFB')
define_pxmm_insn('PSUBD_x*', '\xFA')
define_pxmm_insn('PSUBW_x*', '\xF9')
define_pxmm_insn('PSUBB_x*', '\xF8')
+
+define_pxmm_insn('PMULDQ_x*', '\x38\x28')
+define_pxmm_insn('PMULLD_x*', '\x38\x40')
+define_pxmm_insn('PMULLW_x*', '\xD5')
+
define_pxmm_insn('PAND_x*', '\xDB')
define_pxmm_insn('POR_x*', '\xEB')
define_pxmm_insn('PXOR_x*', '\xEF')
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -402,6 +402,7 @@
(j, vbox) = sched_data.box_to_vbox.get(arg, (-1, None))
if vbox:
arg_cloned = arg.clonebox()
+ py.test.set_trace()
cj = ConstInt(j)
ci = ConstInt(1)
opnum = rop.VEC_FLOAT_UNPACK
@@ -533,11 +534,12 @@
class PackType(PrimitiveTypeMixin):
UNKNOWN_TYPE = '-'
- def __init__(self, type, size, signed):
+ def __init__(self, type, size, signed, count=-1):
assert type in (FLOAT, INT, PackType.UNKNOWN_TYPE)
self.type = type
self.size = size
self.signed = signed
+ self.count = count
def gettype(self):
return self.type
@@ -551,6 +553,9 @@
def get_byte_size(self):
return self.size
+ def getcount(self):
+ return self.count
+
@staticmethod
def by_descr(descr):
_t = INT
@@ -563,7 +568,7 @@
return self.type != PackType.UNKNOWN_TYPE and self.size > 0
def new_vector_box(self, count):
- return BoxVector(self.type, count, self.size, self.signed)
+ return BoxVector(self.type, count, self.size, self.signed, self.count)
def record_vbox(self, vbox):
if self.type == PackType.UNKNOWN_TYPE:
@@ -581,14 +586,14 @@
class OpToVectorOp(object):
- def __init__(self, arg_ptypes, result_ptype, has_ptype=False, index=-1, result_vsize_arg=-1):
+ def __init__(self, arg_ptypes, result_ptype, has_ptype=False, result_vsize_arg=-1):
self.arg_ptypes = arg_ptypes
self.result_ptype = result_ptype
self.has_ptype = has_ptype
- # TODO remove them?
- self.result = result_ptype != None
self.result_vsize_arg = result_vsize_arg
- self.index = index
+
+ def has_result(self):
+ return self.result_ptype != None
def get_result_ptype(self):
return self.result_ptype
@@ -604,9 +609,12 @@
return self.arg_ptypes[i] is not None
PT_FLOAT = PackType(FLOAT, 4, False)
+PT_FLOAT_2 = PackType(FLOAT, 4, False, count=2)
PT_DOUBLE = PackType(FLOAT, 8, False)
PT_INT_GENERIC = PackType(INT, -1, True)
PT_INT64 = PackType(INT, 8, True)
+PT_INT32 = PackType(INT, 4, True)
+PT_INT32_2 = PackType(INT, 4, True, count=2)
PT_FLOAT_GENERIC = PackType(INT, -1, True)
PT_GENERIC = PackType(PackType.UNKNOWN_TYPE, -1, True)
@@ -626,11 +634,10 @@
rop.VEC_RAW_STORE: OpToVectorOp((None,None,PT_GENERIC,), None, has_ptype=True),
rop.VEC_SETARRAYITEM_RAW: OpToVectorOp((None,None,PT_GENERIC,), None, has_ptype=True),
- rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: OpToVectorOp((PT_DOUBLE,), PT_FLOAT),
- # TODO remove index
- rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: OpToVectorOp((PT_FLOAT,), PT_DOUBLE, index=1),
- rop.VEC_CAST_FLOAT_TO_INT: OpToVectorOp((PT_DOUBLE,), PT_INT64),
- rop.VEC_CAST_INT_TO_FLOAT: OpToVectorOp((PT_INT64,), PT_DOUBLE),
+ rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: OpToVectorOp((PT_DOUBLE,), PT_FLOAT_2),
+ rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: OpToVectorOp((PT_FLOAT_2,), PT_DOUBLE),
+ rop.VEC_CAST_FLOAT_TO_INT: OpToVectorOp((PT_DOUBLE,), PT_INT32_2),
+ rop.VEC_CAST_INT_TO_FLOAT: OpToVectorOp((PT_INT32_2,), PT_DOUBLE),
}
@@ -684,9 +691,6 @@
if tovector is None:
raise NotImplementedError("vecop map entry missing. trans: pack -> vop")
- if tovector.index != -1:
- args.append(ConstInt(self.pack_off))
-
args.append(ConstInt(self.pack_ops))
vop = ResOperation(op0.vector, args, op0.result, op0.getdescr())
@@ -698,7 +702,7 @@
if arg_ptype.size == -1:
arg_ptype = self.pack.ptype
self.vector_arg(vop, i, arg_ptype)
- if tovector.result:
+ if tovector.has_result():
self.vector_result(vop, tovector)
self.preamble_ops.append(vop)
@@ -742,11 +746,13 @@
#
vop.result = vbox
i = self.pack_off
+ off = 0 # assumption. the result is always placed at index [0,...,x]
end = i + self.pack_ops
while i < end:
op = ops[i].getoperation()
- self.box_to_vbox[op.result] = (i, vbox)
+ self.box_to_vbox[op.result] = (off, vbox)
i += 1
+ off += 1
def box_vector(self, ptype):
""" TODO remove this? """
@@ -770,6 +776,16 @@
# the argument has more items than the operation is able to process!
vbox = self.unpack(vbox, self.pack_off, packable, arg_ptype)
vbox = self.extend(vbox, arg_ptype)
+
+ # The instruction takes less items than the vector has.
+ # Unpack if not at pack_off 0
+ count = arg_ptype.getcount()
+ if count != -1 and count < vbox.item_count:
+ if self.pack_off == 0:
+ pass # right place already
+ else:
+ vbox = self.unpack(vbox, self.pack_off, count, arg_ptype)
+
vop.setarg(argidx, vbox)
return vbox
diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -467,7 +467,7 @@
# double -> float: v2 = cast(v1, 2) equal to v2 = (v1[0], v1[1], X, X)
'VEC_CAST_FLOAT_TO_SINGLEFLOAT/2',
# v4 = cast(v3, 0, 2), v4 = (v3[0], v3[1])
- 'VEC_CAST_SINGLEFLOAT_TO_FLOAT/3',
+ 'VEC_CAST_SINGLEFLOAT_TO_FLOAT/2',
'VEC_CAST_FLOAT_TO_INT/2',
'VEC_CAST_INT_TO_FLOAT/2',
More information about the pypy-commit
mailing list