[pypy-commit] pypy vecopt: finished the unpacking for float32/64 and int32/64. added x86 packed mul operations (don't know if we ever can use them for int64)

Tue May 19 16:59:46 CEST 2015

Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r77392:199b27a762f8
Date: 2015-05-19 16:14 +0200
http://bitbucket.org/pypy/pypy/changeset/199b27a762f8/

Log:	finished the unpacking for float32/64 and int32/64. added x86 packed
	mul operations (don't know if we can ever use them for int64). typed
	the vector box arguments (including count); they are able to
	automatically unpack/pack instructions if they are not in place.
	rewrote most of unpack/pack float (as mentioned earlier), now using
	insertps for float32.

diff --git a/pypy/module/micronumpy/compile.py b/pypy/module/micronumpy/compile.py
--- a/pypy/module/micronumpy/compile.py
+++ b/pypy/module/micronumpy/compile.py
@@ -209,7 +209,9 @@
         return self.wrap(1)
 
     def mul(self, w_obj1, w_obj2):
-        return self.wrap(1)
+        assert isinstance(w_obj1, boxes.W_GenericBox) 
+        assert isinstance(w_obj2, boxes.W_GenericBox) 
+        return w_obj1.descr_mul(self, w_obj2)
 
     def pow(self, w_obj1, w_obj2, _):
         return self.wrap(1)
@@ -324,7 +326,7 @@
         return W_TypeObject(w_obj.typedef.name)
 
     def call_function(self, tp, w_dtype, *args):
-        if tp is self.w_float
+        if tp is self.w_float:
             if isinstance(w_dtype, boxes.W_Float64Box):
                 return FloatObject(float(w_dtype.value))
             if isinstance(w_dtype, boxes.W_Float32Box):
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -200,28 +200,83 @@
         return """
         a = astype(|30|, int32)
         b = a + 1i
+        d = astype(|30|, int32)
+        c = d + 2.0
         x1 = b -> 7
         x2 = b -> 8
-        x3 = b -> 9
-        x4 = b -> 10
-        r = x1 + x2 + x3 + x4
-        r
+        x3 = c -> 11
+        x4 = c -> 12
+        x1 + x2 + x3 + x4
         """
-        #return """
-        #a = astype(|30|, int32)
-        #b = a + 1i
-        #c = a + 2.0
-        #x1 = b -> 7
-        #x2 = b -> 8
-        #x3 = c -> 11
-        #x4 = c -> 12
-        #x1 + x2 + x3 + x4
-        #"""
     def test_int32_add_const(self):
         result = self.run("int32_add_const")
-        assert int(result) == 7+1+8+1+9+1+10+1
-        self.check_vectorized(1, 1)
+        assert int(result) == 7+1+8+1+11+2+12+2
+        self.check_vectorized(2, 2)
 
+    def define_int_mul_array():
+        return """
+        a = astype(|30|, int)
+        b = astype(|30|, int)
+        c = a * b
+        x1 = c -> 7
+        x2 = c -> 8
+        x3 = c -> 11
+        x4 = c -> 12
+        x1 + x2 + x3 + x4
+        """
+    def test_int_mul_array(self):
+        py.test.skip("how to multiply quad word integers?")
+        result = self.run("int_mul_array")
+        assert int(result) == 7*7+8*8+11*11+12*12
+        self.check_vectorized(2, 2)
+
+    def define_float_mul_array():
+        return """
+        a = astype(|30|, float)
+        b = astype(|30|, float)
+        c = a * b
+        x1 = c -> 7
+        x2 = c -> 8
+        x3 = c -> 11
+        x4 = c -> 12
+        x1 + x2 + x3 + x4
+        """
+    def test_float_mul_array(self):
+        result = self.run("float_mul_array")
+        assert int(result) == 7*7+8*8+11*11+12*12
+        self.check_vectorized(2, 2)
+
+    def define_int32_mul_array():
+        return """
+        a = astype(|30|, int32)
+        b = astype(|30|, int32)
+        c = a * b
+        x1 = c -> 7
+        x2 = c -> 8
+        x3 = c -> 11
+        x4 = c -> 12
+        x1 + x2 + x3 + x4
+        """
+    def test_int32_mul_array(self):
+        result = self.run("int32_mul_array")
+        assert int(result) == 7*7+8*8+11*11+12*12
+        self.check_vectorized(2, 2)
+
+    def define_float32_mul_array():
+        return """
+        a = astype(|30|, float32)
+        b = astype(|30|, float32)
+        c = a * b
+        x1 = c -> 7
+        x2 = c -> 8
+        x3 = c -> 11
+        x4 = c -> 12
+        x1 + x2 + x3 + x4
+        """
+    def test_float32_mul_array(self):
+        result = self.run("float32_mul_array")
+        assert int(result) == 7*7+8*8+11*11+12*12
+        self.check_vectorized(2, 2)
 
     def define_pow():
         return """
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -2510,6 +2510,18 @@
             elif itemsize == 8:
                 self.mc.MOVUPD(dest_loc, value_loc)
 
+    def genop_vec_int_mul(self, op, arglocs, resloc):
+        loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 2:
+            self.mc.PMULLW(loc0, loc1)
+        elif itemsize == 4:
+            self.mc.PMULLD(loc0, loc1)
+        elif itemsize == 8:
+            self.mc.PMULDQ(loc0, loc1)
+        else:
+            raise NotImplementedError("did not implement integer mul")
+
     def genop_vec_int_add(self, op, arglocs, resloc):
         loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
@@ -2553,10 +2565,10 @@
         srcloc, sizeloc, tosizeloc = arglocs
         size = sizeloc.value
         tosize = tosizeloc.value
+        if size == tosize:
+            return # already the right size
         if size == 4 and tosize == 8:
             scratch = X86_64_SCRATCH_REG.value
-            print resloc, "[0] <- int64(", srcloc, "[0])"
-            print resloc, "[1] <- int64(", srcloc, "[1])"
             self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
             self.mc.PINSRQ_xri(resloc.value, scratch, 1)
             self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
@@ -2564,16 +2576,11 @@
         elif size == 8 and tosize == 4:
             # is there a better sequence to move them?
             scratch = X86_64_SCRATCH_REG.value
-            #print resloc, "[0] <- int32(", srcloc, "[0])"
-            #66 48 0f 7e c0     movq   %xmm0,%rax
-            print resloc, "[1] <- int32(", srcloc, "[1])"
-            #self.mc.MOVDQ(scratch, srcloc)
-            #self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
-            #self.mc.PINSRD_xri(resloc.value, scratch, 0)
-            #self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
-            #self.mc.PINSRD_xri(resloc.value, scratch, 1)
+            self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
+            self.mc.PINSRD_xri(resloc.value, scratch, 0)
+            self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
+            self.mc.PINSRD_xri(resloc.value, scratch, 1)
         else:
-            py.test.set_trace()
             raise NotImplementedError("sign ext missing")
 
     def genop_vec_float_expand(self, op, arglocs, resloc):
@@ -2584,52 +2591,24 @@
         elif count == 2:
             self.mc.MOVDDUP(resloc, loc0)
 
-    def _shuffle_by_index(self, src_loc, tmp_loc, item_type, size, index, count):
-        if index == 0 and count == 1:
-            return src_loc
-        select = 0
-        if item_type == FLOAT:
-            if size == 4:
-                self.mc.MOVUPS(tmp_loc, src_loc) # TODO could be aligned if xx
-                i = 0
-                while i < count:
-                    select |= (index+i<<(i*2))
-                    i += 1
-                self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select)
-                return tmp_loc
-            else:
-                raise NotImplementedError("shuffle by index for float64 not impl")
-        else:
-            raise NotImplementedError("shuffle by index for non floats")
-
-    def genop_vec_float_pack(self, op, arglocs, resloc):
-        resultloc, fromloc, tmploc = arglocs
-        result = op.result
-        indexarg = op.getarg(2)
-        countarg = op.getarg(2)
-        assert isinstance(result, BoxVector)
-        assert isinstance(indexarg, ConstInt)
-        assert isinstance(countarg, ConstInt)
-        index = indexarg.value
-        count = countarg.value
-        size = result.item_size
-        if size == 4:
-            if count == 1:
-                raise NotImplementedError("pack: float single pack")
-            elif count == 2:
-                select = (1 << 2) # move 0 -> 0, 1 -> 1 for toloc
-                if index == 0:
-                    # move 0 -> 2, 1 -> 3 for fromloc
-                    self.mc.SHUFPS_xxi(resultloc.value, fromloc.value, select | (1 << 2))
-                elif index == 2:
-                    # move 0 -> 2, 1 -> 3 for fromloc
-                    self.mc.SHUFPS_xxi(resultloc.value, fromloc.value, select | (1 << 6))
-                else:
-                    raise NotImplementedError("pack: only index in {0,2} supported")
-            else:
-                raise NotImplementedError("pack: count 3 for single float pack not supported")
-        elif size == 8:
-            raise NotImplementedError("pack: float double pack")
+    # TODO remove
+    #def _shuffle_by_index(self, src_loc, tmp_loc, item_type, size, index, count):
+    #    if index == 0 and count == 1:
+    #        return src_loc
+    #    select = 0
+    #    if item_type == FLOAT:
+    #        if size == 4:
+    #            self.mc.MOVUPS(tmp_loc, src_loc) # TODO could be aligned if xx
+    #            i = 0
+    #            while i < count:
+    #                select |= (index+i<<(i*2))
+    #                i += 1
+    #            self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select)
+    #            return tmp_loc
+    #        else:
+    #            raise NotImplementedError("shuffle by index for float64 not impl")
+    #    else:
+    #        raise NotImplementedError("shuffle by index for non floats")
 
     def genop_vec_int_pack(self, op, arglocs, resloc):
         resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
@@ -2640,7 +2619,6 @@
         si = srcidx
         ri = residx
         k = count
-        print resultloc,"[", residx, "] <- ",sourceloc,"[",srcidx,"] count", count
         while k > 0:
             if size == 8:
                 if resultloc.is_xmm:
@@ -2672,23 +2650,86 @@
 
     genop_vec_int_unpack = genop_vec_int_pack
 
-    def genop_vec_float_unpack(self, op, arglocs, resloc):
-        loc0, tmploc, indexloc, countloc = arglocs
+    def genop_vec_float_pack(self, op, arglocs, resultloc):
+        resloc, srcloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
         count = countloc.value
-        index = indexloc.value
-        box = op.getarg(0)
-        assert isinstance(box, BoxVector)
-        item_type = box.item_type
-        size = box.item_size
+        residx = residxloc.value
+        srcidx = srcidxloc.value
+        size = sizeloc.value
         if size == 4:
-            tmploc = self._shuffle_by_index(loc0, tmploc, item_type, size, index, count)
-            self.mc.MOVD32_rx(resloc.value, tmploc.value)
+            si = srcidx
+            ri = residx
+            k = count
+            while k > 0:
+                if resloc.is_xmm:
+                    src = srcloc.value
+                    if not srcloc.is_xmm:
+                        # if source is a normal register (unpack)
+                        assert count == 1
+                        assert si == 0
+                        self.mc.MOVSD(X86_64_XMM_SCRATCH_REG, srcloc)
+                        src = X86_64_XMM_SCRATCH_REG.value
+                    select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
+                    self.mc.INSERTPS_xxi(resloc.value, src, select)
+                else:
+                    self.mc.PEXTRD_rxi(resloc.value, srcloc.value, si)
+                si += 1
+                ri += 1
+                k -= 1
         elif size == 8:
-            pass
-            #if index == 1:
-            #    self.mc.SHUFPD_xxi(resloc, loc0, 0|(1<<2))
-            #else:
-            #    self.mc.UNPCKHPD(resloc, loc0)
+            assert resloc.is_xmm
+            if srcloc.is_xmm:
+                if srcidx == 0:
+                    if residx == 0:
+                        # r = (s[0], r[1])
+                        self.mc.MOVSD(resloc, srcloc)
+                    else:
+                        assert residx == 1
+                        # r = (r[0], s[0])
+                        self.mc.UNPCKLPD(resloc, srcloc)
+                else:
+                    assert srcidx == 1
+                    if residx == 0:
+                        source = resloc.value
+                        if resloc.value != srcloc.value:
+                            self.mc.MOVUPD(resloc, srcloc)
+                        # r = (s[1], r[0])
+                        self.mc.SHUFPD_xxi(resloc.value, source, 1)
+                    else:
+                        assert residx == 1
+                        # r = (r[0], s[1])
+                        self.mc.SHUFPD_xxi(resloc.value, srcloc.value, 2)
+
+    genop_vec_float_unpack = genop_vec_float_pack
+    #(self, op, arglocs, resloc):
+    #    resultloc, fromloc, tmploc = arglocs
+    #    result = op.result
+    #    indexarg = op.getarg(2)
+    #    countarg = op.getarg(2)
+    #    assert isinstance(result, BoxVector)
+    #    assert isinstance(indexarg, ConstInt)
+    #    assert isinstance(countarg, ConstInt)
+    #    index = indexarg.value
+    #    count = countarg.value
+    #    size = result.item_size
+    #    if size == 4:
+    #        if count == 1:
+    #            raise NotImplementedError("pack: float single pack")
+    #        elif count == 2:
+    #            select = (1 << 2) # move 0 -> 0, 1 -> 1 for toloc
+    #            if index == 0:
+    #                # move 0 -> 2, 1 -> 3 for fromloc
+    #                self.mc.SHUFPS_xxi(resultloc.value, fromloc.value, select | (1 << 2))
+    #            elif index == 2:
+    #                # move 0 -> 2, 1 -> 3 for fromloc
+    #                self.mc.SHUFPS_xxi(resultloc.value, fromloc.value, select | (1 << 6))
+    #            else:
+    #                raise NotImplementedError("pack: only index in {0,2} supported")
+    #        else:
+    #            raise NotImplementedError("pack: count 3 for single float pack not supported")
+    #    elif size == 8:
+    #        raise NotImplementedError("pack: float double pack")
+
 
 
     def genop_vec_cast_float_to_singlefloat(self, op, arglocs, resloc):
@@ -2702,15 +2743,15 @@
 
     def genop_vec_cast_singlefloat_to_float(self, op, arglocs, resloc):
         loc0, tmploc, indexloc = arglocs
-        index = indexloc.value
-        if index == 0:
-            self.mc.CVTPS2PD(resloc, loc0)
-        else:
-            assert index == 2
-            self.mc.MOVUPS(tmploc, loc0) # TODO could be aligned if xx
-            select = (2<<0)|(3<<2) # move pos 2->0,3->1
-            self.mc.SHUFPS_xxi(tmploc.value, tmploc.value, select)
-            self.mc.CVTPS2PD(resloc, tmploc) # expand
+        self.mc.CVTPS2PD(resloc, arglocs[0])
+        #index = indexloc.value
+        #if index == 0:
+        #else:
+        #    assert index == 2
+        #    self.mc.MOVUPS(tmploc, loc0) # TODO could be aligned if xx
+        #    select = (2<<0)|(3<<2) # move pos 2->0,3->1
+        #    self.mc.SHUFPS_xxi(tmploc.value, tmploc.value, select)
+        #    self.mc.CVTPS2PD(resloc, tmploc) # expand
 
     # ________________________________________
 
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1535,28 +1535,6 @@
     consider_vec_float_eq = consider_vec_logic
     del consider_vec_logic
 
-    def consider_vec_float_pack(self, op):
-        args = op.getarglist()
-        loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
-        result =  self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
-        tmpxvar = TempBox()
-        tmploc = self.xrm.force_allocate_reg(tmpxvar)
-        self.xrm.possibly_free_var(tmpxvar)
-        self.perform(op, [result, loc1, tmploc], result)
-
-    def consider_vec_float_unpack(self, op):
-        count = op.getarg(2)
-        index = op.getarg(1)
-        assert isinstance(count, ConstInt)
-        assert isinstance(index, ConstInt)
-        args = op.getarglist()
-        loc0 = self.xrm.make_sure_var_in_reg(op.getarg(0), args)
-        result = self.force_allocate_reg(op.result, args)
-        tmpxvar = TempBox()
-        tmploc = self.xrm.force_allocate_reg(tmpxvar, args)
-        self.xrm.possibly_free_var(tmpxvar)
-        self.perform(op, [loc0, tmploc, imm(index.value), imm(count.value)], result)
-
     def consider_vec_int_pack(self, op):
         index = op.getarg(2)
         count = op.getarg(3)
@@ -1572,6 +1550,8 @@
         arglocs = [resloc, srcloc, imm(index.value), imm(0), imm(count.value), imm(size)]
         self.perform(op, arglocs, resloc)
 
+    consider_vec_float_pack = consider_vec_int_pack
+
     def consider_vec_int_unpack(self, op):
         index = op.getarg(1)
         count = op.getarg(2)
@@ -1579,14 +1559,23 @@
         assert isinstance(count, ConstInt)
         args = op.getarglist()
         srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
-        resloc =  self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+        if isinstance(op.result, BoxVector):
+            resloc =  self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+            assert isinstance(op.result, BoxVector)
+            size = op.result.item_size
+        else:
+            # unpack into iX box
+            resloc =  self.force_allocate_reg(op.result, args)
+            arg = op.getarg(0)
+            assert isinstance(arg, BoxVector)
+            size = arg.item_size
         residx = 0
-        assert isinstance(op.result, BoxVector)
         args = op.getarglist()
-        size = op.result.item_size
         arglocs = [resloc, srcloc, imm(residx), imm(index.value), imm(count.value), imm(size)]
         self.perform(op, arglocs, resloc)
 
+    consider_vec_float_unpack = consider_vec_int_unpack
+
     def consider_vec_float_expand(self, op):
         args = op.getarglist()
         srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -681,10 +681,16 @@
     PADDD = _binaryop('PADDD')
     PADDW = _binaryop('PADDW')
     PADDB = _binaryop('PADDB')
+
     PSUBQ = _binaryop('PSUBQ')
     PSUBD = _binaryop('PSUBD')
     PSUBW = _binaryop('PSUBW')
     PSUBQ = _binaryop('PSUBQ')
+
+    PMULDQ = _binaryop('PMULDQ')
+    PMULLD = _binaryop('PMULLD')
+    PMULLW = _binaryop('PMULLW')
+
     PAND  = _binaryop('PAND')
     POR   = _binaryop('POR')
     PXOR  = _binaryop('PXOR')
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -740,6 +740,7 @@
     UNPCKHPS_xx = xmminsn(        rex_nw, '\x0F\x15', register(1, 8), register(2), '\xC0')
     MOVDDUP_xx = xmminsn('\xF2', rex_nw, '\x0F\x12', register(1, 8), register(2), '\xC0')
     SHUFPS_xxi = xmminsn(rex_nw, '\x0F\xC6', register(1,8), register(2), '\xC0', immediate(3, 'b'))
+    SHUFPD_xxi = xmminsn('\x66', rex_nw, '\x0F\xC6', register(1,8), register(2), '\xC0', immediate(3, 'b'))
 
     PSHUFD_xxi = xmminsn('\x66', rex_nw, '\x0F\x70', register(1,8), register(2), '\xC0', immediate(3, 'b'))
 
@@ -748,10 +749,13 @@
     PEXTRD_rxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x16', register(2,8), register(1), '\xC0', immediate(3, 'b'))
     PEXTRW_rxi = xmminsn('\x66', rex_nw, '\x0F\xC4', register(2,8), register(1), '\xC0', immediate(3, 'b'))
     PEXTRB_rxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x14', register(2,8), register(1), '\xC0', immediate(3, 'b'))
+    EXTRACTPS_rxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x17', register(2,8), register(1), '\xC0', immediate(3, 'b'))
+    
     PINSRQ_xri = xmminsn('\x66', rex_w, '\x0F\x3A\x22', register(1,8), register(2), '\xC0', immediate(3, 'b'))
     PINSRD_xri = xmminsn('\x66', rex_nw, '\x0F\x3A\x22', register(1,8), register(2), '\xC0', immediate(3, 'b'))
     PINSRW_xri = xmminsn('\x66', rex_nw, '\x0F\xC5', register(1,8), register(2), '\xC0', immediate(3, 'b'))
     PINSRB_xri = xmminsn('\x66', rex_nw, '\x0F\x3A\x20', register(1,8), register(2), '\xC0', immediate(3, 'b'))
+    INSERTPS_xxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x21', register(1,8), register(2), '\xC0', immediate(3, 'b'))
 
     # ------------------------------------------------------------
 
@@ -971,10 +975,16 @@
 define_pxmm_insn('PADDD_x*',     '\xFE')
 define_pxmm_insn('PADDW_x*',     '\xFD')
 define_pxmm_insn('PADDB_x*',     '\xFC')
+
 define_pxmm_insn('PSUBQ_x*',     '\xFB')
 define_pxmm_insn('PSUBD_x*',     '\xFA')
 define_pxmm_insn('PSUBW_x*',     '\xF9')
 define_pxmm_insn('PSUBB_x*',     '\xF8')
+
+define_pxmm_insn('PMULDQ_x*',    '\x38\x28')
+define_pxmm_insn('PMULLD_x*',    '\x38\x40')
+define_pxmm_insn('PMULLW_x*',    '\xD5')
+
 define_pxmm_insn('PAND_x*',      '\xDB')
 define_pxmm_insn('POR_x*',       '\xEB')
 define_pxmm_insn('PXOR_x*',      '\xEF')
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -402,6 +402,7 @@
         (j, vbox) = sched_data.box_to_vbox.get(arg, (-1, None))
         if vbox:
             arg_cloned = arg.clonebox()
+            py.test.set_trace()
             cj = ConstInt(j)
             ci = ConstInt(1)
             opnum = rop.VEC_FLOAT_UNPACK
@@ -533,11 +534,12 @@
 class PackType(PrimitiveTypeMixin):
     UNKNOWN_TYPE = '-'
 
-    def __init__(self, type, size, signed):
+    def __init__(self, type, size, signed, count=-1):
         assert type in (FLOAT, INT, PackType.UNKNOWN_TYPE)
         self.type = type
         self.size = size
         self.signed = signed
+        self.count = count
 
     def gettype(self):
         return self.type
@@ -551,6 +553,9 @@
     def get_byte_size(self):
         return self.size
 
+    def getcount(self):
+        return self.count
+
     @staticmethod
     def by_descr(descr):
         _t = INT
@@ -563,7 +568,7 @@
         return self.type != PackType.UNKNOWN_TYPE and self.size > 0
 
     def new_vector_box(self, count):
-        return BoxVector(self.type, count, self.size, self.signed)
+        return BoxVector(self.type, count, self.size, self.signed, self.count)
 
     def record_vbox(self, vbox):
         if self.type == PackType.UNKNOWN_TYPE:
@@ -581,14 +586,14 @@
 
 
 class OpToVectorOp(object):
-    def __init__(self, arg_ptypes, result_ptype, has_ptype=False, index=-1, result_vsize_arg=-1):
+    def __init__(self, arg_ptypes, result_ptype, has_ptype=False, result_vsize_arg=-1):
         self.arg_ptypes = arg_ptypes
         self.result_ptype = result_ptype
         self.has_ptype = has_ptype
-        # TODO remove them?
-        self.result = result_ptype != None
         self.result_vsize_arg = result_vsize_arg
-        self.index = index
+
+    def has_result(self):
+        return self.result_ptype != None
 
     def get_result_ptype(self):
         return self.result_ptype
@@ -604,9 +609,12 @@
         return self.arg_ptypes[i] is not None
 
 PT_FLOAT = PackType(FLOAT, 4, False)
+PT_FLOAT_2 = PackType(FLOAT, 4, False, count=2)
 PT_DOUBLE = PackType(FLOAT, 8, False)
 PT_INT_GENERIC = PackType(INT, -1, True)
 PT_INT64 = PackType(INT, 8, True)
+PT_INT32 = PackType(INT, 4, True)
+PT_INT32_2 = PackType(INT, 4, True, count=2)
 PT_FLOAT_GENERIC = PackType(INT, -1, True)
 PT_GENERIC = PackType(PackType.UNKNOWN_TYPE, -1, True)
 
@@ -626,11 +634,10 @@
     rop.VEC_RAW_STORE:        OpToVectorOp((None,None,PT_GENERIC,), None, has_ptype=True),
     rop.VEC_SETARRAYITEM_RAW: OpToVectorOp((None,None,PT_GENERIC,), None, has_ptype=True),
 
-    rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: OpToVectorOp((PT_DOUBLE,), PT_FLOAT),
-    # TODO remove index
-    rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: OpToVectorOp((PT_FLOAT,), PT_DOUBLE, index=1),
-    rop.VEC_CAST_FLOAT_TO_INT: OpToVectorOp((PT_DOUBLE,), PT_INT64),
-    rop.VEC_CAST_INT_TO_FLOAT: OpToVectorOp((PT_INT64,), PT_DOUBLE),
+    rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: OpToVectorOp((PT_DOUBLE,), PT_FLOAT_2),
+    rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: OpToVectorOp((PT_FLOAT_2,), PT_DOUBLE),
+    rop.VEC_CAST_FLOAT_TO_INT: OpToVectorOp((PT_DOUBLE,), PT_INT32_2),
+    rop.VEC_CAST_INT_TO_FLOAT: OpToVectorOp((PT_INT32_2,), PT_DOUBLE),
 }
 
 
@@ -684,9 +691,6 @@
         if tovector is None:
             raise NotImplementedError("vecop map entry missing. trans: pack -> vop")
 
-        if tovector.index != -1:
-            args.append(ConstInt(self.pack_off))
-
         args.append(ConstInt(self.pack_ops))
         vop = ResOperation(op0.vector, args, op0.result, op0.getdescr())
 
@@ -698,7 +702,7 @@
                 if arg_ptype.size == -1:
                     arg_ptype = self.pack.ptype
                 self.vector_arg(vop, i, arg_ptype)
-        if tovector.result:
+        if tovector.has_result():
             self.vector_result(vop, tovector)
 
         self.preamble_ops.append(vop)
@@ -742,11 +746,13 @@
         #
         vop.result = vbox
         i = self.pack_off
+        off = 0 # assumption. the result is always placed at index [0,...,x]
         end = i + self.pack_ops
         while i < end:
             op = ops[i].getoperation()
-            self.box_to_vbox[op.result] = (i, vbox)
+            self.box_to_vbox[op.result] = (off, vbox)
             i += 1
+            off += 1
 
     def box_vector(self, ptype):
         """ TODO remove this? """
@@ -770,6 +776,16 @@
             # the argument has more items than the operation is able to process!
             vbox = self.unpack(vbox, self.pack_off, packable, arg_ptype)
             vbox = self.extend(vbox, arg_ptype)
+
+        # The instruction takes less items than the vector has.
+        # Unpack if not at pack_off 0
+        count = arg_ptype.getcount()
+        if count != -1 and count < vbox.item_count:
+            if self.pack_off == 0:
+                pass # right place already
+            else:
+                vbox = self.unpack(vbox, self.pack_off, count, arg_ptype)
+
         vop.setarg(argidx, vbox)
         return vbox
 
diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -467,7 +467,7 @@
     # double -> float: v2 = cast(v1, 2) equal to v2 = (v1[0], v1[1], X, X)
     'VEC_CAST_FLOAT_TO_SINGLEFLOAT/2',
     # v4 = cast(v3, 0, 2), v4 = (v3[0], v3[1])
-    'VEC_CAST_SINGLEFLOAT_TO_FLOAT/3',
+    'VEC_CAST_SINGLEFLOAT_TO_FLOAT/2',
     'VEC_CAST_FLOAT_TO_INT/2',
     'VEC_CAST_INT_TO_FLOAT/2',