[pypy-commit] pypy vecopt: added integer/float types to zjit test (int 8, 16, 32, 64, float 32, 64)

Tue May 12 16:21:48 CEST 2015

Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r77298:40afd88ea5d8
Date: 2015-05-12 16:21 +0200
http://bitbucket.org/pypy/pypy/changeset/40afd88ea5d8/

Log:	Added integer/float types to the zjit test (int 8, 16, 32, 64;
	float 32, 64). Extended test_add to use types other than
	int64/float64. Added a vector cast operation (float -> single float).
	Added an IR test case to check that type size differences are handled
	correctly (when shrinking). Extended the transformation of a pack to
	a vector operation: if an operation (like casting) shrinks the size
	of vector elements, pack instructions copy them in place.

diff --git a/pypy/module/micronumpy/compile.py b/pypy/module/micronumpy/compile.py
--- a/pypy/module/micronumpy/compile.py
+++ b/pypy/module/micronumpy/compile.py
@@ -632,8 +632,16 @@
     def execute(self, interp):
         if self.v == 'int':
             dtype = get_dtype_cache(interp.space).w_int64dtype
+        elif self.v == 'int8':
+            dtype = get_dtype_cache(interp.space).w_int8dtype
+        elif self.v == 'int16':
+            dtype = get_dtype_cache(interp.space).w_int16dtype
+        elif self.v == 'int32':
+            dtype = get_dtype_cache(interp.space).w_int32dtype
         elif self.v == 'float':
             dtype = get_dtype_cache(interp.space).w_float64dtype
+        elif self.v == 'float32':
+            dtype = get_dtype_cache(interp.space).w_float32dtype
         else:
             raise BadToken('unknown v to dtype "%s"' % self.v)
         return dtype
@@ -864,8 +872,20 @@
                     stack.append(ArrayClass())
                 elif token.v.strip(' ') == 'int':
                     stack.append(DtypeClass('int'))
+                elif token.v.strip(' ') == 'int8':
+                    stack.append(DtypeClass('int8'))
+                elif token.v.strip(' ') == 'int16':
+                    stack.append(DtypeClass('int16'))
+                elif token.v.strip(' ') == 'int32':
+                    stack.append(DtypeClass('int32'))
+                elif token.v.strip(' ') == 'int64':
+                    stack.append(DtypeClass('int'))
                 elif token.v.strip(' ') == 'float':
                     stack.append(DtypeClass('float'))
+                elif token.v.strip(' ') == 'float32':
+                    stack.append(DtypeClass('float32'))
+                elif token.v.strip(' ') == 'float64':
+                    stack.append(DtypeClass('float'))
                 else:
                     stack.append(Variable(token.v.strip(' ')))
             elif token.name == 'array_left':
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -80,26 +80,44 @@
         retval = self.interp.eval_graph(self.graph, [i])
         return retval
 
-    def define_add():
+    def define_add_float():
         return """
         a = |30|
         b = a + a
         b -> 3
         """
 
-    def test_add(self):
-        result = self.run("add")
+    def define_add_float32():
+        return """
+        a = astype(|30|, float32)
+        b = a + a
+        b -> 3
+        """
+
+    def test_add_float(self):
+        result = self.run("add_float")
         assert result == 3 + 3
+        result = self.run("add_float32")
+        assert result == 3.0 + 3.0
 
-    def define_add_const():
+    def define_add_float32_const():
         return """
-        a = |30| + 3
+        a = astype(|30|, float32) + 3.0
         a -> 29
         """
 
-    def test_add_const(self):
-        result = self.run("add_const")
-        assert result == 29 + 3
+    def define_add_float_const():
+        return """
+        a = astype(|30|, float32) + 3.0
+        a -> 29
+        """
+
+    def test_add_float_const(self):
+        result = self.run("add_float_const")
+        assert result == 29.0 + 3.0
+        self.check_trace_count(1)
+        result = self.run("add_float32_const")
+        assert result == 29.0 + 3.0
         self.check_trace_count(1)
 
     def define_pow():
diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -692,6 +692,9 @@
     bh_vec_float_eq.argtypes = ['f','f','i']
     bh_vec_float_eq.resulttype = 'i'
 
+    def bh_vec_cast_float_to_singlefloat(self, vx):
+        return vx
+
     def bh_vec_box(self, size):
         return [0] * size
 
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -2549,6 +2549,13 @@
         exec py.code.Source(_source).compile()
     del genop_vec_float_arith
 
+    def genop_vec_expand(self, op, arglocs, resloc):
+        loc0, sizeloc = arglocs
+        size = sizeloc.value
+        if size == 2:
+            pass
+
+
     def genop_vec_box_unpack(self, op, arglocs, resloc):
         loc0, indexloc, sizeloc = arglocs
         size = sizeloc.value
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1535,16 +1535,6 @@
     consider_vec_float_eq = consider_vec_logic
     del consider_vec_logic
 
-    def consider_vec_int_signext(self, op):
-        # there is not much we can do in this case. arithmetic is
-        # done on the vector register, if there is a wrap around,
-        # it is lost, because the register does not have enough bits
-        # to save it.
-        #argloc = self.loc(op.getarg(0))
-        self.xrm.force_result_in_reg(op.result, op.getarg(0))
-        #if op.getarg(1).value != op.getarg(2).value:
-        #    raise NotImplementedError("signext not implemented")
-
     def consider_vec_box_pack(self, op):
         count = op.getarg(3)
         index = op.getarg(2)
@@ -1574,6 +1564,24 @@
         result = self.force_allocate_reg(op.result, args)
         self.perform(op, [loc0, imm(count.value)], result)
 
+    def consider_vec_cast_float_to_singlefloat(self, op):
+        size = op.getarg(1)
+        args = op.getarglist()
+        loc0 = self.make_sure_var_in_reg(op.getarg(0), args)
+        result = self.force_allocate_reg(op.result, args)
+        self.perform(op, [loc0, imm(size.value)], result)
+
+    def consider_vec_int_signext(self, op):
+        # there is not much we can do in this case. arithmetic is
+        # done on the vector register, if there is a wrap around,
+        # it is lost, because the register does not have enough bits
+        # to save it.
+        #argloc = self.loc(op.getarg(0))
+        self.xrm.force_result_in_reg(op.result, op.getarg(0))
+        #if op.getarg(1).value != op.getarg(2).value:
+        #    raise NotImplementedError("signext not implemented")
+
+
     def consider_vec_box(self, op):
         # pseudo instruction, needed to create a new variable
         pass
diff --git a/rpython/jit/metainterp/executor.py b/rpython/jit/metainterp/executor.py
--- a/rpython/jit/metainterp/executor.py
+++ b/rpython/jit/metainterp/executor.py
@@ -342,16 +342,11 @@
                          rop.LABEL,
                          rop.VEC_RAW_LOAD,
                          rop.VEC_RAW_STORE,
-                         rop.VEC_BOX_PACK,
-                         rop.VEC_BOX_UNPACK,
-                         rop.VEC_EXPAND,
-                         rop.VEC_BOX,
                          rop.VEC_GETARRAYITEM_RAW,
                          rop.VEC_SETARRAYITEM_RAW,
                          ):      # list of opcodes never executed by pyjitpl
                 continue
-             # trace will generate such an op
-            if rop._VEC_ARITHMETIC_FIRST <= value <= rop._VEC_ARITHMETIC_LAST:
+            if rop._VEC_PURE_FIRST <= value <= rop._VEC_PURE_LAST:
                 continue
 
             raise AssertionError("missing %r" % (key,))
diff --git a/rpython/jit/metainterp/history.py b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -512,14 +512,52 @@
 
 # ____________________________________________________________
 
-class BoxVector(Box):
+class PrimitiveTypeMixin(object):
+    def gettype(self):
+        raise NotImplementedError
+    def getsize(self):
+        raise NotImplementedError
+    def getsigned(self):
+        raise NotImplementedError
+
+    def matches_type(self, other):
+        assert isinstance(other, PrimitiveTypeMixin)
+        return self.gettype() == other.gettype()
+
+    def matches_size(self, other):
+        assert isinstance(other, PrimitiveTypeMixin)
+        return self.getsize() == other.getsize()
+
+    def matches_sign(self, other):
+        assert isinstance(other, PrimitiveTypeMixin)
+        return self.getsigend() == other.signed()
+
+    def matches(self, other):
+        if isinstance(other, PrimitiveTypeMixin):
+            return self.matches_type(other) and \
+                   self.matches_size(other) and \
+                   self.matches_sign(other)
+        return False
+
+
+
+class BoxVector(Box, PrimitiveTypeMixin):
     type = VECTOR
-    _attrs_ = ('item_type','item_count')
+    _attrs_ = ('item_type','item_count','item_size','signed')
     _extended_display = False
 
-    def __init__(self, item_type=FLOAT, item_count=2):
+    def __init__(self, item_type=FLOAT, item_count=2, item_size=8, signed=True):
         self.item_type = item_type
         self.item_count = item_count
+        self.item_size = item_size
+        self.signed = signed
+
+    def gettype(self):
+        return self.item_type
+    def getsize(self):
+        return self.item_size
+    def getsigned(self):
+        return self.signed
 
     def forget_value(self):
         raise NotImplementedError("cannot forget value of vector")
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1109,6 +1109,41 @@
         except NotAVectorizeableLoop:
             pass
 
+    def test_shrink_vector_size(self):
+        ops = """
+        [p0,p1,i1]
+        guard_early_exit() []
+        f1 = getarrayitem_raw(p0, i1, descr=floatarraydescr)
+        i2 = cast_float_to_singlefloat(f1)
+        setarrayitem_raw(p1, i1, i2, descr=singlefloatarraydescr)
+        i3 = int_add(i1, 1)
+        i4 = int_ge(i3, 36)
+        guard_false(i4) []
+        jump(p0, p1, i3)
+        """
+        opt = """
+        [p0, p1, i1]
+        guard_early_exit() []
+        i3 = int_add(i1, 1)
+        i4 = int_ge(i3, 36)
+        i5 = int_add(i1, 2)
+        i8 = int_ge(i5, 36)
+        i6 = int_add(i1, 3)
+        i11 = int_ge(i6, 36)
+        i7 = int_add(i1, 4)
+        i14 = int_ge(i7, 36)
+        guard_false(i14) []
+        v17 = vec_getarrayitem_raw(p0, i1, 2, descr=floatarraydescr)
+        v18 = vec_getarrayitem_raw(p0, i5, 2, descr=floatarraydescr)
+        v19 = vec_cast_float_to_singlefloat(v17, 2)
+        v20 = vec_cast_float_to_singlefloat(v18, 2)
+        v21 = vec_box(4)
+        vec_box_pack(v21, v20, 2)
+        vec_setarrayitem_raw(p1, i1, v21, 4, descr=singlefloatarraydescr)
+        jump(p0, p1, i7)
+        """
+        vopt = self.vectorize(self.parse_loop(ops))
+        self.assert_equal(vopt.loop, self.parse_loop(opt))
 
 
 class TestLLtype(BaseTestVectorize, LLtypeMixin):
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -5,7 +5,7 @@
 from rpython.jit.metainterp.optimizeopt.unroll import optimize_unroll
 from rpython.jit.metainterp.compile import ResumeAtLoopHeaderDescr
 from rpython.jit.metainterp.history import (ConstInt, VECTOR, FLOAT, INT,
-        BoxVector, TargetToken, JitCellToken, Box)
+        BoxVector, TargetToken, JitCellToken, Box, PrimitiveTypeMixin)
 from rpython.jit.metainterp.optimizeopt.optimizer import Optimizer, Optimization
 from rpython.jit.metainterp.optimizeopt.util import make_dispatcher_method
 from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph, 
@@ -300,11 +300,9 @@
                 if node_a.is_before(node_b):
                     if memref_a.is_adjacent_to(memref_b):
                         if self.packset.can_be_packed(node_a, node_b):
-                            self.packset.add_pair(node_a, node_b)
-                    #if memref_a.is_adjacent_with_runtime_check(memref_b, graph):
-                    #    if self.packset.can_be_packed(node_a, node_b):
-                    #        self.check_adjacent_at_runtime(memref_a, memref_b)
-                    #        self.packset.add_pair(node_a, node_b)
+                            pair = Pair(node_a,node_b)
+                            pair.ptype = PackType.by_descr(node_a.getoperation().getdescr())
+                            self.packset.packs.append(pair)
 
     def extend_packset(self):
         pack_count = self.packset.pack_count()
@@ -346,7 +344,7 @@
         if savings >= 0:
             assert candidate[0] is not None
             assert candidate[1] is not None
-            self.packset.add_pair(*candidate)
+            self.packset.add_pair(candidate[0], candidate[1])
 
     def combine_packset(self):
         if len(self.packset.packs) == 0:
@@ -373,11 +371,12 @@
                 i += 1
             if len_before == len(self.packset.packs):
                 break
+        print self.packset.packs
 
     def schedule(self):
         self.guard_early_exit = -1
         self.clear_newoperations()
-        sched_data = VecScheduleData()
+        sched_data = VecScheduleData(self.metainterp_sd.cpu.vector_register_size)
         scheduler = Scheduler(self.dependency_graph, sched_data)
         while scheduler.has_more():
             position = len(self._newoperations)
@@ -553,12 +552,50 @@
     # this might be an indicator for edge removal
     return True
 
+class PackArgs(object):
+    def __init__(self, arg_pos, result=True):
+        self.mask = 0
+        for p in arg_pos:
+            self.mask |= (1<<(p+1))
+        if result:
+            self.mask |= 1
+
+    def arg_is_set(self, i):
+        return bool((1<<(i+1)) & self.mask)
+
+    def result_is_set(self):
+        return bool(1 & self.mask)
+
+
+ROP_ARG_RES_VECTOR = {
+    rop.VEC_INT_ADD:     PackArgs((0,1)),
+    rop.VEC_INT_SUB:     PackArgs((0,1)),
+    rop.VEC_INT_MUL:     PackArgs((0,1)),
+    rop.VEC_INT_SIGNEXT: PackArgs((0,)),
+
+    rop.VEC_FLOAT_ADD:   PackArgs((0,1)),
+    rop.VEC_FLOAT_SUB:   PackArgs((0,1)),
+    rop.VEC_FLOAT_MUL:   PackArgs((0,1)),
+    rop.VEC_FLOAT_EQ:    PackArgs((0,1)),
+
+    rop.VEC_RAW_LOAD:         PackArgs(()),
+    rop.VEC_GETARRAYITEM_RAW: PackArgs(()),
+    rop.VEC_RAW_STORE:        PackArgs((2,), result=False),
+    rop.VEC_SETARRAYITEM_RAW: PackArgs((2,), result=False),
+
+    rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: PackArgs((0,)),
+}
+
+
 class VecScheduleData(SchedulerData):
-    def __init__(self):
+    def __init__(self, vec_reg_size):
         self.box_to_vbox = {}
         self.unpack_rename_map = {}
         self.preamble_ops = None
         self.expansion_byte_count = -1
+        self.vec_reg_size = vec_reg_size
+        self.pack_ops = -1
+        self.pack_off = -1
 
     def unpack_rename(self, arg):
         return self.unpack_rename_map.get(arg, arg)
@@ -572,15 +609,57 @@
         self.pack = pack
         # properties that hold for the pack are:
         # isomorphism (see func above)
-        op0 = pack.operations[0].getoperation()
+
+        if pack.ptype is None:
+            self.propagete_ptype()
+
+        self.preamble_ops = []
+        if pack.is_overloaded(self.vec_reg_size):
+            self.preamble_ops = []
+            stride = pack.size_in_bytes() // self.vec_reg_size
+            for i in range(0, op_count, stride):
+                self.pack_off = i
+                self.pack_ops = stride
+                self._as_vector_op()
+            return self.preamble_ops
+        else:
+            self.pack_off = 0
+            self.pack_ops = op_count
+            self._as_vector_op()
+            return self.preamble_ops
+
+    def _as_vector_op(self):
+        op0 = self.pack.operations[self.pack_off].getoperation()
         assert op0.vector != -1
         args = op0.getarglist()[:]
-        args.append(ConstInt(op_count))
+        args.append(ConstInt(self.pack_ops))
         vop = ResOperation(op0.vector, args, op0.result, op0.getdescr())
-        self.preamble_ops = []
-        self._inspect_operation(vop)
+
+        packargs = ROP_ARG_RES_VECTOR.get(op0.vector, None)
+        if packargs is None:
+            raise NotImplementedError("vecop map entry missing. trans: pack -> vop")
+
+        for i,arg in enumerate(args):
+            if packargs.arg_is_set(i):
+                self.vector_arg(vop, i, True)
+        if packargs.result_is_set():
+            self.vector_result(vop)
+
         self.preamble_ops.append(vop)
-        return self.preamble_ops
+
+    def propagete_ptype(self):
+        op0 = self.pack.operations[self.pack_off].getoperation()
+        packargs = ROP_ARG_RES_VECTOR.get(op0.vector, None)
+        if packargs is None:
+            raise NotImplementedError("vecop map entry missing. trans: pack -> vop")
+        args = op0.getarglist()[:]
+        ptype = PackType(PackType.UNKNOWN_TYPE, 0, True)
+        for i,arg in enumerate(args):
+            if packargs.arg_is_set(i):
+                vbox = self.get_vbox_for(arg)
+                ptype.record_vbox(vbox)
+        self.pack.ptype = ptype
+
 
     def get_vbox_for(self, arg):
         try:
@@ -589,18 +668,21 @@
         except KeyError:
             return None
 
-    def vector_result(self, vop, type):
+    def vector_result(self, vop):
         ops = self.pack.operations
         result = vop.result
-        vbox = BoxVector(type, len(ops))
-        vop.result = vbox
-        i = 0
-        while i < len(ops):
+        vop.result = vbox = self.box_vector(self.pack.ptype)
+        i = self.pack_off
+        end = i + self.pack_ops
+        while i < end:
             op = ops[i].getoperation()
             self.box_to_vbox[op.result] = (i, vbox)
             i += 1
 
-    def vector_arg(self, vop, argidx, expand=True):
+    def box_vector(self, ptype):
+        return BoxVector(ptype.type, self.pack_ops, ptype.size, ptype.signed)
+
+    def vector_arg(self, vop, argidx, expand):
         ops = self.pack.operations
         vbox = self.get_vbox_for(vop.getarg(argidx))
         if not vbox:
@@ -609,26 +691,50 @@
             else:
                 assert False, "not allowed to expand" \
                               ", but do not have a vector box as arg"
+        # vbox is a primitive type mixin
+        if self.pack.ptype.getsize() < vbox.getsize():
+            packable = self.vec_reg_size // self.pack.ptype.getsize()
+            packed = vbox.item_count
+            vbox = self.pack_arguments(packed, [op.getoperation().getarg(argidx) for op in ops])
         vop.setarg(argidx, vbox)
         return vbox
 
+    def pack_arguments(self, index, args):
+        i = index
+        vbox = self.box_vector(self.pack.ptype)
+        op = ResOperation(rop.VEC_BOX, [ConstInt(len(args))], vbox)
+        self.preamble_ops.append(op)
+        arg_count = len(args)
+        while i < arg_count:
+            arg = args[i]
+            vbox2 = self.get_vbox_for(arg)
+            if vbox2 is None:
+                raise NotImplementedError
+            op = ResOperation(rop.VEC_BOX_PACK, [vbox, vbox2, ConstInt(i)], None)
+            self.preamble_ops.append(op)
+            i += vbox.item_count
+        return vbox
+
     def expand_box_to_vector_box(self, vop, argidx):
         arg = vop.getarg(argidx)
         all_same_box = True
         ops = self.pack.operations
-        for i in range(len(ops)):
+        i = self.pack_off
+        end = i + self.pack_ops
+        while i < end:
             op = ops[i]
             if arg is not op.getoperation().getarg(argidx):
                 all_same_box = False
                 break
+            i += 1
 
-        vbox = BoxVector(arg.type, len(ops))
-        print "creating vectorbox", vbox, "of type",arg.type
+        vbox = BoxVector(arg.type, self.pack_ops)
+        print "creating vectorbox", vbox, "of type", arg.type
         if all_same_box:
-            expand_op = ResOperation(rop.VEC_EXPAND, [arg, ConstInt(len(ops))], vbox)
+            expand_op = ResOperation(rop.VEC_EXPAND, [arg, ConstInt(self.pack_ops)], vbox)
             self.preamble_ops.append(expand_op)
         else:
-            resop = ResOperation(rop.VEC_BOX, [ConstInt(len(ops))], vbox)
+            resop = ResOperation(rop.VEC_BOX, [ConstInt(self.pack_ops)], vbox)
             self.preamble_ops.append(resop)
             for i,op in enumerate(ops):
                 arg = op.getoperation().getarg(argidx)
@@ -637,43 +743,6 @@
                 self.preamble_ops.append(resop)
         return vbox
 
-    bin_arith_trans = """
-    def _vectorize_{name}(self, vop):
-        self.vector_arg(vop, 0)
-        self.vector_arg(vop, 1)
-        self.vector_result(vop, vop.result.type)
-    """
-    for name in ['VEC_FLOAT_SUB','VEC_FLOAT_MUL','VEC_FLOAT_ADD',
-                 'VEC_INT_ADD','VEC_INT_MUL', 'VEC_INT_SUB',
-                ]:
-        exec py.code.Source(bin_arith_trans.format(name=name)).compile()
-    del bin_arith_trans
-
-    def _vectorize_VEC_FLOAT_EQ(self, vop):
-        self.vector_arg(vop, 0)
-        self.vector_arg(vop, 1)
-        self.vector_result(vop, INT)
-
-    def _vectorize_VEC_INT_SIGNEXT(self, vop):
-        self.vector_arg(vop, 0)
-        # arg 1 is a constant
-        self.vector_result(vop, vop.result.type)
-
-    def _vectorize_VEC_RAW_LOAD(self, vop):
-        descr = vop.getdescr()
-        self.vector_result(vop, vop.result.type)
-    def _vectorize_VEC_GETARRAYITEM_RAW(self, vop):
-        descr = vop.getdescr()
-        self.vector_result(vop, vop.result.type)
-
-    def _vectorize_VEC_RAW_STORE(self, vop):
-        self.vector_arg(vop, 2)
-    def _vectorize_VEC_SETARRAYITEM_RAW(self, vop):
-        self.vector_arg(vop, 2)
-
-VecScheduleData._inspect_operation = \
-        make_dispatcher_method(VecScheduleData, '_vectorize_')
-
 def isomorphic(l_op, r_op):
     """ Same instructions have the same operation name.
     TODO what about parameters?
@@ -682,6 +751,45 @@
         return True
     return False
 
+class PackType(PrimitiveTypeMixin):
+    UNKNOWN_TYPE = '-'
+
+    def __init__(self, type, size, signed):
+        self.type = type
+        self.size = size
+        self.signed = signed
+
+    def gettype(self):
+        return self.type
+
+    def getsize(self):
+        return self.size
+
+    def getsigned(self):
+        return self.signed
+
+    def get_byte_size(self):
+        return self.size
+
+    @staticmethod
+    def by_descr(descr):
+        _t = INT
+        if descr.is_array_of_floats():
+            _t = FLOAT
+        pt = PackType(_t, descr.get_item_size_in_bytes(), descr.is_item_signed())
+        return pt
+
+    def record_vbox(self, vbox):
+        if self.type == PackType.UNKNOWN_TYPE:
+            self.type = vbox.type
+            self.signed = vbox.signed
+        if vbox.item_size > self.size:
+            self.size = vbox.item_size
+
+    def __repr__(self):
+        return 'PackType(%s, %s, %s)' % (self.type, self.size, self.signed)
+
+
 class PackSet(object):
 
     def __init__(self, dependency_graph, operations, unroll_count,
@@ -696,9 +804,8 @@
         return len(self.packs)
 
     def add_pair(self, l, r):
-        if l.op.is_guard():
-            assert False
-        self.packs.append(Pair(l,r))
+        p = Pair(l,r)
+        self.packs.append(p)
 
     def can_be_packed(self, lnode, rnode):
         if isomorphic(lnode.getoperation(), rnode.getoperation()):
@@ -755,8 +862,8 @@
         operations = pack_i.operations
         for op in pack_j.operations[1:]:
             operations.append(op)
-        self.packs[i] = Pack(operations)
-
+        self.packs[i] = pack = Pack(operations)
+        pack.ptype = pack_i.ptype
 
         # instead of deleting an item in the center of pack array,
         # the last element is assigned to position j and
@@ -784,6 +891,7 @@
     def __init__(self, ops):
         self.operations = ops
         self.savings = 0
+        self.ptype = None
         for node in self.operations:
             node.pack = self
 
@@ -797,6 +905,13 @@
         leftmost = other.operations[0]
         return rightmost == leftmost
 
+    def size_in_bytes(self):
+        return self.ptype.get_byte_size() * len(self.operations)
+
+    def is_overloaded(self, vec_reg_byte_size):
+        size = self.size_in_bytes()
+        return size > vec_reg_byte_size
+
     def __repr__(self):
         return "Pack(%r)" % self.operations
 
diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -451,6 +451,7 @@
     'CONVERT_LONGLONG_BYTES_TO_FLOAT/1',
     #
     # vector operations
+    '_VEC_PURE_FIRST',
     '_VEC_ARITHMETIC_FIRST',
     'VEC_INT_ADD/3',
     'VEC_INT_SUB/3',
@@ -459,13 +460,17 @@
     'VEC_FLOAT_SUB/3',
     'VEC_FLOAT_MUL/3',
     'VEC_FLOAT_DIV/3',
+    '_VEC_ARITHMETIC_LAST',
     'VEC_FLOAT_EQ/3',
+
     'VEC_INT_SIGNEXT/3',
-    '_VEC_ARITHMETIC_LAST',
+    'VEC_CAST_FLOAT_TO_SINGLEFLOAT/2',
+
     'VEC_BOX_UNPACK/3',          # iX|fX = VEC_BOX_UNPACK(vX, index, item_count)
     'VEC_BOX_PACK/4',            # VEC_BOX_PACK(vX, var/const, index, item_count)
     'VEC_EXPAND/2',              # vX = VEC_EXPAND(var/const, item_count)
     'VEC_BOX/1',
+    '_VEC_PURE_LAST',
     #
     'INT_LT/2b',
     'INT_LE/2b',
@@ -716,7 +721,6 @@
 _opvector = {
     rop.RAW_LOAD:         rop.VEC_RAW_LOAD,
     rop.GETARRAYITEM_RAW: rop.VEC_GETARRAYITEM_RAW,
-
     rop.RAW_STORE:        rop.VEC_RAW_STORE,
     rop.SETARRAYITEM_RAW: rop.VEC_SETARRAYITEM_RAW,
 
@@ -730,6 +734,7 @@
     rop.FLOAT_EQ:  rop.VEC_FLOAT_EQ,
 
     rop.INT_SIGNEXT: rop.VEC_INT_SIGNEXT,
+    rop.CAST_FLOAT_TO_SINGLEFLOAT: rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT,
 }
 
 def setup2():