[pypy-svn] pypy numpy-exp: Make a basic SSE operation on array work. It doesn't guarantee as of now

fijal commits-noreply at bitbucket.org
Sun Feb 13 16:16:11 CET 2011


Author: Maciej Fijalkowski <fijall at gmail.com>
Branch: numpy-exp
Changeset: r41881:19755a39b71e
Date: 2011-02-12 20:28 +0200
http://bitbucket.org/pypy/pypy/changeset/19755a39b71e/

Log:	Make a basic SSE operation on arrays work. As of now it does not
	guarantee that the arrays are aligned. Still have to think about how
	to do that, probably by operating on raw arrays with no length and
	calling special functions on POSIX. *VERY* experimental.

diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -656,10 +656,15 @@
 
 define_modrm_modes('MOVSD_x*', ['\xF2', rex_nw, '\x0F\x10', register(1,8)], regtype='XMM')
 define_modrm_modes('MOVSD_*x', ['\xF2', rex_nw, '\x0F\x11', register(2,8)], regtype='XMM')
+define_modrm_modes('MOVAPD_x*', ['\x66', rex_nw, '\x0F\x28', register(1,8)],  # XMM register is operand 1, as in MOVSD_x*
+                   regtype='XMM')
+define_modrm_modes('MOVAPD_*x', ['\x66', rex_nw, '\x0F\x29', register(2,8)],  # XMM register is operand 2, as in MOVSD_*x
+                   regtype='XMM')  # NOTE(review): MOVAPD needs a 16-byte aligned memory operand — confirm callers guarantee it
 
 define_modrm_modes('XCHG_r*', [rex_w, '\x87', register(1, 8)])
 
 define_modrm_modes('ADDSD_x*', ['\xF2', rex_nw, '\x0F\x58', register(1, 8)], regtype='XMM')
+define_modrm_modes('ADDPD_x*', ['\x66', rex_nw, '\x0F\x58', register(1, 8)], regtype='XMM')  # same 0F 58 opcode as ADDSD_x*, with the 66 prefix instead of F2
 define_modrm_modes('SUBSD_x*', ['\xF2', rex_nw, '\x0F\x5C', register(1, 8)], regtype='XMM')
 define_modrm_modes('MULSD_x*', ['\xF2', rex_nw, '\x0F\x59', register(1, 8)], regtype='XMM')
 define_modrm_modes('DIVSD_x*', ['\xF2', rex_nw, '\x0F\x5E', register(1, 8)], regtype='XMM')

diff --git a/pypy/jit/backend/x86/regloc.py b/pypy/jit/backend/x86/regloc.py
--- a/pypy/jit/backend/x86/regloc.py
+++ b/pypy/jit/backend/x86/regloc.py
@@ -500,7 +500,9 @@
     LEA = _binaryop('LEA')
 
     MOVSD = _binaryop('MOVSD')
+    MOVAPD = _binaryop('MOVAPD')
     ADDSD = _binaryop('ADDSD')
+    ADDPD = _binaryop('ADDPD')
     SUBSD = _binaryop('SUBSD')
     MULSD = _binaryop('MULSD')
     DIVSD = _binaryop('DIVSD')

diff --git a/pypy/jit/metainterp/pyjitpl.py b/pypy/jit/metainterp/pyjitpl.py
--- a/pypy/jit/metainterp/pyjitpl.py
+++ b/pypy/jit/metainterp/pyjitpl.py
@@ -394,6 +394,11 @@
     opimpl_getarrayitem_gc_r = _opimpl_getarrayitem_gc_any
     opimpl_getarrayitem_gc_f = _opimpl_getarrayitem_gc_any
 
+    @arguments("descr", "box", "box", "box", "box")  # one descr followed by four boxes, matching the parameters below
+    def opimpl_sse_float_add(self, arraydescr, array1, array2, arrayres, index):  # forwards to execute_with_descr as rop.SSE_FLOAT_ADD
+        return self.execute_with_descr(rop.SSE_FLOAT_ADD, arraydescr, array1,  # box order passed on: array1, array2, arrayres, index
+                                       array2, arrayres, index)
+
     @arguments("box", "descr", "box")
     def _opimpl_getarrayitem_raw_any(self, arraybox, arraydescr, indexbox):
         return self.execute_with_descr(rop.GETARRAYITEM_RAW,

diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -1053,6 +1053,33 @@
         self.Perform(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs),
                           sign_loc], result_loc)
 
+    def _read_elem_into_xmmreg(self, xmmreg, array, index, ofs_loc, arraydescr):  # asks the assembler to move an item of `array` into xmmreg
+        itemsize, ofs, _, _, _ = self._unpack_arraydescr(arraydescr)  # only the first two unpacked fields are used here
+        base_loc = self.rm.make_sure_var_in_reg(array, [index])  # presumably [index] keeps the index register from being reused — TODO confirm
+        self.assembler.move_from_array_to_xmm(base_loc, ofs_loc,  # ofs_loc is supplied by the caller, not computed here
+                                              imm(itemsize), imm(ofs),
+                                              xmmreg)
+
+    def consider_sse_float_add(self, op):  # args 0 and 1 are the inputs, arg 2 the result array, arg 3 the index
+        box1 = TempBox()
+        box2 = TempBox()
+        loc1 = self.xrm.force_allocate_reg(box1)
+        loc2 = self.xrm.force_allocate_reg(box2, [box1])
+        arraydescr = op.getdescr()
+        ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(3))
+        self._read_elem_into_xmmreg(loc1, op.getarg(0), op.getarg(3), ofs_loc,
+                                    arraydescr)
+        # always load loc2, even for identical inputs: it is an addend below
+        self._read_elem_into_xmmreg(loc2, op.getarg(1), op.getarg(3),
+                                    ofs_loc, arraydescr)
+        base_loc = self.rm.make_sure_var_in_reg(op.getarg(2), [op.getarg(3)])
+        itemsize, ofs, _, _, _ = self._unpack_arraydescr(arraydescr)
+        self.possibly_free_vars(op.getarglist())
+        self.xrm.possibly_free_var(box1)  # the temporaries are released before the operation is emitted
+        self.xrm.possibly_free_var(box2)
+        self.PerformDiscard(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs),  # discard form: the op has no result location
+                                 loc1, loc2])
+
     consider_getarrayitem_raw = consider_getarrayitem_gc
     consider_getarrayitem_gc_pure = consider_getarrayitem_gc
 

diff --git a/pypy/translator/c/src/float.h b/pypy/translator/c/src/float.h
--- a/pypy/translator/c/src/float.h
+++ b/pypy/translator/c/src/float.h
@@ -43,3 +43,7 @@
 #define OP_CAST_FLOAT_TO_LONGLONG(x,r) r = (long long)(x)
 #define OP_CAST_FLOAT_TO_ULONGLONG(x,r) r = (unsigned long long)(x)
 #endif
+
+/*** These operations expand to nothing in C because they only occur
+     on the we_are_jitted() path ***/
+#define OP_SSE_FLOAT_ADD(a, b, c, d, e)

diff --git a/pypy/jit/metainterp/executor.py b/pypy/jit/metainterp/executor.py
--- a/pypy/jit/metainterp/executor.py
+++ b/pypy/jit/metainterp/executor.py
@@ -122,6 +122,17 @@
     else:
         cpu.bh_setarrayitem_raw_i(arraydescr, array, index, itembox.getint())
 
+def do_sse_float_add(cpu, _, array1, array2, arrayres, indexbox, arraydescr):  # adds two consecutive float items: index and index + 1
+    onebox = do_getarrayitem_raw(cpu, _, array1, indexbox, arraydescr)
+    twobox = do_getarrayitem_raw(cpu, _, array2, indexbox, arraydescr)
+    res = onebox.getfloat() + twobox.getfloat()
+    do_setarrayitem_raw(cpu, _, arrayres, indexbox, BoxFloat(res), arraydescr)  # arrayres[index] = array1[index] + array2[index]
+    indexbox = BoxInt(indexbox.getint() + 1)  # rebinds the local name only; the caller's box is not modified
+    onebox = do_getarrayitem_raw(cpu, _, array1, indexbox, arraydescr)
+    twobox = do_getarrayitem_raw(cpu, _, array2, indexbox, arraydescr)
+    res = onebox.getfloat() + twobox.getfloat()
+    do_setarrayitem_raw(cpu, _, arrayres, indexbox, BoxFloat(res), arraydescr)  # same store for the item at index + 1; nothing is returned
+
 def do_getfield_gc(cpu, _, structbox, fielddescr):
     struct = structbox.getref_base()
     if fielddescr.is_pointer_field():

diff --git a/pypy/jit/metainterp/blackhole.py b/pypy/jit/metainterp/blackhole.py
--- a/pypy/jit/metainterp/blackhole.py
+++ b/pypy/jit/metainterp/blackhole.py
@@ -1036,6 +1036,15 @@
     def bhimpl_setarrayitem_raw_f(cpu, array, arraydescr, index, newvalue):
         cpu.bh_setarrayitem_raw_f(arraydescr, array, index, newvalue)
 
+    @arguments("cpu", "d", "i", "i", "i", "i")  # cpu, one descr, then four "i" arguments (three arrays and the index)
+    def bhimpl_sse_float_add(cpu, arraydescr, array1, array2, array_res, index):  # adds the float items at index and index + 1
+        one = cpu.bh_getarrayitem_raw_f(arraydescr, array1, index)
+        two = cpu.bh_getarrayitem_raw_f(arraydescr, array2, index)
+        cpu.bh_setarrayitem_raw_f(arraydescr, array_res, index, one + two)  # array_res[index] = array1[index] + array2[index]
+        one = cpu.bh_getarrayitem_raw_f(arraydescr, array1, index + 1)  # second item: same steps at index + 1
+        two = cpu.bh_getarrayitem_raw_f(arraydescr, array2, index + 1)
+        cpu.bh_setarrayitem_raw_f(arraydescr, array_res, index + 1, one + two)  # no value is returned
+
     # note, there is no 'r' here, since it can't happen
 
     @arguments("cpu", "r", "d", returns="i")

diff --git a/pypy/rpython/lltypesystem/lloperation.py b/pypy/rpython/lltypesystem/lloperation.py
--- a/pypy/rpython/lltypesystem/lloperation.py
+++ b/pypy/rpython/lltypesystem/lloperation.py
@@ -443,7 +443,7 @@
 
     # __________ vectorization ops _______
 
-    'sse_float_add': LLOp(sideeffects=False, canrun=True),
+    'sse_float_add': LLOp(canrun=True),
 
     # __________ GC operations __________
 

diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -526,6 +526,14 @@
         return self._do_builtin_call(op, 'raw_free', [op.args[0]],
                                      extra = (ARRAY,), extrakey = ARRAY)
 
+    def rewrite_op_sse_float_add(self, op):  # rebuilds the op with an array descr prepended to its arguments
+        ARRAY = op.args[0].concretetype.TO  # the array type is taken from the first argument only
+        arraydescr = self.cpu.arraydescrof(ARRAY)
+        kind = getkind(op.result.concretetype)
+        assert kind == 'void'  # the operation must not produce a value
+        return SpaceOperation('sse_float_add',
+                              [arraydescr] + op.args, op.result)  # original args and result are kept unchanged
+
     def rewrite_op_getarrayitem(self, op):
         ARRAY = op.args[0].concretetype.TO
         if self._array_of_voids(ARRAY):

diff --git a/pypy/jit/backend/llgraph/llimpl.py b/pypy/jit/backend/llgraph/llimpl.py
--- a/pypy/jit/backend/llgraph/llimpl.py
+++ b/pypy/jit/backend/llgraph/llimpl.py
@@ -161,6 +161,7 @@
     'force_token'     : ((), 'int'),
     'call_may_force'  : (('int', 'varargs'), 'intorptr'),
     'guard_not_forced': ((), None),
+    'sse_float_add'   : (('int', 'int', 'int', 'int'), None),
 }
 
 # ____________________________________________________________
@@ -735,6 +736,12 @@
 
     op_getarrayitem_raw_pure = op_getarrayitem_raw
 
+    def op_sse_float_add(self, arraydescr, array1, array2, arrayres,  # delegates to the blackhole implementation of the same operation
+                         index):
+        from pypy.jit.metainterp.blackhole import BlackholeInterpreter  # imported inside the method, not at module level
+        return BlackholeInterpreter.bhimpl_sse_float_add.im_func(self.cpu,  # im_func: the plain function, called with self.cpu as its cpu argument
+               arraydescr, array1, array2, arrayres, index)
+
     def op_getfield_gc(self, fielddescr, struct):
         if fielddescr.typeinfo == REF:
             return do_getfield_gc_ptr(struct, fielddescr.ofs)

diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -740,6 +740,14 @@
         else:
             self.mc.MOV(to_loc, from_loc)
 
+    def move_from_array_to_xmm(self, base_loc, ofs_loc, itemsize, ofs, xmmreg):  # emits one MOVAPD from an array address into xmmreg
+        assert isinstance(xmmreg, RegLoc)
+        assert isinstance(ofs, ImmedLoc)  # ofs and itemsize must be immediates: their .value is read below
+        assert isinstance(itemsize, ImmedLoc)
+        scale = _get_scale(itemsize.value)
+        src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)  # address built from base, index location, constant offset and scale
+        self.mc.MOVAPD(xmmreg, src_addr)  # NOTE(review): MOVAPD needs a 16-byte aligned address; nothing here checks it
+
     regalloc_mov = mov # legacy interface
 
     def regalloc_push(self, loc):
@@ -1121,6 +1129,17 @@
         self.mc.XOR_rr(edx.value, edx.value)
         self.mc.DIV_r(ecx.value)
 
+    def genop_discard_sse_float_add(self, op, arglocs):  # emits ADDPD on the two XMM locations, then stores the first into the array
+        base_loc, ofs_loc, itemsize, ofs, loc1, loc2 = arglocs  # exactly six locations are expected, in this order
+        assert isinstance(loc1, RegLoc)
+        assert isinstance(loc2, RegLoc)
+        assert isinstance(ofs, ImmedLoc)  # ofs and itemsize must be immediates: their .value is read below
+        assert isinstance(itemsize, ImmedLoc)
+        scale = _get_scale(itemsize.value)
+        dest_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
+        self.mc.ADDPD(loc1, loc2)  # both registers must already hold their operands; the sum ends up in loc1
+        self.mc.MOVAPD(dest_addr, loc1)  # NOTE(review): MOVAPD needs a 16-byte aligned address; nothing here checks it
+
     genop_llong_add = _binaryop("PADDQ", True)
     genop_llong_sub = _binaryop("PSUBQ")
     genop_llong_and = _binaryop("PAND",  True)

diff --git a/pypy/jit/metainterp/resoperation.py b/pypy/jit/metainterp/resoperation.py
--- a/pypy/jit/metainterp/resoperation.py
+++ b/pypy/jit/metainterp/resoperation.py
@@ -466,6 +466,7 @@
     'SETARRAYITEM_RAW/3d',
     'SETFIELD_GC/2d',
     'SETFIELD_RAW/2d',
+    'SSE_FLOAT_ADD/4d',
     'STRSETITEM/3',
     'UNICODESETITEM/3',
     #'RUNTIMENEW/1',     # ootype operation    


More information about the Pypy-commit mailing list