[pypy-commit] pypy default: - add in the backend, for binary instructions, a memo function

Thu Nov 3 10:24:39 CET 2011

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r48677:eb27c44ca6ad
Date: 2011-11-03 08:21 +0100
http://bitbucket.org/pypy/pypy/changeset/eb27c44ca6ad/

Log:	- add in the backend, for binary instructions, a memo function
	that returns True if there is any NAME_xy that could match. If it
	returns False we know the whole subcase can be omitted from
	translated code. Without this hack, the size of most _binaryop
	INSN functions ends up quite large in C code.

	- found out that a lot of instructions have a missing case on 64
	bits, because INSN_m used to fall back to INSN_a if the constant
	offset doesn't fit in 32 bits --- but most instructions that have
	an 'm' form don't have an 'a' form. Fixed by generating an extra
	LEA and not falling back to the 'a' form.

	- location_code() is an indirect method call for no really good
	reason. Turn it into a monomorphic method that always read a
	_location_code attribute.

diff --git a/pypy/jit/backend/x86/regloc.py b/pypy/jit/backend/x86/regloc.py
--- a/pypy/jit/backend/x86/regloc.py
+++ b/pypy/jit/backend/x86/regloc.py
@@ -17,7 +17,7 @@
 
 class AssemblerLocation(object):
     # XXX: Is adding "width" here correct?
-    __slots__ = ('value', 'width')
+    _attrs_ = ('value', 'width', '_location_code')
     _immutable_ = True
     def _getregkey(self):
         return self.value
@@ -25,6 +25,9 @@
     def is_memory_reference(self):
         return self.location_code() in ('b', 's', 'j', 'a', 'm')
 
+    def location_code(self):
+        return self._location_code
+
     def value_r(self): return self.value
     def value_b(self): return self.value
     def value_s(self): return self.value
@@ -38,6 +41,8 @@
 
 class StackLoc(AssemblerLocation):
     _immutable_ = True
+    _location_code = 'b'
+
     def __init__(self, position, ebp_offset, num_words, type):
         assert ebp_offset < 0   # so no confusion with RegLoc.value
         self.position = position
@@ -49,9 +54,6 @@
     def __repr__(self):
         return '%d(%%ebp)' % (self.value,)
 
-    def location_code(self):
-        return 'b'
-
     def assembler(self):
         return repr(self)
 
@@ -63,8 +65,10 @@
         self.is_xmm = is_xmm
         if self.is_xmm:
             self.width = 8
+            self._location_code = 'x'
         else:
             self.width = WORD
+            self._location_code = 'r'
     def __repr__(self):
         if self.is_xmm:
             return rx86.R.xmmnames[self.value]
@@ -79,12 +83,6 @@
         assert not self.is_xmm
         return RegLoc(rx86.high_byte(self.value), False)
 
-    def location_code(self):
-        if self.is_xmm:
-            return 'x'
-        else:
-            return 'r'
-
     def assembler(self):
         return '%' + repr(self)
 
@@ -97,14 +95,13 @@
 class ImmedLoc(AssemblerLocation):
     _immutable_ = True
     width = WORD
+    _location_code = 'i'
+
     def __init__(self, value):
         from pypy.rpython.lltypesystem import rffi, lltype
         # force as a real int
         self.value = rffi.cast(lltype.Signed, value)
 
-    def location_code(self):
-        return 'i'
-
     def getint(self):
         return self.value
 
@@ -149,9 +146,6 @@
         info = getattr(self, attr, '?')
         return '<AddressLoc %r: %s>' % (self._location_code, info)
 
-    def location_code(self):
-        return self._location_code
-
     def value_a(self):
         return self.loc_a
 
@@ -191,6 +185,7 @@
     # we want a width of 8  (... I think.  Check this!)
     _immutable_ = True
     width = 8
+    _location_code = 'j'
 
     def __init__(self, address):
         self.value = address
@@ -198,9 +193,6 @@
     def __repr__(self):
         return '<ConstFloatLoc @%s>' % (self.value,)
 
-    def location_code(self):
-        return 'j'
-
 if IS_X86_32:
     class FloatImmedLoc(AssemblerLocation):
         # This stands for an immediate float.  It cannot be directly used in
@@ -209,6 +201,7 @@
         # instead; see below.
         _immutable_ = True
         width = 8
+        _location_code = '#'     # don't use me
 
         def __init__(self, floatstorage):
             self.aslonglong = floatstorage
@@ -229,9 +222,6 @@
             floatvalue = longlong.getrealfloat(self.aslonglong)
             return '<FloatImmedLoc(%s)>' % (floatvalue,)
 
-        def location_code(self):
-            raise NotImplementedError
-
 if IS_X86_64:
     def FloatImmedLoc(floatstorage):
         from pypy.rlib.longlong2float import float2longlong
@@ -270,6 +260,11 @@
     else:
         raise AssertionError(methname + " undefined")
 
+def _missing_binary_insn(name, code1, code2):
+    raise AssertionError(name + "_" + code1 + code2 + " missing")
+_missing_binary_insn._dont_inline_ = True
+
+
 class LocationCodeBuilder(object):
     _mixin_ = True
 
@@ -310,6 +305,23 @@
             _rx86_getattr(self, methname)(val1, val2)
         invoke._annspecialcase_ = 'specialize:arg(1)'
 
+        def has_implementation_for(loc1, loc2):
+            # A memo function that returns True if there is any NAME_xy that could match.
+            # If it returns False we know the whole subcase can be omitted from translated
+            # code.  Without this hack, the size of most _binaryop INSN functions ends up
+            # quite large in C code.
+            if loc1 == '?':
+                return any([has_implementation_for(loc1, loc2)
+                            for loc1 in unrolling_location_codes])
+            methname = name + "_" + loc1 + loc2
+            if not hasattr(rx86.AbstractX86CodeBuilder, methname):
+                return False
+            # any NAME_j should have a NAME_m as a fallback, too.  Check it
+            if loc1 == 'j': assert has_implementation_for('m', loc2), methname
+            if loc2 == 'j': assert has_implementation_for(loc1, 'm'), methname
+            return True
+        has_implementation_for._annspecialcase_ = 'specialize:memo'
+
         def INSN(self, loc1, loc2):
             code1 = loc1.location_code()
             code2 = loc2.location_code()
@@ -325,6 +337,8 @@
                 assert code2 not in ('j', 'i')
 
             for possible_code2 in unrolling_location_codes:
+                if not has_implementation_for('?', possible_code2):
+                    continue
                 if code2 == possible_code2:
                     val2 = getattr(loc2, "value_" + possible_code2)()
                     #
@@ -335,28 +349,32 @@
                     #
                     # Regular case
                     for possible_code1 in unrolling_location_codes:
+                        if not has_implementation_for(possible_code1,
+                                                      possible_code2):
+                            continue
                         if code1 == possible_code1:
                             val1 = getattr(loc1, "value_" + possible_code1)()
                             # More faking out of certain operations for x86_64
-                            if possible_code1 == 'j' and not rx86.fits_in_32bits(val1):
+                            fits32 = rx86.fits_in_32bits
+                            if possible_code1 == 'j' and not fits32(val1):
                                 val1 = self._addr_as_reg_offset(val1)
                                 invoke(self, "m" + possible_code2, val1, val2)
-                            elif possible_code2 == 'j' and not rx86.fits_in_32bits(val2):
+                                return
+                            if possible_code2 == 'j' and not fits32(val2):
                                 val2 = self._addr_as_reg_offset(val2)
                                 invoke(self, possible_code1 + "m", val1, val2)
-                            elif possible_code1 == 'm' and not rx86.fits_in_32bits(val1[1]):
+                                return
+                            if possible_code1 == 'm' and not fits32(val1[1]):
                                 val1 = self._fix_static_offset_64_m(val1)
-                                invoke(self, "a" + possible_code2, val1, val2)
-                            elif possible_code2 == 'm' and not rx86.fits_in_32bits(val2[1]):
+                            if possible_code2 == 'm' and not fits32(val2[1]):
                                 val2 = self._fix_static_offset_64_m(val2)
-                                invoke(self, possible_code1 + "a", val1, val2)
-                            else:
-                                if possible_code1 == 'a' and not rx86.fits_in_32bits(val1[3]):
-                                    val1 = self._fix_static_offset_64_a(val1)
-                                if possible_code2 == 'a' and not rx86.fits_in_32bits(val2[3]):
-                                    val2 = self._fix_static_offset_64_a(val2)
-                                invoke(self, possible_code1 + possible_code2, val1, val2)
+                            if possible_code1 == 'a' and not fits32(val1[3]):
+                                val1 = self._fix_static_offset_64_a(val1)
+                            if possible_code2 == 'a' and not fits32(val2[3]):
+                                val2 = self._fix_static_offset_64_a(val2)
+                            invoke(self, possible_code1 + possible_code2, val1, val2)
                             return
+            _missing_binary_insn(name, code1, code2)
 
         return func_with_new_name(INSN, "INSN_" + name)
 
@@ -431,12 +449,14 @@
     def _fix_static_offset_64_m(self, (basereg, static_offset)):
         # For cases where an AddressLoc has the location_code 'm', but
         # where the static offset does not fit in 32-bits.  We have to fall
-        # back to the X86_64_SCRATCH_REG.  Note that this returns a location
-        # encoded as mode 'a'.  These are all possibly rare cases; don't try
+        # back to the X86_64_SCRATCH_REG.  Returns a new location encoded
+        # as mode 'm' too.  These are all possibly rare cases; don't try
         # to reuse a past value of the scratch register at all.
         self._scratch_register_known = False
         self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
-        return (basereg, X86_64_SCRATCH_REG.value, 0, 0)
+        self.LEA_ra(X86_64_SCRATCH_REG.value,
+                    (basereg, X86_64_SCRATCH_REG.value, 0, 0))
+        return (X86_64_SCRATCH_REG.value, 0)
 
     def _fix_static_offset_64_a(self, (basereg, scalereg,
                                        scale, static_offset)):
diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -745,6 +745,7 @@
     assert insnname_template.count('*') == 1
     add_insn('x', register(2), '\xC0')
     add_insn('j', abs_, immediate(2))
+    add_insn('m', mem_reg_plus_const(2))
 
 define_pxmm_insn('PADDQ_x*',     '\xD4')
 define_pxmm_insn('PSUBQ_x*',     '\xFB')
diff --git a/pypy/jit/backend/x86/test/test_regloc.py b/pypy/jit/backend/x86/test/test_regloc.py
--- a/pypy/jit/backend/x86/test/test_regloc.py
+++ b/pypy/jit/backend/x86/test/test_regloc.py
@@ -146,8 +146,10 @@
         expected_instructions = (
                 # mov r11, 0xFEDCBA9876543210
                 '\x49\xBB\x10\x32\x54\x76\x98\xBA\xDC\xFE'
-                # mov rcx, [rdx+r11]
-                '\x4A\x8B\x0C\x1A'
+                # lea r11, [rdx+r11]
+                '\x4E\x8D\x1C\x1A'
+                # mov rcx, [r11]
+                '\x49\x8B\x0B'
         )
         assert cb.getvalue() == expected_instructions
 
@@ -217,8 +219,10 @@
         expected_instructions = (
                 # mov r11, 0xFEDCBA9876543210
                 '\x49\xBB\x10\x32\x54\x76\x98\xBA\xDC\xFE'
-                # mov [rdx+r11], -0x01234567
-                '\x4A\xC7\x04\x1A\x99\xBA\xDC\xFE'
+                # lea r11, [rdx+r11]
+                '\x4E\x8D\x1C\x1A'
+                # mov [r11], -0x01234567
+                '\x49\xC7\x03\x99\xBA\xDC\xFE'
         )
         assert cb.getvalue() == expected_instructions
 
@@ -300,8 +304,10 @@
                 '\x48\xBA\xEF\xCD\xAB\x89\x67\x45\x23\x01'
                 # mov r11, 0xFEDCBA9876543210
                 '\x49\xBB\x10\x32\x54\x76\x98\xBA\xDC\xFE'
-                # mov [rax+r11], rdx
-                '\x4A\x89\x14\x18'
+                # lea r11, [rax+r11]
+                '\x4E\x8D\x1C\x18'
+                # mov [r11], rdx
+                '\x49\x89\x13'
                 # pop rdx
                 '\x5A'
         )