[pypy-commit] pypy s390x-backend: reading level=2 cache for estimation size of nursery

Tue Feb 2 11:18:36 EST 2016

Author: Richard Plangger <planrichi at gmail.com>
Branch: s390x-backend
Changeset: r82042:037ac225f6c1
Date: 2016-02-02 17:17 +0100
http://bitbucket.org/pypy/pypy/changeset/037ac225f6c1/

Log:	reading level=2 cache for estimation size of nursery

diff --git a/rpython/jit/backend/ppc/callbuilder.py b/rpython/jit/backend/ppc/callbuilder.py
--- a/rpython/jit/backend/ppc/callbuilder.py
+++ b/rpython/jit/backend/ppc/callbuilder.py
@@ -98,7 +98,7 @@
 
         # We must also copy fnloc into FNREG
         non_float_locs.append(self.fnloc)
-        non_float_regs.append(self.mc.RAW_CALL_REG)     # r2 or r12
+        non_float_regs.append(self.mc.RAW_CALL_REG)
 
         if float_locs:
             assert len(float_locs) <= len(self.FPR_ARGS)
diff --git a/rpython/jit/backend/zarch/callbuilder.py b/rpython/jit/backend/zarch/callbuilder.py
--- a/rpython/jit/backend/zarch/callbuilder.py
+++ b/rpython/jit/backend/zarch/callbuilder.py
@@ -62,7 +62,6 @@
         # called function will in turn call further functions (which must be passed the
         # address of the new frame). This stack grows downwards from high addresses
         # """
-        self.subtracted_to_sp = 0
 
         gpr_regs = 0
         fpr_regs = 0
@@ -88,11 +87,6 @@
         if self.is_call_release_gil:
             self.subtracted_to_sp += 8*WORD
             base += 8*WORD
-        # one additional word for remap frame layout
-        # regalloc_push will overwrite -8(r.SP) and destroy
-        # a parameter if we would not reserve that space
-        # base += WORD
-        # TODO self.subtracted_to_sp += WORD
         for idx,i in enumerate(stack_params):
             loc = arglocs[i]
             offset = STD_FRAME_SIZE_IN_BYTES - base + 8 * idx
@@ -149,7 +143,7 @@
     def emit_raw_call(self):
         # always allocate a stack frame for the new function
         # save the SP back chain
-        #self.mc.STG(r.SP, l.addr(-self.subtracted_to_sp, r.SP))
+        self.mc.STG(r.SP, l.addr(-self.subtracted_to_sp, r.SP))
         # move the frame pointer
         if self.subtracted_to_sp != 0:
             self.mc.LAY(r.SP, l.addr(-self.subtracted_to_sp, r.SP))
@@ -194,8 +188,6 @@
         #
         pos = STD_FRAME_SIZE_IN_BYTES - 7*WORD
         self.mc.STMG(r.r8, r.r13, l.addr(pos, r.SP))
-        # 6 registers, 1 for a floating point return value!
-        # registered by prepare_arguments!
         #
         # Save this thread's shadowstack pointer into r8, for later comparison
         gcrootmap = self.asm.cpu.gc_ll_descr.gcrootmap
@@ -266,19 +258,17 @@
         PARAM_SAVE_AREA_OFFSET = 0
         if reg is not None:
             # save 1 word below the stack pointer
-            pos = STD_FRAME_SIZE_IN_BYTES
             if reg.is_core_reg():
                 self.mc.LGR(RSAVEDRES, reg)
             elif reg.is_fp_reg():
-                self.mc.STD(reg, l.addr(pos-1*WORD, r.SP))
+                self.mc.STD(reg, l.addr(16*WORD, r.SP))
         self.mc.load_imm(self.mc.RAW_CALL_REG, self.asm.reacqgil_addr)
         self.mc.raw_call()
         if reg is not None:
-            pos = STD_FRAME_SIZE_IN_BYTES
             if reg.is_core_reg():
                 self.mc.LGR(reg, RSAVEDRES)
             elif reg.is_fp_reg():
-                self.mc.LD(reg, l.addr(pos-1*WORD, r.SP))
+                self.mc.LD(reg, l.addr(16*WORD, r.SP))
 
         # replace b1_location with BEQ(here)
         pmc = OverwritingBuilder(self.mc, b1_location, 1)
diff --git a/rpython/jit/backend/zarch/codebuilder.py b/rpython/jit/backend/zarch/codebuilder.py
--- a/rpython/jit/backend/zarch/codebuilder.py
+++ b/rpython/jit/backend/zarch/codebuilder.py
@@ -189,11 +189,7 @@
         return diff
 
     def sync(self):
-        # see sync. section of the zarch manual!
-        # 0xf creates a checkpoint which is not needed.
-        # we never want to restore the checkpoint, we only
-        # want to create a memory fence (i.e. serialization)
-        self.BCR_rr(0xe,0)
+        self.BCR_rr(0xf,0)  # NOTE(review): the comment removed here said mask 0xf also creates an unneeded checkpoint -- confirm 0xf is intended
 
     def raw_call(self, call_reg=r.RETURN):
         """Emit a call to the address stored in the register 'call_reg',
diff --git a/rpython/jit/backend/zarch/instruction_builder.py b/rpython/jit/backend/zarch/instruction_builder.py
--- a/rpython/jit/backend/zarch/instruction_builder.py
+++ b/rpython/jit/backend/zarch/instruction_builder.py
@@ -191,6 +191,13 @@
         self.write_i32(imm32 & BIT_MASK_32)
     return encode_ri
 
+def build_s(mnemonic, (opcode1,opcode2)):  # returns an encoder for a two-opcode-byte instruction; 'mnemonic' is not used here
+    @builder.arguments('bd')  # the encoder takes a single operand of kind 'bd'
+    def encode_s(self, base_displace):
+        self.writechar(opcode1)  # first opcode byte
+        self.writechar(opcode2)  # second opcode byte
+        encode_base_displace(self, base_displace)  # operand encoding is delegated to this helper
+    return encode_s
 
 def build_si(mnemonic, (opcode,)):
     @builder.arguments('bd,u8')
diff --git a/rpython/jit/backend/zarch/instructions.py b/rpython/jit/backend/zarch/instructions.py
--- a/rpython/jit/backend/zarch/instructions.py
+++ b/rpython/jit/backend/zarch/instructions.py
@@ -288,6 +288,8 @@
 
     'SVC':     ('i',     ['\x0A']),
     'TRAP2':   ('e',     ['\x01','\xFF']),
+
+    'STFLE':   ('s',     ['\xB2','\xB0']),
 }
 all_mnemonic_codes.update(arith_mnemonic_codes)
 all_mnemonic_codes.update(logic_mnemonic_codes)
diff --git a/rpython/jit/backend/zarch/test/test_assembler.py b/rpython/jit/backend/zarch/test/test_assembler.py
--- a/rpython/jit/backend/zarch/test/test_assembler.py
+++ b/rpython/jit/backend/zarch/test/test_assembler.py
@@ -144,6 +144,19 @@
         assert self.mc.BRC_byte_count == 4
         assert self.mc.LG_byte_count == 6
 
+    def test_facility(self):  # run STFLE and check one bit of the stored facility list
+        adr = self.a.datablockwrapper.malloc_aligned(16, 16)
+        self.a.mc.load_imm(r.r2, adr)
+        self.a.mc.STFLE(loc.addr(0,r.r2))  # store the facility list at the address held in r2
+        self.a.mc.BCR(con.ANY, r.r14)
+        run_asm(self.a)
+        fac_data = rffi.cast(rffi.CArrayPtr(rffi.ULONG), adr)
+        f64 = bin(fac_data[0])[2:].zfill(64)  # bin() drops leading zeros: pad so that index == bit number
+        s64 = bin(fac_data[1])[2:].zfill(64)  # only printed, padded the same way for readability
+        print(f64)
+        print(s64)
+        assert f64[18] == '1' # long displacement facility
+
     def test_load_small_int_to_reg(self):
         self.a.mc.LGHI(r.r2, loc.imm(123))
         self.a.jmpto(r.r14)
diff --git a/rpython/memory/gc/env.py b/rpython/memory/gc/env.py
--- a/rpython/memory/gc/env.py
+++ b/rpython/memory/gc/env.py
@@ -137,6 +137,8 @@
         return get_L2cache_linux2_cpuinfo()
     if arch in ('alpha', 'ppc'):
         return get_L2cache_linux2_cpuinfo(label='L2 cache')
+    if arch in ('s390x'):
+        return get_L2cache_linux2_cpuinfo_s390x()
     if arch == 'ia64':
         return get_L2cache_linux2_ia64()
     if arch in ('parisc', 'parisc64'):
@@ -208,6 +210,67 @@
             "Warning: cannot find your CPU L2 cache size in /proc/cpuinfo")
         return -1
 
+def get_L2cache_linux2_cpuinfo_s390x(filename="/proc/cpuinfo", label='cache3'):
+    """Return the smallest 'size=<n>K' value, in bytes, found on the lines of
+    'filename' that start with 'label'; return -1 if there is none."""
+    debug_start("gc-hardware")
+    L2cache = sys.maxint
+    try:
+        fd = os.open(filename, os.O_RDONLY, 0644)
+        try:
+            data = []
+            while True:
+                buf = os.read(fd, 4096)
+                if not buf:
+                    break
+                data.append(buf)
+        finally:
+            os.close(fd)
+    except OSError:
+        pass
+    else:
+        data = ''.join(data)
+        linepos = 0
+        while True:
+            start = _findend(data, '\n' + label, linepos)
+            if start < 0:
+                break    # done
+            linepos = _findend(data, '\n', start)
+            if linepos < 0:
+                break    # no end-of-line??
+            # *** data[start:linepos] == "   : level=2 ... size=2048K ...\n"
+            start = _skipspace(data, start)
+            if data[start] != ':':
+                continue
+            # look for "size=" rather than assuming it sits at a fixed offset
+            start = _findend(data, 'size=', start)
+            if start < 0 or start >= linepos:
+                continue    # no size field on this line
+            end = start
+            while '0' <= data[end] <= '9':
+                end += 1
+            if start == end:
+                continue
+            number = int(data[start:end])
+            end = _skipspace(data, end)
+            if data[end] not in ('K', 'k'):    # assume kilobytes for now
+                continue
+            number = number * 1024
+            # for now we look for the smallest of the L2 caches of the CPUs
+            if number < L2cache:
+                L2cache = number

+    debug_print("L2cache =", L2cache)
+    debug_stop("gc-hardware")

+    if L2cache < sys.maxint:
+        return L2cache
+    else:
+        # Print a top-level warning even in non-debug builds
+        llop.debug_print(lltype.Void,
+            "Warning: cannot find your CPU L2 cache size in /proc/cpuinfo")
+        return -1
+
 def get_L2cache_linux2_sparc():
     debug_start("gc-hardware")
     cpu = 0
diff --git a/rpython/memory/gc/test/test_env.py b/rpython/memory/gc/test/test_env.py
--- a/rpython/memory/gc/test/test_env.py
+++ b/rpython/memory/gc/test/test_env.py
@@ -161,3 +161,22 @@
 """)
     result = env.get_L2cache_linux2_cpuinfo(str(filepath))
     assert result == 3072 * 1024
+
+def test_estimate_best_nursery_size_linux2_s390x():  # parse a canned s390x cpuinfo text and check the reported size
+    filepath = udir.join('estimate_best_nursery_size_linux2')  # scratch file that receives the fixture below
+    filepath.write("""\
vendor_id       : IBM/S390
# processors    : 2
bogomips per cpu: 20325.00
features	: esan3 zarch stfle msa ldisp eimm dfp etf3eh highgprs 
cache0          : level=1 type=Data scope=Private size=128K line_size=256 associativity=8
cache1          : level=1 type=Instruction scope=Private size=96K line_size=256 associativity=6
cache2          : level=2 type=Data scope=Private size=2048K line_size=256 associativity=8
cache3          : level=2 type=Instruction scope=Private size=2048K line_size=256 associativity=8
cache4          : level=3 type=Unified scope=Shared size=65536K line_size=256 associativity=16
cache5          : level=4 type=Unified scope=Shared size=491520K line_size=256 associativity=30
processor 0: version = FF,  identification = 026A77,  machine = 2964
processor 1: version = FF,  identification = 026A77,  machine = 2964
+""")
+    result = env.get_L2cache_linux2_cpuinfo_s390x(str(filepath))  # no label is passed, so the function's default applies
+    assert result == 2048 * 1024  # the "cache3" line of the fixture announces size=2048K