[pypy-svn] r34649 - in pypy/dist/pypy: config translator translator/backendopt translator/backendopt/test translator/c/test

pedronis at codespeak.net pedronis at codespeak.net
Thu Nov 16 02:19:26 CET 2006


Author: pedronis
Date: Thu Nov 16 02:18:59 2006
New Revision: 34649

Modified:
   pypy/dist/pypy/config/pypyoption.py
   pypy/dist/pypy/translator/backendopt/all.py
   pypy/dist/pypy/translator/backendopt/inline.py
   pypy/dist/pypy/translator/backendopt/test/test_inline.py
   pypy/dist/pypy/translator/c/test/test_standalone.py
   pypy/dist/pypy/translator/driver.py
Log:
profile based (call counts) inlining: --prof-based-inline=<args to the program for profiling>
There's even a test.

A first stab. Based on forking...

I managed to get an -O3 executable for PyPy that is slightly smaller but not slower. No such luck with -O2.
Still kind of good, considering our failures so far at reducing the amount of inlining without killing performance.

Didn't spend time on tuning or playing with the parameters at all.



Modified: pypy/dist/pypy/config/pypyoption.py
==============================================================================
--- pypy/dist/pypy/config/pypyoption.py	(original)
+++ pypy/dist/pypy/config/pypyoption.py	Thu Nov 16 02:18:59 2006
@@ -209,6 +209,10 @@
                        "optimizer remove the asserts", default=False),
             IntOption("inline_threshold", "Threshold when to inline functions",
                       default=1, cmdline=None),
+            StrOption("profile_based_inline",
+                      "Use call count profiling to drive inlining"
+                      ", specify arguments",
+                      default=None, cmdline="--prof-based-inline"),
         ]),
 
         OptionDescription("cli", "GenCLI options", [

Modified: pypy/dist/pypy/translator/backendopt/all.py
==============================================================================
--- pypy/dist/pypy/translator/backendopt/all.py	(original)
+++ pypy/dist/pypy/translator/backendopt/all.py	Thu Nov 16 02:18:59 2006
@@ -43,35 +43,27 @@
         print_statistics(translator.graphs[0], translator)
 
     if not config.clever_malloc_removal:
-        # inline functions in each other
-        if config.inline_threshold:
-            callgraph = inline.inlinable_static_callers(graphs)
-            inline.auto_inlining(translator, config.inline_threshold,
-                                 callgraph=callgraph)
-            for graph in graphs:
-                removenoops.remove_superfluous_keep_alive(graph)
-                removenoops.remove_duplicate_casts(graph, translator)
-
-        if config.print_statistics:
-            print "after inlining:"
-            print_statistics(translator.graphs[0], translator)
-
-        # vaporize mallocs
-        if config.mallocs:
-            tot = 0
-            for graph in graphs:
-                count = remove_simple_mallocs(graph)
-                if count:
-                    # remove typical leftovers from malloc removal
-                    removenoops.remove_same_as(graph)
-                    simplify.eliminate_empty_blocks(graph)
-                    simplify.transform_dead_op_vars(graph, translator)
-                    tot += count
-            log.malloc("removed %d simple mallocs in total" % tot)
-
-        if config.print_statistics:
-            print "after malloc removal:"
-            print_statistics(translator.graphs[0], translator)
+        if config.profile_based_inline:
+            inline_malloc_removal_phase(config, translator, graphs,
+                                        config.inline_threshold*.5) # xxx tune!
+            inline.instrument_inline_candidates(graphs, config.inline_threshold)
+            data = translator.driver_instrument_result(
+                       config.profile_based_inline)
+            import array, struct
+            n = data.size()//struct.calcsize('L')
+            data = data.open('rb')
+            counters = array.array('L')
+            counters.fromfile(data, n)
+            data.close()
+            def call_count_pred(label):
+                if label >= n:
+                    return False
+                return counters[label] > 250 # xxx tune!
+        else:
+            call_count_pred = None
+        inline_malloc_removal_phase(config, translator, graphs,
+                                    config.inline_threshold,
+                                    call_count_pred=call_count_pred)
     else:
         assert graphs is translator.graphs  # XXX for now
         clever_inlining_and_malloc_removal(translator)
@@ -101,3 +93,39 @@
 
     for graph in graphs:
         checkgraph(graph)
+
+def inline_malloc_removal_phase(config, translator, graphs, inline_threshold,
+                                call_count_pred=None):
+
+    log.inlining("phase with threshold factor: %s" % inline_threshold)
+
+    # inline functions in each other
+    if inline_threshold:
+        callgraph = inline.inlinable_static_callers(graphs)
+        inline.auto_inlining(translator, inline_threshold,
+                             callgraph=callgraph,
+                             call_count_pred=call_count_pred)
+        for graph in graphs:
+            removenoops.remove_superfluous_keep_alive(graph)
+            removenoops.remove_duplicate_casts(graph, translator)
+
+        if config.print_statistics:
+            print "after inlining:"
+            print_statistics(translator.graphs[0], translator)
+
+    # vaporize mallocs
+    if config.mallocs:
+        tot = 0
+        for graph in graphs:
+            count = remove_simple_mallocs(graph)
+            if count:
+                # remove typical leftovers from malloc removal
+                removenoops.remove_same_as(graph)
+                simplify.eliminate_empty_blocks(graph)
+                simplify.transform_dead_op_vars(graph, translator)
+                tot += count
+        log.malloc("removed %d simple mallocs in total" % tot)
+
+        if config.print_statistics:
+            print "after malloc removal:"
+            print_statistics(translator.graphs[0], translator)    

Modified: pypy/dist/pypy/translator/backendopt/inline.py
==============================================================================
--- pypy/dist/pypy/translator/backendopt/inline.py	(original)
+++ pypy/dist/pypy/translator/backendopt/inline.py	Thu Nov 16 02:18:59 2006
@@ -7,7 +7,7 @@
 from pypy.objspace.flow.model import FunctionGraph
 from pypy.objspace.flow.model import traverse, mkentrymap, checkgraph
 from pypy.annotation import model as annmodel
-from pypy.rpython.lltypesystem.lltype import Bool, typeOf, Void, Ptr
+from pypy.rpython.lltypesystem.lltype import Bool, Signed, typeOf, Void, Ptr
 from pypy.rpython.lltypesystem.lltype import normalizeptr
 from pypy.rpython import rmodel
 from pypy.tool.algo import sparsemat
@@ -72,9 +72,10 @@
         return False
 
 def inline_function(translator, inline_func, graph, lltype_to_classdef,
-                    raise_analyzer):
+                    raise_analyzer, call_count_pred=None):
     inliner = Inliner(translator, graph, inline_func, lltype_to_classdef,
-                      raise_analyzer = raise_analyzer)
+                      raise_analyzer = raise_analyzer,
+                      call_count_pred=call_count_pred)
     return inliner.inline_all()
 
 def simple_inline_function(translator, inline_func, graph):
@@ -128,7 +129,9 @@
 class BaseInliner(object):
     def __init__(self, translator, graph, lltype_to_classdef, 
                  inline_guarded_calls=False,
-                 inline_guarded_calls_no_matter_what=False, raise_analyzer=None):
+                 inline_guarded_calls_no_matter_what=False,
+                 raise_analyzer=None,
+                 call_count_pred=None):
         self.translator = translator
         self.graph = graph
         self.inline_guarded_calls = inline_guarded_calls
@@ -138,10 +141,12 @@
         assert raise_analyzer is not None
         self.raise_analyzer = raise_analyzer
         self.lltype_to_classdef = lltype_to_classdef
+        self.call_count_pred = call_count_pred
 
     def inline_all(self):
         count = 0
         non_recursive = {}
+        call_count_pred = self.call_count_pred
         while self.block_to_index:
             block, d = self.block_to_index.popitem()
             index_operation, subgraph = d.popitem()
@@ -151,6 +156,13 @@
                 raise CannotInline("inlining a recursive function")
             else:
                 non_recursive[subgraph] = True
+            if call_count_pred:
+                countop = block.operations[index_operation-1]
+                assert countop.opname == 'instrument_count'
+                assert countop.args[0].value == 'inline'
+                label = countop.args[1].value
+                if not call_count_pred(label):
+                    continue
             operation = block.operations[index_operation]
             self.inline_once(block, index_operation)
             count += 1
@@ -417,12 +429,16 @@
 
 
 class Inliner(BaseInliner):
-    def __init__(self, translator, graph, inline_func, lltype_to_classdef, inline_guarded_calls=False,
-                 inline_guarded_calls_no_matter_what=False, raise_analyzer=None):
+    def __init__(self, translator, graph, inline_func, lltype_to_classdef,
+                 inline_guarded_calls=False,
+                 inline_guarded_calls_no_matter_what=False,
+                 raise_analyzer=None,
+                 call_count_pred=None):
         BaseInliner.__init__(self, translator, graph, lltype_to_classdef,
                              inline_guarded_calls,
                              inline_guarded_calls_no_matter_what,
-                             raise_analyzer)
+                             raise_analyzer,
+                             call_count_pred)
         self.inline_func = inline_func
         # to simplify exception matching
         join_blocks(graph)
@@ -447,6 +463,7 @@
               'malloc': 2,
               'yield_current_frame_to_caller': sys.maxint, # XXX bit extreme
               'resume_point': sys.maxint, # XXX bit extreme
+              'instrument_count': 0,
               }
 
 def block_weight(block, weights=OP_WEIGHTS):
@@ -535,8 +552,48 @@
     return result
 
 
+def instrument_inline_candidates(graphs, multiplier):
+    threshold = BASE_INLINE_THRESHOLD * multiplier
+    cache = {None: False}
+    def candidate(graph):
+        try:
+            return cache[graph]
+        except KeyError:
+            res = static_instruction_count(graph) <= threshold
+            cache[graph] = res
+            return res
+    n = 0
+    for parentgraph in graphs:
+        for block in parentgraph.iterblocks():
+            ops = block.operations
+            i = len(ops)-1
+            while i >= 0:
+                op = ops[i]
+                i -= 1
+                if op.opname == "direct_call":
+                    funcobj = op.args[0].value._obj
+                    graph = getattr(funcobj, 'graph', None)
+                    if graph is not None:
+                        if getattr(getattr(funcobj, '_callable', None),
+                                   'suggested_primitive', False):
+                            continue
+                        if getattr(getattr(funcobj, '_callable', None),
+                                   'dont_inline', False):
+                            continue
+                    if candidate(graph):
+                        tag = Constant('inline', Void)
+                        label = Constant(n, Signed)
+                        dummy = Variable()
+                        dummy.concretetype = Void
+                        count = SpaceOperation('instrument_count',
+                                               [tag, label], dummy)
+                        ops.insert(i+1, count)
+                        n += 1
+    log.inlining("%d call sites instrumented" % n)
+
 def auto_inlining(translator, multiplier=1, callgraph=None,
-                  threshold=BASE_INLINE_THRESHOLD):
+                  threshold=BASE_INLINE_THRESHOLD,
+                  call_count_pred=None):
     from heapq import heappush, heappop, heapreplace, heapify
     threshold = threshold * multiplier
     callers = {}     # {graph: {graphs-that-call-it}}
@@ -587,7 +644,8 @@
                 continue
             try:
                 res = bool(inline_function(translator, graph, parentgraph,
-                                           lltype_to_classdef, raise_analyzer))
+                                           lltype_to_classdef, raise_analyzer,
+                                           call_count_pred))
             except CannotInline:
                 couldnt_inline[graph] = True
                 res = CannotInline

Modified: pypy/dist/pypy/translator/backendopt/test/test_inline.py
==============================================================================
--- pypy/dist/pypy/translator/backendopt/test/test_inline.py	(original)
+++ pypy/dist/pypy/translator/backendopt/test/test_inline.py	Thu Nov 16 02:18:59 2006
@@ -8,6 +8,7 @@
 from pypy.translator.backendopt.inline import auto_inlining, Inliner
 from pypy.translator.backendopt.inline import collect_called_graphs
 from pypy.translator.backendopt.inline import measure_median_execution_cost
+from pypy.translator.backendopt.inline import instrument_inline_candidates
 from pypy.translator.translator import TranslationContext, graphof
 from pypy.rpython.llinterp import LLInterpreter
 from pypy.rlib.rarithmetic import ovfcheck
@@ -66,16 +67,25 @@
         return interp.eval_graph(graphof(t, entry), args)
     return eval_func
 
-def check_auto_inlining(func, sig, multiplier=None):
+def check_auto_inlining(func, sig, multiplier=None, call_count_check=False):
     t = translate(func, sig)
     if option.view:
         t.view()
     # inline!
     sanity_check(t)    # also check before inlining (so we don't blame it)
-    if multiplier is None:
-        auto_inlining(t)
+
+    if multiplier is not None:
+        multiplier = {'multiplier': multiplier}
     else:
-        auto_inlining(t, multiplier=multiplier)
+        multiplier = {}
+
+    call_count_pred = None
+    if call_count_check:
+        call_count_pred = lambda lbl: True
+        instrument_inline_candidates(t.graphs, **multiplier)
+
+    auto_inlining(t, call_count_pred=call_count_pred, **multiplier)
+    
     sanity_check(t)
     if option.view:
         t.view()
@@ -364,6 +374,33 @@
     result = eval_func([15])
     assert result == -1
 
+def test_auto_inlining_small_call_big_call_count():
+    def leaf(n):
+        total = 0
+        i = 0
+        while i < n:
+            total += i
+            if total > 100:
+                raise OverflowError
+            i += 1
+        return total
+    def g(n):
+        return leaf(n)
+    def f(n):
+        try:
+            return g(n)
+        except OverflowError:
+            return -1
+    eval_func, t = check_auto_inlining(f, [int], multiplier=10,
+                                       call_count_check=True)
+    f_graph = graphof(t, f)
+    assert len(collect_called_graphs(f_graph, t)) == 0
+
+    result = eval_func([10])
+    assert result == 45
+    result = eval_func([15])
+    assert result == -1
+
 def test_inline_exception_catching():
     def f3():
         raise CustomError1

Modified: pypy/dist/pypy/translator/c/test/test_standalone.py
==============================================================================
--- pypy/dist/pypy/translator/c/test/test_standalone.py	(original)
+++ pypy/dist/pypy/translator/c/test/test_standalone.py	Thu Nov 16 02:18:59 2006
@@ -87,3 +87,24 @@
     counters = struct.unpack("LLL", counters_data)
 
     assert counters == (0,3,2)
+
+def test_prof_inline():
+    if sys.platform == 'win32':
+        py.test.skip("instrumentation support is unix only for now")
+    def add(a,b):
+        return a + b - b + b - b + b - b + b - b + b - b + b - b + b
+    def entry_point(argv):
+        tot =  0
+        x = int(argv[1])
+        while x > 0:
+            tot = add(tot, x)
+            x -= 1
+        os.write(1, str(tot))
+        return 0
+    from pypy.translator.interactive import Translation
+    t = Translation(entry_point, backend='c', standalone=True)
+    t.backendopt(profile_based_inline="500")
+    exe = t.compile()
+    out = py.process.cmdexec("%s 500" % exe)
+    assert int(out) == 500*501/2
+    

Modified: pypy/dist/pypy/translator/driver.py
==============================================================================
--- pypy/dist/pypy/translator/driver.py	(original)
+++ pypy/dist/pypy/translator/driver.py	Thu Nov 16 02:18:59 2006
@@ -51,6 +51,31 @@
 def backend_to_typesystem(backend):
     return _BACKEND_TO_TYPESYSTEM.get(backend, 'ootype')
 
+
+class Instrument(Exception):
+    pass
+
+
+class ProfInstrument(object):
+    name = "profinstrument"
+    def __init__(self, datafile, compiler):
+        self.datafile = datafile
+        self.compiler = compiler
+
+    def first(self):
+        self.compiler._build()
+
+    def probe(self, exe, args):
+        from py.compat import subprocess
+        env = os.environ.copy()
+        env['_INSTRUMENT_COUNTERS'] = str(self.datafile)
+        subprocess.call([exe, args], env=env)
+        
+    def after(self):
+        # xxx
+        os._exit(0)
+
+
 class TranslationDriver(SimpleTaskEngine):
 
     def __init__(self, setopts=None, default_goal=None, disable=[],
@@ -183,6 +208,34 @@
         self.entry_point = entry_point
         self.translator = translator
 
+        self.translator.driver_instrument_result = self.instrument_result
+
+    def instrument_result(self, args):
+        backend, ts = self.get_backend_and_type_system()
+        if backend != 'c' or sys.platform == 'win32':
+            raise Exception("instrumentation requires the c backend"
+                            " and unix for now")
+        from pypy.tool.udir import udir
+        
+        datafile = udir.join('_instrument_counters')
+        makeProfInstrument = lambda compiler: ProfInstrument(datafile, compiler)
+
+        pid = os.fork()
+        if pid == 0:
+            # child compiling and running with instrumentation
+            self.config.translation.instrument = True
+            self.config.translation.instrumentctl = (makeProfInstrument,
+                                                     args)
+            raise Instrument
+        else:
+            pid, status = os.waitpid(pid, 0)
+            if os.WIFEXITED(status):
+                status = os.WEXITSTATUS(status)
+                if status != 0:
+                    raise Exception, "instrumentation child failed: %d" % status
+            else:
+                raise Exception, "instrumentation child aborted"
+            return datafile
 
     def info(self, msg):
         log.info(msg)
@@ -194,9 +247,16 @@
             return
         else:
             self.log.info("%s..." % title)
-        res = func()
+        instrument = False
+        try:
+            res = func()
+        except Instrument:
+            instrument = True
         if not func.task_idempotent:
             self.done[goal] = True
+        if instrument:
+            self.proceed('compile')
+            assert False, 'we should not get here'
         return res
 
     def task_annotate(self):



More information about the Pypy-commit mailing list