[pypy-commit] extradoc extradoc: more in-depth discussion of related work

Tue Aug 14 17:38:10 CEST 2012

Author: Carl Friedrich Bolz <cfbolz at gmx.de>
Branch: extradoc
Changeset: r4570:d3fc033b4584
Date: 2012-08-14 17:37 +0200
http://bitbucket.org/pypy/extradoc/changeset/d3fc033b4584/

Log:	more in-depth discussion of related work

diff --git a/talk/dls2012/paper.bib b/talk/dls2012/paper.bib
--- a/talk/dls2012/paper.bib
+++ b/talk/dls2012/paper.bib
@@ -12,7 +12,7 @@
 	year = {1984}
 },
 
- at inproceedings{carl_friedrich_bolz_towards_2010,
+ at inproceedings{bolz_towards_2010,
 	address = {Hagenberg, Austria},
 	title = {Towards a Jitting {VM} for Prolog execution},
 	isbn = {978-1-4503-0132-9},
@@ -21,7 +21,7 @@
 	abstract = {Most Prolog implementations are implemented in low-level languages such as C and are based on a variation of the {WAM} instruction set, which enhances their performance but makes them hard to write. In addition, many of the more dynamic features of Prolog (like assert), despite their popularity, are not well supported. We present a high-level continuation-based Prolog interpreter based on the {PyPy} project. The {PyPy} project makes it possible to easily and efficiently implement dynamic languages. It provides tools that automatically generate a just-in-time compiler for a given interpreter of the target language, by using partial evaluation techniques. The resulting Prolog implementation is surprisingly efficient: it clearly outperforms existing interpreters of Prolog in high-level languages such as Java. Moreover, on some benchmarks, our system outperforms state-of-the-art {WAM-based} Prolog implementations. Our paper aims to show that declarative languages such as Prolog can indeed benefit from having a just-in-time compiler and that {PyPy} can form the basis for implementing programming languages other than Python.},
 	booktitle = {{PPDP}},
 	publisher = {{ACM}},
-	author = {Carl Friedrich Bolz and Michael Leuschel and David Schneider},
+	author = {Bolz, Carl Friedrich and Leuschel, Michael and Schneider, David},
 	year = {2010},
 	keywords = {interpreters, jit, logic programming, partial evaluation}
 },
@@ -57,8 +57,25 @@
 	keywords = {code generation, design, dynamically typed languages, experimentation, incremental compilers, languages, measurement, performance, run-time environments, trace-based compilation}
 },
 
+ at inproceedings{kotzmann_escape_2005,
+	address = {New York, {NY}, {USA}},
+	series = {{VEE} '05},
+	title = {Escape analysis in the context of dynamic compilation and deoptimization},
+	isbn = {1-59593-047-7},
+	location = {Chicago, {IL}, {USA}},
+	doi = {10.1145/1064979.1064996},
+	abstract = {In object-oriented programming languages, an object is said to escape the method or thread in which it was created if it can also be accessed by other methods or threads. Knowing which objects do not escape allows a compiler to perform aggressive {optimizations.This} paper presents a new intraprocedural and interprocedural algorithm for escape analysis in the context of dynamic compilation where the compiler has to cope with dynamic class loading and deoptimization. It was implemented for Sun Microsystems' Java {HotSpot&#8482;} client compiler and operates on an intermediate representation in {SSA} form. We introduce equi-escape sets for the efficient propagation of escape information between related objects. The analysis is used for scalar replacement of fields and synchronization removal, as well as for stack allocation of objects and fixed-sized arrays. The results of the interprocedural analysis support the compiler in inlining decisions and allow actual parameters to be allocated on the caller {stack.Under} certain circumstances, the Java {HotSpot&#8482;} {VM} is forced to stop executing a method's machine code and transfer control to the interpreter. This is called deoptimization. Since the interpreter does not know about the scalar replacement and synchronization removal performed by the compiler, the deoptimization framework was extended to reallocate and relock objects on demand.},
+	booktitle = {Proceedings of the 1st {ACM/USENIX} international conference on Virtual execution environments},
+	publisher = {{ACM}},
+	author = {Kotzmann, Thomas and M&#246;ssenb&#246;ck, Hanspeter},
+	year = {2005},
+	note = {{ACM} {ID:} 1064996},
+	keywords = {algorithms, allocation/deallocation strategies, deoptimization},
+	pages = {111&#8211;120}
+},
+
 @inproceedings{bolz_towards_2009,
-	title = {Towards {Just-In-Time} Partial Evaluation of Prolog},
+	title = {Towards Just-In-Time Partial Evaluation of Prolog},
 	doi = {10.1007/978-3-642-12592-8_12},
 	booktitle = {Logic Program Synthesis and Transformation},
 	author = {Bolz, Carl Friedrich and Leuschel, Michael and Rigo, Armin},
@@ -82,7 +99,7 @@
 	location = {Barcelona, Spain},
 	url = {http://dx.doi.org/10.1109/MICRO.2005.22},
 	doi = {http://dx.doi.org/10.1109/MICRO.2005.22},
-	abstract = {The performance of a dynamic optimization system depends heavily on the code it selects to optimize. Many current systems follow the design of {HP} Dynamo and select a single interprocedural path, or trace, as the unit of code optimization and code caching. Though this approach to region selection has worked well in practice, we show that it is possible to adapt this basic approach to produce regions with greater locality, less needless code duplication, and fewer profiling counters. In particular, we propose two new region-selection algorithms and evaluate them against Dynamo&#191;s selection mechanism, {Next-Executing} Tail {(NET).} Our first algorithm, {Last-Executed} Iteration {(LEI)}, identifies cyclic paths of execution better than {NET}, improving locality of execution while reducing the size of the code cache. Our second algorithm allows overlapping traces of similar execution frequency to be combined into a single large region. This second technique can be applied to both {NET} and {LEI}, and we find that it significantly improves metrics of locality and memory overhead for each.},
+	abstract = {The performance of a dynamic optimization system depends heavily on the code it selects to optimize. Many current systems follow the design of {HP} Dynamo and select a single interprocedural path, or trace, as the unit of code optimization and code caching. Though this approach to region selection has worked well in practice, we show that it is possible to adapt this basic approach to produce regions with greater locality, less needless code duplication, and fewer profiling counters. In particular, we propose two new region-selection algorithms and evaluate them against Dynamo&#191;s selection mechanism, Next-Executing Tail {(NET).} Our first algorithm, Last-Executed Iteration {(LEI)}, identifies cyclic paths of execution better than {NET}, improving locality of execution while reducing the size of the code cache. Our second algorithm allows overlapping traces of similar execution frequency to be combined into a single large region. This second technique can be applied to both {NET} and {LEI}, and we find that it significantly improves metrics of locality and memory overhead for each.},
 	journal = {Proceedings of the 38th annual {IEEE/ACM} International Symposium on Microarchitecture},
 	author = {Hiniker, David and Hazelwood, Kim and Smith, Michael D},
 	year = {2005},
@@ -102,8 +119,7 @@
 
 @misc{pall_luajit_2009,
 	title = {{LuaJIT} 2.0 intellectual property disclosure and research opportunities},
-	note = {http://lua-users.org/lists/lua-l/2009-11/msg00089.html (accessed
-        June 2011)},
+	url = {http://lua-users.org/lists/lua-l/2009-11/msg00089.html},
 	author = {Pall, Mike},
 	month = nov,
 	year = {2009}
@@ -111,7 +127,7 @@
 
 @inproceedings{bolz_runtime_2011,
 	address = {Lancaster, {UK}},
-	title = {Runtime Feedback in a {Meta-Tracing} {JIT} for Efficient Dynamic Languages},
+	title = {Runtime Feedback in a Meta-Tracing {JIT} for Efficient Dynamic Languages},
 	abstract = {Meta-tracing {JIT} compilers can be applied to a variety of different languages without explicitly encoding language semantics into the compiler. So far, they lacked a way to give the language implementor control over runtime feedback. This restricted their performance. In this paper we describe the mechanisms in {PyPy&#8217;s} meta-tracing {JIT} that can be used to control runtime feedback in language-specific ways. These mechanisms are flexible enough to express classical {VM} techniques such as maps and runtime type feedback.},
 	booktitle = {{ICOOOLPS}},
 	publisher = {{ACM}},
@@ -135,7 +151,115 @@
 	pages = {71--80}
 },
 
- at inproceedings{davide_ancona_rpython:_2007,
+ at article{diwan_type-based_1998,
+	title = {Type-based alias analysis},
+	volume = {33},
+	issn = {0362-1340},
+	url = {http://doi.acm.org/10.1145/277652.277670},
+	doi = {10.1145/277652.277670},
+	abstract = {This paper evaluates three alias analyses based on programming language types. The first analysis uses type compatibility to determine aliases. The second extends the first by using additional high-level information such as field names. The third extends the second with a flow-insensitive analysis. Although other researchers suggests using types to disambiguate memory references, none evaluates its effectiveness. We perform both static and dynamic evaluations of type-based alias analyses for Modula-3, a statically-typed type-safe language. The static analysis reveals that type compatibility alone yields a very imprecise alias analysis, but the other two analyses significantly improve alias precision. We use redundant load elimination {(RLE)} to demonstrate the effectiveness of the three alias algorithms in terms of the opportunities for optimization, the impact on simulated execution times, and to compute an upper bound on what a perfect alias analysis would yield. We show modest dynamic improvements for {(RLE)}, and more surprisingly, that on average our alias analysis is within 2.5\% of a perfect alias analysis with respect to {RLE} on 8 Modula-3 programs. These results illustrate that to explore thoroughly the effectiveness of alias analyses, researchers need static, dynamic, and upper-bound analysis. In addition, we show that for type-safe languages like Modula-3 and Java, a fast and simple alias analysis may be sufficient for many applications.},
+	number = {5},
+	journal = {{SIGPLAN} Not.},
+	author = {Diwan, Amer and {McKinley}, Kathryn S. and Moss, J. Eliot B.},
+	month = may,
+	year = {1998},
+	pages = {106&#8211;117}
+},
+
+ at inproceedings{cytron_code_1986,
+	address = {New York, {NY}, {USA}},
+	series = {{POPL} '86},
+	title = {Code motion of control structures in high-level languages},
+	url = {http://doi.acm.org/10.1145/512644.512651},
+	doi = {10.1145/512644.512651},
+	abstract = {One trend among programmers is the increased use of abstractions. Through encapsulation techniques, abstractions extend the repertory of data structures and their concomitant operations that are processed directly by a compiler. For example, a compiler might not offer sets or set operations in its base language, but abstractions allow a programmer to define sets in terms of constructs already recognized by the compiler. In particular, abstractions can allow new constructs to be defined in terms of other abstractions. Although significant power is gained through the use of layered abstractions, object code quality suffers as increasingly less of a program's data structures and operations are exposed to the optimization phase of a compiler. Multiple references to abstractions are also inefficient, since the interaction between abstractions is often complex yet hidden from a compiler. Abstractions are most flexible when they are cast in general terms; a specific invocation is then tailored by the abstraction to obtain the appropriate code. A sequence of references to such abstractions can be inefficient due to functional redundancy that cannot be detected at compile-time. By integrating the references, the offending segments of code can be moved to a more advantageous position. Although procedure integration materializes abstracted constructs, the abstractions can still be ineligible for optimization using current techniques; in particular, abstractions often involve loops and conditional branches that can obscure code that would otherwise be eligible for code motion.},
+	booktitle = {Proceedings of the 13th {ACM} {SIGACT-SIGPLAN} symposium on Principles of programming languages},
+	publisher = {{ACM}},
+	author = {Cytron, Ron and Lowry, Andy and Zadeck, F. Kenneth},
+	year = {1986},
+	pages = {70&#8211;85}
+},
+
+ at article{knoop_lazy_1992,
+	title = {Lazy code motion},
+	volume = {27},
+	issn = {0362-1340},
+	url = {http://doi.acm.org/10.1145/143103.143136},
+	doi = {10.1145/143103.143136},
+	abstract = {We present a bit-vector algorithm for the optimal and economical placement of computations within flow graphs, which is as efficient as standard uni-directional analyses. The point of our algorithm is the decomposition of the bi-directional structure of the known placement algorithms into a sequence of a backward and a forward analysis, which directly implies the efficiency result. Moreover, the new compositional structure opens the algorithm for modification: two further uni-directional analysis components exclude any unnecessary code motion. This laziness of our algorithm minimizes the register pressure, which has drastic effects on the run-time behaviour of the optimized programs in practice, where an economical use of registers is essential.},
+	number = {7},
+	journal = {{SIGPLAN} Not.},
+	author = {Knoop, Jens and R&#252;thing, Oliver and Steffen, Bernhard},
+	month = jul,
+	year = {1992},
+	pages = {224&#8211;234}
+},
+
+ at incollection{allen_catalogue_1971,
+	title = {A Catalogue of Optimizing Transformations, ed. R. Rustin},
+	booktitle = {Design and Optimization of Compilers},
+	publisher = {Prentice-Hall},
+	author = {Allen, Frances and Cocke, John},
+	editor = {Rustin, Randall},
+	year = {1971},
+	pages = {1--30}
+},
+
+ at inproceedings{chow_new_1997,
+	address = {New York, {NY}, {USA}},
+	series = {{PLDI} '97},
+	title = {A new algorithm for partial redundancy elimination based on {SSA} form},
+	isbn = {0-89791-907-6},
+	url = {http://doi.acm.org/10.1145/258915.258940},
+	doi = {10.1145/258915.258940},
+	abstract = {A new algorithm, {SSAPRE}, for performing partial redundancy elimination based entirely on {SSA} form is presented. It achieves optimal code motion similar to lazy code motion {[KRS94a}, {DS93]}, but is formulated independently and does not involve iterative data flow analysis and bit vectors in its solution. It not only exhibits the characteristics common to other sparse approaches, but also inherits the advantages shared by other {SSA-based} optimization techniques. {SSAPRE} also maintains its output in the same {SSA} form as its input. In describing the algorithm, we state theorems with proofs giving our claims about {SSAPRE.} We also give additional description about our practical implementation of {SSAPRE}, and analyze and compare its performance with a bit-vector-based implementation of {PRE.} We conclude with some discussion of the implications of this work.},
+	booktitle = {Proceedings of the {ACM} {SIGPLAN} 1997 conference on Programming language design and implementation},
+	publisher = {{ACM}},
+	author = {Chow, Fred and Chan, Sun and Kennedy, Robert and Liu, Shin-Ming and Lo, Raymond and Tu, Peng},
+	year = {1997},
+	pages = {273&#8211;286}
+},
+
+ at article{morel_global_1979,
+	title = {Global optimization by suppression of partial redundancies},
+	volume = {22},
+	issn = {0001-0782},
+	url = {http://doi.acm.org/10.1145/359060.359069},
+	doi = {10.1145/359060.359069},
+	abstract = {The elimination of redundant computations and the moving of invariant computations out of loops are often done separately, with invariants moved outward loop by loop. We propose to do both at once and to move each expression directly to the entrance of the outermost loop in which it is invariant. This is done by solving a more general problem, i.e. the elimination of computations performed twice on a given execution path. Such computations are termed partially redundant. Moreover, the algorithm does not require any graphical information or restrictions on the shape of the program graph. Testing this algorithm has shown that its execution cost is nearly linear with the size of the program, and that it leads to a smaller optimizer that requires less execution time.},
+	number = {2},
+	journal = {Commun. {ACM}},
+	author = {Morel, E. and Renvoise, C.},
+	month = feb,
+	year = {1979},
+	keywords = {Boolean systems, compilation, compiler, data flow analysis, invariant computation elimination, optimization, optimizer, partial redundancy, redundancy elimination},
+	pages = {96&#8211;103}
+},
+
+ at article{dhamdhere_practical_1991,
+	title = {Practical adaption of the global optimization algorithm of Morel and Renvoise},
+	volume = {13},
+	issn = {0164-0925},
+	url = {http://doi.acm.org/10.1145/103135.214520},
+	doi = {10.1145/103135.214520},
+	number = {2},
+	journal = {{ACM} Trans. Program. Lang. Syst.},
+	author = {Dhamdhere, D. M.},
+	month = apr,
+	year = {1991},
+	pages = {291&#8211;294}
+},
+
+ at phdthesis{chow_portable_1984,
+	address = {Stanford, {CA}, {USA}},
+	title = {A portable machine-independent global optimizer&#8211;design and measurements},
+	school = {Stanford University},
+	author = {Chow, Frederick Chi-Tak},
+	year = {1984},
+	note = {{AAI8408268}}
+},
+
+ at inproceedings{ancona_rpython:_2007,
 	address = {Montreal, Quebec, Canada},
 	title = {{RPython:} a step towards reconciling dynamically and statically typed {OO} languages},
 	isbn = {978-1-59593-868-8},
@@ -145,17 +269,17 @@
 	abstract = {Although the C-based interpreter of Python is reasonably fast, implementations on the {CLI} or the {JVM} platforms offers some advantages in terms of robustness and interoperability. Unfortunately, because the {CLI} and {JVM} are primarily designed to execute statically typed, object-oriented languages, most dynamic language implementations cannot use the native bytecodes for common operations like method calls and exception handling; as a result, they are not able to take full advantage of the power offered by the {CLI} and {JVM.}},
 	booktitle = {{DLS}},
 	publisher = {{ACM}},
-	author = {Davide Ancona and Massimo Ancona and Antonio Cuni and Nicholas D. Matsakis},
+	author = {Ancona, Davide and Ancona, Massimo and Cuni, Antonio and Matsakis, Nicholas D.},
 	year = {2007},
 	keywords = {{JVM}, .net, Python}
 },
 
 @article{futamura_partial_1999,
-	title = {Partial Evaluation of Computation Process - An Approach to a {Compiler-Compiler}},
+	title = {Partial Evaluation of Computation Process - An Approach to a Compiler-Compiler},
 	volume = {12},
 	url = {http://citeseer.ist.psu.edu/futamura99partial.html},
 	number = {4},
-	journal = {{Higher-Order} and Symbolic Computation},
+	journal = {Higher-Order and Symbolic Computation},
 	author = {Futamura, Yoshihiko},
 	year = {1999},
 	keywords = {Futamura},
@@ -167,31 +291,31 @@
 	isbn = {0-13-020249-5},
 	url = {http://portal.acm.org/citation.cfm?id=153676},
 	abstract = {This book is out of print. For copies, Please refer to the following online page},
-	publisher = {{Prentice-Hall}},
+	publisher = {Prentice-Hall},
 	author = {Jones, Neil D. and Gomard, Carsten K. and Sestoft, Peter},
 	year = {1993}
 },
 
- at inproceedings{armin_rigo_pypys_2006,
+ at inproceedings{rigo_pypys_2006,
 	address = {Portland, Oregon, {USA}},
 	title = {{PyPy's} approach to virtual machine construction},
-	isbn = {{1-59593-491-X}},
+	isbn = {1-59593-491-X},
 	url = {http://portal.acm.org/citation.cfm?id=1176753},
 	doi = {10.1145/1176617.1176753},
 	abstract = {The {PyPy} project seeks to prove both on a research and a practical level the feasibility of constructing a virtual machine {(VM)} for a dynamic language in a dynamic language - in this case, Python. The aim is to translate (i.e. compile) the {VM} to arbitrary target environments, ranging in level from {C/Posix} to {Smalltalk/Squeak} via Java and {CLI/.NET}, while still being of reasonable efficiency within these {environments.A} key tool to achieve this goal is the systematic reuse of the Python language as a system programming language at various levels of our architecture and translation process. For each level, we design a corresponding type system and apply a generic type inference engine - for example, the garbage collector is written in a style that manipulates simulated pointer and address objects, and when translated to C these operations become C-level pointer and address instructions.},
 	booktitle = {{DLS}},
 	publisher = {{ACM}},
-	author = {Armin Rigo and Samuele Pedroni},
+	author = {Rigo, Armin and Pedroni, Samuele},
 	year = {2006},
 	keywords = {metacircularity, Python, retargettable code generation, type inference, {VM}}
 },
 
 @article{georges_statistically_2007,
-	title = {Statistically rigorous java performance evaluation},
+	title = {Statistically rigorous Java performance evaluation},
 	volume = {42},
 	url = {http://portal.acm.org/citation.cfm?id=1297105.1297033},
 	doi = {10.1145/1297105.1297033},
-	abstract = {Java performance is far from being trivial to benchmark because it is affected by various factors such as the Java application, its input, the virtual machine, the garbage collector, the heap size, etc. In addition, non-determinism at run-time causes the execution time of a Java program to differ from run to run. There are a number of sources of non-determinism such as {Just-In-Time} {(JIT)} compilation and optimization in the virtual machine {(VM)} driven by timer-based method sampling, thread scheduling, garbage collection, and various.},
+	abstract = {Java performance is far from being trivial to benchmark because it is affected by various factors such as the Java application, its input, the virtual machine, the garbage collector, the heap size, etc. In addition, non-determinism at run-time causes the execution time of a Java program to differ from run to run. There are a number of sources of non-determinism such as Just-In-Time {(JIT)} compilation and optimization in the virtual machine {(VM)} driven by timer-based method sampling, thread scheduling, garbage collection, and various.},
 	number = {10},
 	journal = {{SIGPLAN} Notices},
 	author = {Georges, Andy and Buytaert, Dries and Eeckhout, Lieven},
@@ -228,12 +352,12 @@
 	pages = {1--12}
 },
 
- at techreport{andreas_gal_incremental_2006,
+ at techreport{gal_incremental_2006,
 	title = {Incremental Dynamic Code Generation with Trace Trees},
 	abstract = {The unit of compilation for traditional just-in-time compilers is the method. We have explored trace-based compilation, in which the unit of compilation is a loop, potentially spanning multiple methods and even library code. Using a new intermediate representation that is discovered and updated lazily on-demand while the program is being executed, our compiler generates code that is competitive with traditional dynamic compilers, but that uses only a fraction of the compile time and memory footprint.},
 	number = {{ICS-TR-06-16}},
 	institution = {Donald Bren School of Information and Computer Science, University of California, Irvine},
-	author = {Andreas Gal and Michael Franz},
+	author = {Gal, Andreas and Franz, Michael},
 	month = nov,
 	year = {2006},
 	pages = {11}
@@ -254,19 +378,19 @@
 	keywords = {dynamic compilation, embedded, software trace scheduling, {SSA}, {VM}}
 },
 
- at inproceedings{mario_wolczko_towards_1999,
-	title = {Towards a Universal Implementation Substrate for {Object-Oriented} Languages},
+ at inproceedings{wolczko_towards_1999,
+	title = {Towards a Universal Implementation Substrate for Object-Oriented Languages},
 	abstract = {Self is a minimalist object-oriented language with a sophisticated implementation that utilizes adaptive optimization. We have built implementations of Smalltalk and Java by translation to Self. These implementations were much easier to construct in Self than by conventional means, and perform surprisingly well (competitively with conventional, commercial implementations). This leads us to believe that a Self-like system may form the basis of a universal substrate for implementation of object-oriented languages.},
 	booktitle = {{OOPSLA} workshop on Simplicity, Performance, and Portability in Virtual Machine Design},
-	author = {Mario Wolczko and Ole Agesen and David Ungar},
+	author = {Wolczko, Mario and Agesen, Ole and {{David} Ungar}},
 	year = {1999},
 	keywords = {fixme}
 },
 
- at inproceedings{hoelzle_optimizing_1994,
+ at inproceedings{holzle_optimizing_1994,
 	address = {Orlando, Florida, United States},
 	title = {Optimizing dynamically-dispatched calls with run-time type feedback},
-	isbn = {{0-89791-662-X}},
+	isbn = {0-89791-662-X},
 	url = {http://portal.acm.org/citation.cfm?id=178243.178478},
 	doi = {10.1145/178243.178478},
 	abstract = {Note: {OCR} errors may be found in this Reference List extracted from the full text article. {ACM} has opted to expose the complete List rather than only correct and linked references.},
@@ -306,17 +430,17 @@
 	doi = {10.1145/74878.74884},
 	abstract = {We have developed and implemented techniques that double the performance of dynamically-typed object-oriented languages. Our {SELF} implementation runs twice as fast as the fastest Smalltalk implementation, despite {SELF's} lack of classes and explicit variables. To compensate for the absence of classes, our system uses implementation-level maps to transparently group objects cloned from the same prototype, providing data type information and eliminating the apparent space overhead for prototype-based systems. To compensate for dynamic typing, user-defined control structures, and the lack of explicit variables, our system dynamically compiles multiple versions of a source method, each customized according to its receiver's map. Within each version the type of the receiver is fixed, and thus the compiler can statically bind and inline all messages sent to self. Message splitting and type prediction extract and preserve even more static type information, allowing the compiler to inline many other messages. Inlining dramatically improves performance and eliminates the need to hard-wire low-level methods such as +,==, and {ifTrue:.} Despite inlining and other optimizations, our system still supports interactive programming environments. The system traverses internal dependency lists to invalidate all compiled methods affected by a programming change. The debugger reconstructs inlined stack frames from compiler-generated debugging information, making inlining invisible to the {SELF} programmer.},
 	booktitle = {{OOPSLA}},
-	author = {Chambers, C. and Ungar, D. and E. Lee},
+	author = {Chambers, C. and Ungar, D. and {{E.} Lee}},
 	year = {1989},
 	keywords = {self, specialization}
 },
 
- at inproceedings{hoelzle_optimizing_1991,
-	title = {Optimizing {Dynamically-Typed} {Object-Oriented} Languages With Polymorphic Inline Caches},
+ at inproceedings{holzle_optimizing_1991,
+	title = {Optimizing Dynamically-Typed Object-Oriented Languages With Polymorphic Inline Caches},
 	isbn = {3-540-54262-0},
 	url = {http://portal.acm.org/citation.cfm?id=679193&dl=ACM&coll=portal},
 	booktitle = {{ECOOP}},
-	publisher = {{Springer-Verlag}},
+	publisher = {Springer-Verlag},
 	author = {H&#246;lzle, Urs and Chambers, Craig and Ungar, David},
 	year = {1991}
 },
@@ -346,23 +470,4 @@
 	publisher = {{ACM}},
 	author = {Sullivan, Gregory T. and Bruening, Derek L. and Baron, Iris and Garnett, Timothy and Amarasinghe, Saman},
 	year = {2003}
-}
-
- at inproceedings{kotzmann_escape_2005,
-	address = {New York, {NY}, {USA}},
-	series = {{VEE} '05},
-	title = {Escape analysis in the context of dynamic compilation and deoptimization},
-	isbn = {1-59593-047-7},
-	location = {Chicago, {IL}, {USA}},
-	doi = {10.1145/1064979.1064996},
-	abstract = {In object-oriented programming languages, an object is said to escape the method or thread in which it was created if it can also be accessed by other methods or threads. Knowing which objects do not escape allows a compiler to perform aggressive {optimizations.This} paper presents a new intraprocedural and interprocedural algorithm for escape analysis in the context of dynamic compilation where the compiler has to cope with dynamic class loading and deoptimization. It was implemented for Sun Microsystems' Java {HotSpot&#8482;} client compiler and operates on an intermediate representation in {SSA} form. We introduce equi-escape sets for the efficient propagation of escape information between related objects. The analysis is used for scalar replacement of fields and synchronization removal, as well as for stack allocation of objects and fixed-sized arrays. The results of the interprocedural analysis support the compiler in inlining decisions and allow actual parameters to be allocated on the caller {stack.Under} certain circumstances, the Java {HotSpot&#8482;} {VM} is forced to stop executing a method's machine code and transfer control to the interpreter. This is called deoptimization. Since the interpreter does not know about the scalar replacement and synchronization removal performed by the compiler, the deoptimization framework was extended to reallocate and relock objects on demand.},
-	booktitle = {Proceedings of the 1st {ACM/USENIX} international conference on Virtual execution environments},
-	publisher = {{ACM}},
-	author = {Kotzmann, Thomas and M&#246;ssenb&#246;ck, Hanspeter},
-	year = {2005},
-	note = {{ACM} {ID:} 1064996},
-	keywords = {algorithms, allocation/deallocation strategies, deoptimization},
-	pages = {111&#8211;120}
-},
-
-
+}
\ No newline at end of file
diff --git a/talk/dls2012/paper.tex b/talk/dls2012/paper.tex
--- a/talk/dls2012/paper.tex
+++ b/talk/dls2012/paper.tex
@@ -208,8 +208,8 @@
 
 The work described in this paper was done in the context of the PyPy
 project.\footnote{\texttt{http://pypy.org}} PyPy is a framework for implementing
-dynamic languages efficiently~\cite{armin_rigo_pypys_2006}. When implementing a
-language with PyPy, one writes an interpreter for the language in RPython~\cite{davide_ancona_rpython:_2007}.
+dynamic languages efficiently~\cite{rigo_pypys_2006}. When implementing a
+language with PyPy, one writes an interpreter for the language in RPython~\cite{ancona_rpython:_2007}.
 RPython (``Restricted Python``) is a subset
 of Python chosen in such a way that it can be efficiently translated to a
 C-based VM by performing type inference.
@@ -672,8 +672,8 @@
 can be reused for all other appearances. RPython's optimizers can also remove
 repeated heap reads if the intermediate operations cannot have changed their
 value.\footnote{We perform a type-based alias analysis to know which
-writes can affect which reads~\cite{XXX}. In addition writes on newly allocated objects
-can never change the value of old existing ones.}
+writes can affect which reads~\cite{diwan_type-based_1998}. In addition writes
+on newly allocated objects can never change the value of old existing ones.}
 
 When that is combined with loop peeling, the single execution of the operation
 is placed in the preamble. That is, loop invariant pure operations and heap
@@ -1091,7 +1091,7 @@
 
 Other interesting interpreters that are helped greatly by this optimization are
 for example our Prolog interpreter written in
-RPython~\cite{carl_friedrich_bolz_towards_2010}. Prolog programs often contain
+RPython~\cite{bolz_towards_2010}. Prolog programs often contain
 tight
 loops that perform list processing. Furthermore we experimented with a Python library
 for writing numerical kernels doing array manipulation. The exact extent is
@@ -1105,7 +1105,18 @@
 optimizations described here achieve are not in any way new. However, we think
 that achieving them in the way described in this paper is simpler than writing
 explicit algorithms.
-\cfbolz{more explicit listing of prior work goes here}
+
+Loop invariant code motion has been part of early compilers in the 1960s and
+1970s~\cite{allen_catalogue_1971}. The approach for achieving loop invariant
+code motion is typically to perform partial redundancy elimination. The
+approach was first proposed by Morel and Renvoise~\cite{morel_global_1979}. It
+involves solving data flow problems usually involding bidirection data flow
+equations. After improvements~\cite{chow_portable_1984,
+dhamdhere_practical_1991} this approach was followed by the work of Knoop
+et.al.~\cite{knoop_lazy_1992} who cleany separated the problem into a backward
+and forward data flow analysis. Implementing partial redundancy elimination in
+compilers that use SSA form \cite{chow_new_1997} simplified the algorithms
+because no iterative data flow analysis is needed any more.
 
 As described in the introduction,
 Mike Pall pioneered the approach described in this paper.
@@ -1155,7 +1166,7 @@
 
 The current approach still has some limitations which we plan to address in the
 future. In particular loop peeling works poorly in combination with trace
-trees~\cite{andreas_gal_incremental_2006} or trace stitching~\cite{gal_trace-based_2009}.
+trees~\cite{gal_incremental_2006} or trace stitching~\cite{gal_trace-based_2009}.
 The side exits attached guards that fail often
 currently have to jump to the preamble which makes loops with several equally
 common paths less efficient than they could be.