[pypy-commit] extradoc extradoc: merge heads

arigo noreply at buildbot.pypy.org
Sat Aug 11 10:32:50 CEST 2012


Author: Armin Rigo <arigo at tunes.org>
Branch: extradoc
Changeset: r4519:66bbed73ab91
Date: 2012-08-11 10:32 +0200
http://bitbucket.org/pypy/extradoc/changeset/66bbed73ab91/

Log:	merge heads

diff --git a/talk/iwtc11/benchmarks/benchmark.sh b/talk/iwtc11/benchmarks/benchmark.sh
--- a/talk/iwtc11/benchmarks/benchmark.sh
+++ b/talk/iwtc11/benchmarks/benchmark.sh
@@ -16,6 +16,8 @@
     ./runner.py -n 5 -c "$* -lstdc++" convolution/conv3x3.cc 1000 1000
     ./runner.py -n 5 -c "$* -lstdc++" convolution/dilate3x3.cc 1000 1000
     ./runner.py -n 5 -c "$* -lstdc++" image/sobel.cc 1000 1000
+    ./runner.py -n 5 -c "$*" scimark/run_SOR.c 100 32768
+    ./runner.py -n 5 -c "$*" scimark/run_SOR.c 1000 256
     rm a.out
 else
     if [ "$1" == "python2.7" ]; then
@@ -45,4 +47,6 @@
     #$* ./runner.py $EXTRA_OPTS image/noborder.py main NoBorderImage range
     #$* ./runner.py $EXTRA_OPTS image/sobel.py main NoBorderImagePadded
     #$* ./runner.py $EXTRA_OPTS image/sobel.py main NoBorderImagePadded uint8
+    $* ./runner.py $EXTRA_OPTS scimark.py SOR 100 32768
+    $* ./runner.py $EXTRA_OPTS scimark.py SOR 1000 256
 fi
diff --git a/talk/iwtc11/benchmarks/scimark.py b/talk/iwtc11/benchmarks/scimark.py
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark.py
@@ -0,0 +1,14 @@
+from convolution.convolution import Array2D
+
+def SOR_execute(omega, G, num_iterations):
+    for p in xrange(num_iterations):
+        for y in xrange(1, G.height - 1):
+            for x in xrange(1, G.width - 1):
+                G[x, y] = omega * 0.25 * (G[x, y-1] + G[x, y+1] + G[x-1, y] + G[x+1, y]) + \
+                          (1.0 - omega) * G[x, y]
+def SOR(args):
+    n, cycles = map(int, args)
+    a = Array2D(n, n)
+    SOR_execute(1.25, a, cycles)
+    return "SOR(%d, %d)" % (n, cycles)
+
diff --git a/talk/iwtc11/benchmarks/scimark/FFT.c b/talk/iwtc11/benchmarks/scimark/FFT.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/FFT.c
@@ -0,0 +1,165 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "FFT.h"
+
+#define PI  3.1415926535897932
+
+/*-----------------------------------------------------------------------*/
+
+static int int_log2(int n);
+
+double FFT_num_flops(int N)
+{
+
+     double Nd = (double) N;
+     double logN = (double) int_log2(N);
+
+     return (5.0*Nd-2)*logN + 2*(Nd+1);
+}
+
+static int int_log2 (int n)
+{
+    int k = 1;
+    int log = 0;
+    for(/*k=1*/; k < n; k *= 2, log++);
+    if (n != (1 << log))
+    {
+      printf("FFT: Data length is not a power of 2!: %d ",n);
+      exit(1);
+    }
+    return log; 
+}
+
+static void FFT_transform_internal (int N, double *data, int direction) {
+    int n = N/2;
+    int bit = 0;
+    int logn;
+    int dual = 1;
+
+    if (n == 1) return;         /* Identity operation! */
+    logn = int_log2(n);
+
+
+    if (N == 0) return;    
+
+    /* bit reverse the input data for decimation in time algorithm */
+    FFT_bitreverse(N, data) ;
+
+    /* apply fft recursion */
+    /* this loop executed int_log2(N) times */
+    for (bit = 0; bit < logn; bit++, dual *= 2) {
+      double w_real = 1.0;
+      double w_imag = 0.0;
+      int a;
+      int b;
+
+      double theta = 2.0 * direction * PI / (2.0 * (double) dual);
+      double s = sin(theta);
+      double t = sin(theta / 2.0);
+      double s2 = 2.0 * t * t;
+
+      for (a=0, b = 0; b < n; b += 2 * dual) {
+        int i = 2*b ;
+        int j = 2*(b + dual);
+
+        double wd_real = data[j] ;
+        double wd_imag = data[j+1] ;
+          
+        data[j]   = data[i]   - wd_real;
+        data[j+1] = data[i+1] - wd_imag;
+        data[i]  += wd_real;
+        data[i+1]+= wd_imag;
+      }
+      
+      /* a = 1 .. (dual-1) */
+      for (a = 1; a < dual; a++) {
+        /* trigonometric recurrence for w-> exp(i theta) w */
+        {
+          double tmp_real = w_real - s * w_imag - s2 * w_real;
+          double tmp_imag = w_imag + s * w_real - s2 * w_imag;
+          w_real = tmp_real;
+          w_imag = tmp_imag;
+        }
+        for (b = 0; b < n; b += 2 * dual) {
+          int i = 2*(b + a);
+          int j = 2*(b + a + dual);
+
+          double z1_real = data[j];
+          double z1_imag = data[j+1];
+              
+          double wd_real = w_real * z1_real - w_imag * z1_imag;
+          double wd_imag = w_real * z1_imag + w_imag * z1_real;
+
+          data[j]   = data[i]   - wd_real;
+          data[j+1] = data[i+1] - wd_imag;
+          data[i]  += wd_real;
+          data[i+1]+= wd_imag;
+        }
+      }
+    }
+  }
+
+
+void FFT_bitreverse(int N, double *data) {
+    /* This is the Goldrader bit-reversal algorithm */
+    int n=N/2;
+    int nm1 = n-1;
+    int i=0; 
+    int j=0;
+    for (; i < nm1; i++) {
+
+      /*int ii = 2*i; */
+      int ii = i << 1;
+
+      /*int jj = 2*j; */
+      int jj = j << 1;
+
+      /* int k = n / 2 ; */
+      int k = n >> 1;
+
+      if (i < j) {
+        double tmp_real    = data[ii];
+        double tmp_imag    = data[ii+1];
+        data[ii]   = data[jj];
+        data[ii+1] = data[jj+1];
+        data[jj]   = tmp_real;
+        data[jj+1] = tmp_imag; }
+
+      while (k <= j) 
+      {
+        /*j = j - k ; */
+        j -= k;
+
+        /*k = k / 2 ;  */
+        k >>= 1 ; 
+      }
+      j += k ;
+    }
+  }
+
+
+void FFT_transform(int N, double *data)
+{
+    FFT_transform_internal(N, data, -1);
+}
+
+
+void FFT_inverse(int N, double *data)
+{
+    int n = N/2;
+    double norm = 0.0;
+    int i=0;
+    FFT_transform_internal(N, data, +1);
+
+    /* Normalize */
+
+
+    norm=1/((double) n);
+    for(i=0; i<N; i++)
+      data[i] *= norm;
+  
+}
+
+
diff --git a/talk/iwtc11/benchmarks/scimark/FFT.h b/talk/iwtc11/benchmarks/scimark/FFT.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/FFT.h
@@ -0,0 +1,5 @@
+
+void FFT_transform(int N, double *data);
+void FFT_inverse(int N, double *data);
+void FFT_bitreverse(int N, double *data);
+double FFT_num_flops(int N);
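
For reference, a minimal sketch (not part of this changeset) of how the FFT
interface declared above could be exercised; the file name and compile line are
illustrative only, and the length passed to FFT_transform is the size of the
double array, i.e. twice the number of complex points, as in kernel.c below:

    /* fft_roundtrip.c -- illustrative sketch, compile e.g. with:
       cc -O fft_roundtrip.c FFT.c -lm */
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include "FFT.h"

    int main(void)
    {
        int n = 1024;               /* complex points, must be a power of two */
        int N = 2 * n;              /* length of the interleaved re/im array  */
        double *x = (double *) malloc(N * sizeof(double));
        double *orig = (double *) malloc(N * sizeof(double));
        double max_err = 0.0;
        int i;

        for (i = 0; i < N; i++)     /* deterministic test signal */
            orig[i] = x[i] = sin(0.001 * i);

        FFT_transform(N, x);        /* forward transform, in place */
        FFT_inverse(N, x);          /* inverse transform, normalizes by 1/n */

        for (i = 0; i < N; i++) {
            double err = fabs(x[i] - orig[i]);
            if (err > max_err) max_err = err;
        }
        printf("round-trip max error: %g (approx. %.0f flops per transform)\n",
               max_err, FFT_num_flops(n));
        free(x); free(orig);
        return 0;
    }
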
diff --git a/talk/iwtc11/benchmarks/scimark/LU.c b/talk/iwtc11/benchmarks/scimark/LU.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/LU.c
@@ -0,0 +1,102 @@
+#include <math.h>
+#include "LU.h"
+
+double LU_num_flops(int N)
+{
+        /* roughly 2/3*N^3 */
+
+    double Nd = (double) N;
+
+    return (2.0 * Nd *Nd *Nd/ 3.0);
+}
+
+
+void LU_copy_matrix(int M, int N, double **lu, double **A)
+{
+    int i;
+    int j;
+
+    for (i=0; i<M; i++)
+        for (j=0; j<N; j++)
+            lu[i][j] = A[i][j];
+}
+
+
+int LU_factor(int M, int N, double **A,  int *pivot)
+{
+ 
+
+    int minMN =  M < N ? M : N;
+    int j=0;
+
+    for (j=0; j<minMN; j++)
+    {
+        /* find pivot in column j and  test for singularity. */
+
+        int jp=j;
+        int i;
+        
+        double t = fabs(A[j][j]);
+        for (i=j+1; i<M; i++)
+        {
+            double ab = fabs(A[i][j]);
+            if ( ab > t)
+            {
+                jp = i;
+                t = ab;
+            }
+        }
+        
+        pivot[j] = jp;
+
+        /* jp now has the index of maximum element  */
+        /* of column j, below the diagonal          */
+
+        if ( A[jp][j] == 0 )                 
+            return 1;       /* factorization failed because of zero pivot */
+
+
+        if (jp != j)
+        {
+            /* swap rows j and jp */
+            double *tA = A[j];
+            A[j] = A[jp];
+            A[jp] = tA;
+        }
+
+        if (j<M-1)                /* compute elements j+1:M of jth column  */
+        {
+            /* note A(j,j), was A(jp,p) previously which was */
+            /* guaranteed not to be zero (Label #1)          */
+
+            double recp =  1.0 / A[j][j];
+            int k;
+            for (k=j+1; k<M; k++)
+                A[k][j] *= recp;
+        }
+
+
+        if (j < minMN-1)
+        {
+            /* rank-1 update to trailing submatrix:   E = E - x*y; */
+            /* E is the region A(j+1:M, j+1:N) */
+            /* x is the column vector A(j+1:M,j) */
+            /* y is row vector A(j,j+1:N)        */
+
+            int ii;
+            for (ii=j+1; ii<M; ii++)
+            {
+                double *Aii = A[ii];
+                double *Aj = A[j];
+                double AiiJ = Aii[j];
+                int jj;
+                for (jj=j+1; jj<N; jj++)
+                  Aii[jj] -= AiiJ * Aj[jj];
+
+            }
+        }
+    }
+
+    return 0;
+}
+
diff --git a/talk/iwtc11/benchmarks/scimark/LU.h b/talk/iwtc11/benchmarks/scimark/LU.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/LU.h
@@ -0,0 +1,9 @@
+#ifndef LU_H
+#define LU_H
+
+double LU_num_flops(int N);
+void LU_copy_matrix(int M, int N, double **lu, double **A);
+int LU_factor(int M, int N, double **A, int *pivot);
+
+
+#endif
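
For reference, a minimal sketch (not part of this changeset) of calling
LU_factor on a small matrix; the file name, compile line and test matrix are
made up for illustration:

    /* lu_demo.c -- illustrative sketch, compile e.g. with:
       cc -O lu_demo.c LU.c -lm */
    #include <stdio.h>
    #include <stdlib.h>
    #include "LU.h"

    int main(void)
    {
        int N = 3, i, j;
        double data[3][3] = { {4.0, 3.0, 2.0},
                              {2.0, 3.0, 1.0},
                              {6.0, 1.0, 5.0} };
        double **A = (double **) malloc(N * sizeof(double *));
        int *pivot = (int *) malloc(N * sizeof(int));

        for (i = 0; i < N; i++) {
            A[i] = (double *) malloc(N * sizeof(double));
            for (j = 0; j < N; j++)
                A[i][j] = data[i][j];
        }

        /* factor in place; a nonzero return means a zero pivot was hit */
        if (LU_factor(N, N, A, pivot) != 0) {
            fprintf(stderr, "singular matrix\n");
            return 1;
        }

        /* A now holds the multipliers of L below the diagonal and U on
           and above it; pivot records the row interchanges */
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++)
                printf("%8.3f ", A[i][j]);
            printf("  pivot[%d] = %d\n", i, pivot[i]);
        }
        printf("approx. flop count for N=%d: %.0f\n", N, LU_num_flops(N));

        for (i = 0; i < N; i++) free(A[i]);
        free(A); free(pivot);
        return 0;
    }
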
diff --git a/talk/iwtc11/benchmarks/scimark/Makefile b/talk/iwtc11/benchmarks/scimark/Makefile
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/Makefile
@@ -0,0 +1,18 @@
+.SUFFIXES: .c .o
+
+.c.o:
+	$(CC) $(CFLAGS) -c $<
+
+all: scimark2 
+
+CC = cc
+LDFLAGS = 
+
+OBJS = FFT.o kernel.o Stopwatch.o Random.o SOR.o SparseCompRow.o \
+	array.o MonteCarlo.o LU.o 
+
+scimark2 : scimark2.o $(OBJS)
+	$(CC) $(CFLAGS) -o scimark2 scimark2.o $(OBJS) $(LDFLAGS) -lm
+
+clean:
+	rm $(OBJS) scimark2
diff --git a/talk/iwtc11/benchmarks/scimark/MonteCarlo.c b/talk/iwtc11/benchmarks/scimark/MonteCarlo.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/MonteCarlo.c
@@ -0,0 +1,70 @@
+#include "Random.h"
+
+/**
+ Estimate Pi by approximating the area of a circle.
+
+ How: generate N random numbers in the unit square, (0,0) to (1,1)
+ and see how many are within a radius of 1 or less, i.e.
+ <pre>  
+
+ sqrt(x^2 + y^2) < r
+
+ </pre>
+  since the radius is 1.0, we can square both sides
+  and avoid a sqrt() computation:
+  <pre>
+
+    x^2 + y^2 <= 1.0
+
+  </pre>
+  this area under the curve is (Pi * r^2)/ 4.0,
+  and the area of the unit square is 1.0,
+  so Pi can be approximated by 
+  <pre>
+                # points with x^2+y^2 < 1
+     Pi =~      --------------------------  * 4.0
+                     total # points
+
+  </pre>
+
+*/
+
+static const int SEED = 113;
+
+
+    double MonteCarlo_num_flops(int Num_samples)
+    {
+        /* 3 flops in x^2+y^2 and 1 flop in random routine */
+
+        return ((double) Num_samples)* 4.0;
+
+    }
+
+    
+
+    double MonteCarlo_integrate(int Num_samples)
+    {
+
+
+        Random R = new_Random_seed(SEED);
+
+
+        int under_curve = 0;
+        int count;
+
+        for (count=0; count<Num_samples; count++)
+        {
+            double x= Random_nextDouble(R);
+            double y= Random_nextDouble(R);
+
+            if ( x*x + y*y <= 1.0)
+                 under_curve ++;
+            
+        }
+
+        Random_delete(R);
+
+        return ((double) under_curve / Num_samples) * 4.0;
+    }
+
+
diff --git a/talk/iwtc11/benchmarks/scimark/MonteCarlo.h b/talk/iwtc11/benchmarks/scimark/MonteCarlo.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/MonteCarlo.h
@@ -0,0 +1,2 @@
+double MonteCarlo_integrate(int Num_samples);
+double MonteCarlo_num_flops(int Num_samples);
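
For reference, a minimal sketch (not part of this changeset) of using the Monte
Carlo kernel directly to approximate pi; the file name and compile line are
illustrative only:

    /* mc_pi.c -- illustrative sketch, compile e.g. with:
       cc -O mc_pi.c MonteCarlo.c Random.c */
    #include <stdio.h>
    #include "MonteCarlo.h"

    int main(void)
    {
        int samples;
        /* the estimate converges (slowly) towards pi as the sample count
           grows; each call reseeds the RNG, so results are deterministic */
        for (samples = 1000; samples <= 1000000; samples *= 10)
            printf("%8d samples: pi ~= %f  (approx. %.0f flops)\n",
                   samples, MonteCarlo_integrate(samples),
                   MonteCarlo_num_flops(samples));
        return 0;
    }
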
diff --git a/talk/iwtc11/benchmarks/scimark/README b/talk/iwtc11/benchmarks/scimark/README
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/README
@@ -0,0 +1,51 @@
+		SciMark2 (C version)
+
+This is an ANSI C version of the SciMark2 benchmark,
+translated from the original Java sources.  The intent
+in making this benchmark available in C is mainly for
+performance comparisons.   For more information
+about SciMark, see http://math.nist.gov/scimark.
+
+Results of this benchmark can be sent to pozo at nist.gov.
+
+The program is split up into the main driver (scimark2.c) and
+kernel routines.  A sample makefile is included; 
+however, one could simply write 
+
+
+> cc -o scimark2  -O *.c
+
+and then run
+
+> scimark2
+
+This produces an output similar to
+
+
+**                                                              **
+** SciMark2 Numeric Benchmark, see http://math.nist.gov/scimark **
+** for details. (Results can be submitted to pozo at nist.gov)     **
+**                                                              **
+Using       2.00 seconds min time per kernel.
+Composite Score:           65.56
+FFT             Mflops:    63.38    (N=1024)
+SOR             Mflops:   124.80    (100 x 100)
+MonteCarlo:     Mflops:    16.05
+Sparse matmult  Mflops:    59.15    (N=1000, nz=5000)
+LU              Mflops:    64.40    (M=100, N=100)
+0:29.62 Elapsed, 29.620 user sec, 0.010 sys sec, 100.0% utilization.
+
+
+
+The first SciMark number reported is the composite score, followed
+by an approximate Mflop rate for each kernel.
+
+
+To run the "large" version of this benchmark (with data structures
+that typically do not fit in cache) use
+
+>scimark2 -large
+
+
+------------------------------------------------------------------
+
diff --git a/talk/iwtc11/benchmarks/scimark/Random.c b/talk/iwtc11/benchmarks/scimark/Random.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/Random.c
@@ -0,0 +1,173 @@
+
+
+#include <stdlib.h>
+
+#include "Random.h"
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+
+  /* static const int mdig = 32; */
+#define MDIG 32
+
+  /* static const int one = 1; */
+#define ONE 1
+
+  static const int m1 = (ONE << (MDIG-2)) + ((ONE << (MDIG-2) )-ONE);
+  static const int m2 = ONE << MDIG/2;
+
+  /* For mdig = 32 : m1 =          2147483647, m2 =      65536
+     For mdig = 64 : m1 = 9223372036854775807, m2 = 4294967296 
+  */
+
+                                /* move to initialize() because  */
+                                /* compiler could not resolve as */
+                                /*   a constant.                 */
+
+  static /*const*/ double dm1;  /*  = 1.0 / (double) m1; */
+
+
+/* private methods (defined below, but not in Random.h) */
+
+static void initialize(Random R, int seed);
+
+Random new_Random_seed(int seed)
+{
+    Random R = (Random) malloc(sizeof(Random_struct));
+
+    initialize(R, seed);
+    R->left = 0.0;
+    R->right = 1.0;
+    R->width = 1.0;
+    R->haveRange = 0 /*false*/;
+
+    return R;
+}
+
+Random new_Random(int seed, double left, double right) 
+{
+    Random R = (Random) malloc(sizeof(Random_struct));
+
+    initialize(R, seed);
+    R->left = left;
+    R->right = right;
+    R->width = right - left;
+    R->haveRange = 1;          /* true */
+
+    return R;
+}
+
+void Random_delete(Random R)
+{
+    free(R);
+}
+
+
+
+/* Returns the next random number in the sequence.  */
+
+double Random_nextDouble(Random R) 
+{
+    int k;
+
+    int I = R->i;
+    int J = R->j;
+    int *m = R->m;
+
+    k = m[I] - m[J];
+    if (k < 0) k += m1;
+    R->m[J] = k;
+
+    if (I == 0) 
+        I = 16;
+    else I--;
+    R->i = I;
+
+    if (J == 0) 
+        J = 16 ;
+    else J--;
+    R->j = J;
+
+    if (R->haveRange) 
+        return  R->left +  dm1 * (double) k * R->width;
+    else
+        return dm1 * (double) k;
+
+} 
+
+
+
+
+/*--------------------------------------------------------------------
+                           PRIVATE METHODS
+  ----------------------------------------------------------------- */
+
+static void initialize(Random R, int seed) 
+{
+
+    int jseed, k0, k1, j0, j1, iloop;
+
+    dm1  = 1.0 / (double) m1; 
+
+    R->seed = seed;
+
+    if (seed < 0 ) seed = -seed;            /* seed = abs(seed) */  
+    jseed = (seed < m1 ? seed : m1);        /* jseed = min(seed, m1) */
+    if (jseed % 2 == 0) --jseed;
+    k0 = 9069 % m2;
+    k1 = 9069 / m2;
+    j0 = jseed % m2;
+    j1 = jseed / m2;
+    for (iloop = 0; iloop < 17; ++iloop) 
+    {
+        jseed = j0 * k0;
+        j1 = (jseed / m2 + j0 * k1 + j1 * k0) % (m2 / 2);
+        j0 = jseed % m2;
+        R->m[iloop] = j0 + m2 * j1;
+    }
+    R->i = 4;
+    R->j = 16;
+
+}
+
+double *RandomVector(int N, Random R)
+{
+    int i;
+    double *x = (double *) malloc(sizeof(double)*N);
+
+    for (i=0; i<N; i++)
+        x[i] = Random_nextDouble(R);
+
+    return x;
+}
+
+
+double **RandomMatrix(int M, int N, Random R)
+{
+    int i;
+    int j;
+
+    /* allocate matrix */
+
+    double **A = (double **) malloc(sizeof(double*)*M);
+
+    if (A == NULL) return NULL;
+
+    for (i=0; i<M; i++)
+    {
+        A[i] = (double *) malloc(sizeof(double)*N);
+        if (A[i] == NULL) 
+        {
+            free(A);
+            return NULL;
+        }
+        for (j=0; j<N; j++)
+            A[i][j] = Random_nextDouble(R);
+    }
+    return A;
+}
+
+
+
diff --git a/talk/iwtc11/benchmarks/scimark/Random.h b/talk/iwtc11/benchmarks/scimark/Random.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/Random.h
@@ -0,0 +1,18 @@
+typedef struct
+{
+  int m[17];                        
+  int seed;                             
+  int i;                                /* originally = 4 */
+  int j;                                /* originally =  16 */
+  int /*boolean*/ haveRange;            /* = false; */
+  double left;                          /*= 0.0; */
+  double right;                         /* = 1.0; */
+  double width;                         /* = 1.0; */
+}
+Random_struct, *Random;
+
+Random new_Random_seed(int seed);
+double Random_nextDouble(Random R);
+void Random_delete(Random R);
+double *RandomVector(int N, Random R);
+double **RandomMatrix(int M, int N, Random R);
diff --git a/talk/iwtc11/benchmarks/scimark/SOR.c b/talk/iwtc11/benchmarks/scimark/SOR.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/SOR.c
@@ -0,0 +1,43 @@
+#include "SOR.h"
+
+    double SOR_num_flops(int M, int N, int num_iterations)
+    {
+        double Md = (double) M;
+        double Nd = (double) N;
+        double num_iterD = (double) num_iterations;
+
+        return (Md-1)*(Nd-1)*num_iterD*6.0;
+    }
+
+    void SOR_execute(int M, int N, double omega, double **G, int 
+            num_iterations)
+    {
+
+        double omega_over_four = omega * 0.25;
+        double one_minus_omega = 1.0 - omega;
+
+        /* update interior points */
+
+        int Mm1 = M-1;
+        int Nm1 = N-1; 
+        int p;
+        int i;
+        int j;
+        double *Gi;
+        double *Gim1;
+        double *Gip1;
+
+        for (p=0; p<num_iterations; p++)
+        {
+            for (i=1; i<Mm1; i++)
+            {
+                Gi = G[i];
+                Gim1 = G[i-1];
+                Gip1 = G[i+1];
+                for (j=1; j<Nm1; j++)
+                    Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] 
+                                + Gi[j+1]) + one_minus_omega * Gi[j];
+            }
+        }
+    }
+            
diff --git a/talk/iwtc11/benchmarks/scimark/SOR.h b/talk/iwtc11/benchmarks/scimark/SOR.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/SOR.h
@@ -0,0 +1,4 @@
+
+double SOR_num_flops(int M, int N, int num_iterations);
+void SOR_execute(int M, int N,double omega, double **G, int num_iterations);
+
diff --git a/talk/iwtc11/benchmarks/scimark/SparseCompRow.c b/talk/iwtc11/benchmarks/scimark/SparseCompRow.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/SparseCompRow.c
@@ -0,0 +1,43 @@
+    /* multiple iterations used to make kernel have roughly
+        same granularity as other Scimark kernels. */
+
+    double SparseCompRow_num_flops(int N, int nz, int num_iterations)
+    {
+        /* Note that if nz does not divide N evenly, then the
+           actual number of nonzeros used is adjusted slightly.
+        */
+        int actual_nz = (nz/N) * N;
+        return ((double)actual_nz) * 2.0 * ((double) num_iterations);
+    }
+
+
+    /* computes a matrix-vector multiply with a sparse matrix
+        held in compressed-row format.  If the size of the matrix
+        is MxN with nz nonzeros, then val[] holds the nz nonzeros,
+        with its ith entry in column col[i].  The integer vector row[]
+        is of size M+1 and row[i] points to the beginning of the
+        ith row in col[].
+    */
+
+    void SparseCompRow_matmult( int M, double *y, double *val, int *row,
+        int *col, double *x, int NUM_ITERATIONS)
+    {
+        int reps;
+        int r;
+        int i;
+
+        for (reps=0; reps<NUM_ITERATIONS; reps++)
+        {
+        
+            for (r=0; r<M; r++)
+            {
+                double sum = 0.0; 
+                int rowR = row[r];
+                int rowRp1 = row[r+1];
+                for (i=rowR; i<rowRp1; i++)
+                    sum += x[ col[i] ] * val[i];
+                y[r] = sum;
+            }
+        }
+    }
+
diff --git a/talk/iwtc11/benchmarks/scimark/SparseCompRow.h b/talk/iwtc11/benchmarks/scimark/SparseCompRow.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/SparseCompRow.h
@@ -0,0 +1,10 @@
+
+#ifndef SPARSE_COMPROW_H
+#define SPARSE_COMPROW_H
+
+double SparseCompRow_num_flops(int N, int nz, int num_iterations);
+
+void SparseCompRow_matmult( int M, double *y, double *val, int *row,
+        int *col, double *x, int NUM_ITERATIONS);
+
+#endif
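
For reference, a minimal sketch (not part of this changeset) showing the
compressed-row layout expected by SparseCompRow_matmult on a tiny 3x3 matrix;
the file name and compile line are illustrative only:

    /* spmv_demo.c -- illustrative sketch, compile e.g. with:
       cc -O spmv_demo.c SparseCompRow.c */
    #include <stdio.h>
    #include "SparseCompRow.h"

    int main(void)
    {
        /* the matrix  [ 1 0 2 ]
                       [ 0 3 0 ]
                       [ 4 0 5 ]
           val holds the nonzeros row by row, col their column indices,
           and row[r] the offset of row r's first entry (M+1 entries). */
        double val[] = {1.0, 2.0, 3.0, 4.0, 5.0};
        int    col[] = {0, 2, 1, 0, 2};
        int    row[] = {0, 2, 3, 5};
        double x[]   = {1.0, 1.0, 1.0};
        double y[3];
        int r;

        SparseCompRow_matmult(3, y, val, row, col, x, 1);  /* one iteration */

        for (r = 0; r < 3; r++)
            printf("y[%d] = %g\n", r, y[r]);   /* expected: 3 3 9 */
        return 0;
    }
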
diff --git a/talk/iwtc11/benchmarks/scimark/Stopwatch.c b/talk/iwtc11/benchmarks/scimark/Stopwatch.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/Stopwatch.c
@@ -0,0 +1,82 @@
+#include <stdlib.h>
+#include "Stopwatch.h"
+
+double seconds()
+{
+        return ((double) clock()) / (double) CLOCKS_PER_SEC; 
+}
+
+void Stopwtach_reset(Stopwatch Q)
+{
+    Q->running = 0;         /* false */
+    Q->last_time = 0.0;
+    Q->total= 0.0;
+}
+
+
+Stopwatch new_Stopwatch(void)
+{
+    Stopwatch S = (Stopwatch) malloc(sizeof(Stopwatch_struct));
+    if (S == NULL)
+        return NULL;
+
+    Stopwtach_reset(S);
+    return S;
+}
+
+void Stopwatch_delete(Stopwatch S)
+{
+    if (S != NULL)
+        free(S);
+}
+
+
+/* Start resets the timer to 0.0; use resume for continued total */
+
+void Stopwatch_start(Stopwatch Q)
+{
+    if (! (Q->running)  )
+    {
+        Q->running = 1;  /* true */
+        Q->total = 0.0;
+        Q->last_time = seconds();
+    }
+}
+   
+/** 
+    Resume timing, after stopping.  (Does not wipe out
+        accumulated times.)
+
+*/
+
+void Stopwatch_resume(Stopwatch Q)
+{
+    if (!(Q->running))
+    {
+        Q-> last_time = seconds(); 
+        Q->running = 1;  /*true*/
+    }
+}
+   
+void Stopwatch_stop(Stopwatch Q)  
+{ 
+    if (Q->running) 
+    { 
+        Q->total += seconds() - Q->last_time; 
+        Q->running = 0;  /* false */
+    }
+}
+  
+ 
+double Stopwatch_read(Stopwatch Q)
+{  
+    
+    if (Q->running) 
+    {
+        double t = seconds();
+        Q->total += t - Q->last_time;
+        Q->last_time = t;
+    }
+    return Q->total;
+}
+        
diff --git a/talk/iwtc11/benchmarks/scimark/Stopwatch.h b/talk/iwtc11/benchmarks/scimark/Stopwatch.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/Stopwatch.h
@@ -0,0 +1,23 @@
+
+#include <time.h>
+
+typedef struct{
+    int running;        /* boolean */
+    double last_time;
+    double total;
+
+} *Stopwatch, Stopwatch_struct;
+
+
+
+double seconds();
+
+void Stopwtach_reset(Stopwatch Q);
+
+Stopwatch new_Stopwatch(void);
+void Stopwatch_delete(Stopwatch S);
+void Stopwatch_start(Stopwatch Q);
+void Stopwatch_resume(Stopwatch Q);
+void Stopwatch_stop(Stopwatch Q);
+double Stopwatch_read(Stopwatch Q);
+        
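
For reference, a minimal sketch (not part of this changeset) of the Stopwatch
interface declared above, timing a busy loop; the file name and compile line
are illustrative only:

    /* stopwatch_demo.c -- illustrative sketch, compile e.g. with:
       cc -O stopwatch_demo.c Stopwatch.c */
    #include <stdio.h>
    #include "Stopwatch.h"

    int main(void)
    {
        Stopwatch Q = new_Stopwatch();
        volatile double s = 0.0;   /* volatile keeps the loop from being optimized away */
        int i;

        Stopwatch_start(Q);
        for (i = 0; i < 10000000; i++)
            s += i * 0.5;
        Stopwatch_stop(Q);

        printf("busy loop took %f seconds (sum=%f)\n", Stopwatch_read(Q), s);
        Stopwatch_delete(Q);
        return 0;
    }
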
diff --git a/talk/iwtc11/benchmarks/scimark/array.c b/talk/iwtc11/benchmarks/scimark/array.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/array.c
@@ -0,0 +1,77 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "array.h"
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+
+double** new_Array2D_double(int M, int N)
+{
+    int i=0;
+    int failed = 0;
+
+    double **A = (double**) malloc(sizeof(double*)*M);
+    if (A == NULL)
+        return NULL;
+
+    for (i=0; i<M; i++)
+    {
+        A[i] = (double*) malloc(N * sizeof(double));
+        if (A[i] == NULL)
+        {
+            failed = 1;
+            break;
+        }
+    }
+
+    /* if we didn't successfully allocate all rows of A      */
+    /* clean up any allocated memory (i.e. go back and free  */
+    /* previous rows) and return NULL                        */
+
+    if (failed)
+    {
+        i--;
+        for (; i>=0; i--)
+            free(A[i]);
+        free(A);
+        return NULL;
+    }
+    else
+        return A;
+}
+void Array2D_double_delete(int M, int N, double **A)
+{
+    int i;
+    if (A == NULL) return;
+
+    for (i=0; i<M; i++)
+        free(A[i]);
+
+    free(A);
+}
+
+
+  void Array2D_double_copy(int M, int N, double **B, double **A)
+  {
+
+        int remainder = N & 3;       /* N mod 4; */
+        int i=0;
+        int j=0;
+
+        for (i=0; i<M; i++)
+        {
+            double *Bi = B[i];
+            double *Ai = A[i];
+            for (j=0; j<remainder; j++)
+                Bi[j] = Ai[j];
+            for (j=remainder; j<N; j+=4)
+            {
+                Bi[j] = Ai[j];
+                Bi[j+1] = Ai[j+1];
+                Bi[j+2] = Ai[j+2];
+                Bi[j+3] = Ai[j+3];
+            }
+        }
+  }
diff --git a/talk/iwtc11/benchmarks/scimark/array.h b/talk/iwtc11/benchmarks/scimark/array.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/array.h
@@ -0,0 +1,9 @@
+
+#ifndef ARRAY_H
+#define ARRAY_H
+
+double **new_Array2D_double(int M, int N);
+void Array2D_double_delete(int M, int N, double **A);
+void Array2D_double_copy(int M, int N, double **B, double **A);
+
+#endif
diff --git a/talk/iwtc11/benchmarks/scimark/constants.h b/talk/iwtc11/benchmarks/scimark/constants.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/constants.h
@@ -0,0 +1,35 @@
+#ifndef CONSTANTS_H_
+#define CONSTANTS_H_
+
+     const  double RESOLUTION_DEFAULT = 2.0;  /* secs (normally 2.0) */
+     const  int RANDOM_SEED = 101010;
+
+    /* default: small (cache-contained) problem sizes */
+
+     const  int FFT_SIZE = 1024;  /* must be a power of two */
+     const  int SOR_SIZE =100; /* NxN grid */
+     const  int SPARSE_SIZE_M = 1000;
+     const  int SPARSE_SIZE_nz = 5000;
+     const  int LU_SIZE = 100;
+
+    /* large (out-of-cache) problem sizes */
+
+     const  int LG_FFT_SIZE = 1048576;  /* must be a power of two */
+     const  int LG_SOR_SIZE =1000;  /*  NxN grid  */
+     const  int LG_SPARSE_SIZE_M = 100000;
+     const  int LG_SPARSE_SIZE_nz =1000000;
+     const  int LG_LU_SIZE = 1000;
+
+    /* tiny problem sizes (used mainly to preload network classes        */
+    /*                     for applet, so that network download times    */
+    /*                     are factored out of benchmark.)               */
+    /*                                                                   */
+     const  int TINY_FFT_SIZE = 16;  /* must be a power of two */
+     const  int TINY_SOR_SIZE =10; /* NxN grid */
+     const  int TINY_SPARSE_SIZE_M = 10;
+     const  int TINY_SPARSE_SIZE_N = 10;
+     const  int TINY_SPARSE_SIZE_nz = 50;
+     const  int TINY_LU_SIZE = 10;
+
+#endif
+
diff --git a/talk/iwtc11/benchmarks/scimark/kernel.c b/talk/iwtc11/benchmarks/scimark/kernel.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/kernel.c
@@ -0,0 +1,231 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "LU.h"
+#include "FFT.h"
+#include "SOR.h"
+#include "MonteCarlo.h"
+#include "LU.h"
+#include "Random.h" 
+#include "Stopwatch.h"  
+#include "SparseCompRow.h"
+#include "array.h"
+
+
+    double kernel_measureFFT(int N, double mintime, Random R)
+    {
+        /* initialize FFT data as complex (N real/img pairs) */
+
+        int twoN = 2*N;
+        double *x = RandomVector(twoN, R);
+        long cycles = 1;
+        Stopwatch Q = new_Stopwatch();
+        int i=0;
+        double result = 0.0;
+
+        while(1)
+        {
+            Stopwatch_start(Q);
+            for (i=0; i<cycles; i++)
+            {
+                FFT_transform(twoN, x);     /* forward transform */
+                FFT_inverse(twoN, x);       /* backward transform */
+            }
+            Stopwatch_stop(Q);
+            if (Stopwatch_read(Q) >= mintime)
+                break;
+
+            cycles *= 2;
+
+        }
+        /* approx Mflops */
+
+        result = FFT_num_flops(N)*cycles/ Stopwatch_read(Q) * 1.0e-6;
+        Stopwatch_delete(Q);
+        free(x);
+        return result;
+    }
+
+    double kernel_measureSOR(int N, double min_time, Random R)
+    {
+        double **G = RandomMatrix(N, N, R);
+        double result = 0.0;
+
+        Stopwatch Q = new_Stopwatch();
+        int cycles=1;
+        while(1)
+        {
+            Stopwatch_start(Q);
+            SOR_execute(N, N, 1.25, G, cycles);
+            Stopwatch_stop(Q);
+
+            if (Stopwatch_read(Q) >= min_time) break;
+
+            cycles *= 2;
+        }
+        /* approx Mflops */
+
+        printf("SOR cycles: %d, runtime: %f\n", cycles, Stopwatch_read(Q));
+        result = SOR_num_flops(N, N, cycles) / Stopwatch_read(Q) * 1.0e-6;
+        Stopwatch_delete(Q);
+        Array2D_double_delete(N, N, G);
+        return result;
+
+    }
+
+
+
+    double kernel_measureMonteCarlo(double min_time, Random R)
+    {
+        double result = 0.0;
+        Stopwatch Q = new_Stopwatch();
+
+        int cycles=1;
+        while(1)
+        {
+            Stopwatch_start(Q);
+            MonteCarlo_integrate(cycles);
+            Stopwatch_stop(Q);
+            if (Stopwatch_read(Q) >= min_time) break;
+
+            cycles *= 2;
+        }
+        /* approx Mflops */
+        result = MonteCarlo_num_flops(cycles) / Stopwatch_read(Q) * 1.0e-6;
+        Stopwatch_delete(Q);
+        return result;
+    }
+
+
+    double kernel_measureSparseMatMult(int N, int nz, 
+            double min_time, Random R)
+    {
+        /* initialize vector multipliers and storage for result */
+        /* y = A*y;  */
+
+        double *x = RandomVector(N, R);
+        double *y = (double*) malloc(sizeof(double)*N);
+
+        double result = 0.0;
+
+#if 0
+        // initialize square sparse matrix
+        //
+        // for this test, we create a sparse matrix with M/nz nonzeros
+        // per row, spaced out evenly between the beginning of the
+        // row and the main diagonal.  Thus, the resulting pattern looks
+        // like
+        //             +-----------------+
+        //             +*                +
+        //             +***              +
+        //             +* * *            +
+        //             +** *  *          +
+        //             +**  *   *        +
+        //             +* *   *   *      +
+        //             +*  *   *    *    +
+        //             +*   *    *    *  + 
+        //             +-----------------+
+        //
+        // (as best reproducible with integer arithmetic)
+        // Note that the first nr rows will have elements past
+        // the diagonal.
+#endif
+
+        int nr = nz/N;      /* average number of nonzeros per row  */
+        int anz = nr *N;    /* _actual_ number of nonzeros         */
+
+            
+        double *val = RandomVector(anz, R);
+        int *col = (int*) malloc(sizeof(int)*nz);
+        int *row = (int*) malloc(sizeof(int)*(N+1));
+        int r=0;
+        int cycles=1;
+
+        Stopwatch Q = new_Stopwatch();
+
+        row[0] = 0; 
+        for (r=0; r<N; r++)
+        {
+            /* initialize elements for row r */
+
+            int rowr = row[r];
+            int step = r/ nr;
+            int i=0;
+
+            row[r+1] = rowr + nr;
+            if (step < 1) step = 1;   /* take at least unit steps */
+
+
+            for (i=0; i<nr; i++)
+                col[rowr+i] = i*step;
+                
+        }
+
+
+        while(1)
+        {
+            Stopwatch_start(Q);
+            SparseCompRow_matmult(N, y, val, row, col, x, cycles);
+            Stopwatch_stop(Q);
+            if (Stopwatch_read(Q) >= min_time) break;
+
+            cycles *= 2;
+        }
+        /* approx Mflops */
+        result = SparseCompRow_num_flops(N, nz, cycles) / 
+                        Stopwatch_read(Q) * 1.0e-6;
+
+        Stopwatch_delete(Q);
+        free(row);
+        free(col);
+        free(val);
+        free(y);
+        free(x);
+
+        return result;
+    }
+
+
+    double kernel_measureLU(int N, double min_time, Random R)
+    {
+
+        double **A = NULL;
+        double **lu = NULL; 
+        int *pivot = NULL;
+
+    
+
+        Stopwatch Q = new_Stopwatch();
+        double result = 0.0;
+        int i=0;
+        int cycles=1;
+
+        if ((A = RandomMatrix(N, N,  R)) == NULL) exit(1);
+        if ((lu = new_Array2D_double(N, N)) == NULL) exit(1);
+        if ((pivot = (int *) malloc(N * sizeof(int))) == NULL) exit(1);
+
+
+        while(1)
+        {
+            Stopwatch_start(Q);
+            for (i=0; i<cycles; i++)
+            {
+                Array2D_double_copy(N, N, lu, A);
+                LU_factor(N, N, lu, pivot);
+            }
+            Stopwatch_stop(Q);
+            if (Stopwatch_read(Q) >= min_time) break;
+
+            cycles *= 2;
+        }
+        /* approx Mflops */
+        result = LU_num_flops(N) * cycles / Stopwatch_read(Q) * 1.0e-6;
+
+        Stopwatch_delete(Q);
+        free(pivot); 
+        Array2D_double_delete(N, N, lu); 
+        Array2D_double_delete(N, N, A);
+
+        return result;
+
+    }
+
diff --git a/talk/iwtc11/benchmarks/scimark/kernel.h b/talk/iwtc11/benchmarks/scimark/kernel.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/kernel.h
@@ -0,0 +1,11 @@
+#ifndef KERNEL_H
+#define KERNEL_H
+
+double kernel_measureFFT( int FFT_size, double min_time, Random R);
+double kernel_measureSOR( int SOR_size, double min_time, Random R);
+double kernel_measureMonteCarlo( double min_time, Random R);
+double kernel_measureSparseMatMult(int Sparse_size_N,
+    int Sparse_size_nz, double min_time, Random R);
+double kernel_measureLU( int LU_size, double min_time, Random R);
+
+#endif
diff --git a/talk/iwtc11/benchmarks/scimark/run_SOR.c b/talk/iwtc11/benchmarks/scimark/run_SOR.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/run_SOR.c
@@ -0,0 +1,17 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "SOR.c"
+
+int main(int ac, char **av) {
+    assert(ac==3);
+    int N = atoi(av[1]);
+    int cycles = atoi(av[2]);
+    double **G = malloc(sizeof(double*)*N);
+    int i;
+    for (i=0; i<N; i++) G[i] = calloc(N, sizeof(double));  /* zero-initialized grid */
+    SOR_execute(N, N, 1.25, G, cycles);
+    fprintf(stderr, "SOR(%d, %d):  ", N, cycles);
+    return 0;
+}
diff --git a/talk/iwtc11/benchmarks/scimark/scimark2.c b/talk/iwtc11/benchmarks/scimark/scimark2.c
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/scimark2.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "Random.h"
+#include "kernel.h"
+#include "constants.h"
+
+void print_banner(void);
+
+int main(int argc, char *argv[])
+{
+        /* default to the (small) cache-contained version */
+
+        double min_time = RESOLUTION_DEFAULT;
+
+        int FFT_size = FFT_SIZE;
+        int SOR_size =  SOR_SIZE;
+        int Sparse_size_M = SPARSE_SIZE_M;
+        int Sparse_size_nz = SPARSE_SIZE_nz;
+        int LU_size = LU_SIZE;
+
+
+        /* run the benchmark */
+
+        double res[6] = {0.0};
+        Random R = new_Random_seed(RANDOM_SEED);
+
+
+        if (argc > 1)
+        {
+			int current_arg = 1;
+
+			if (strcmp(argv[1], "-help")==0  ||
+					strcmp(argv[1], "-h") == 0)
+			{
+				fprintf(stderr, "Usage: [-large] [minimum_time]\n");
+				exit(0);
+			}
+
+			if (strcmp(argv[1], "-large")==0)
+			{
+				FFT_size = LG_FFT_SIZE;
+				SOR_size = LG_SOR_SIZE;
+				Sparse_size_M = LG_SPARSE_SIZE_M;
+				Sparse_size_nz = LG_SPARSE_SIZE_nz;
+				LU_size = LG_LU_SIZE;
+
+				current_arg++;
+			}
+
+			if (current_arg < argc)
+			{
+				min_time = atof(argv[current_arg]);
+			}
+			
+        }
+
+	
+		print_banner();
+		printf("Using %10.2f seconds min time per kernel.\n", min_time);
+
+        res[1] = kernel_measureFFT( FFT_size, min_time, R);   
+        res[2] = kernel_measureSOR( SOR_size, min_time, R);   
+        res[3] = kernel_measureMonteCarlo(min_time, R); 
+        res[4] = kernel_measureSparseMatMult( Sparse_size_M, 
+                Sparse_size_nz, min_time, R);           
+        res[5] = kernel_measureLU( LU_size, min_time, R);  
+
+
+
+        res[0] = (res[1] + res[2] + res[3] + res[4] + res[5]) / 5;
+
+        /* print out results  */
+        printf("Composite Score:        %8.2f\n" ,res[0]);
+        printf("FFT             Mflops: %8.2f    (N=%d)\n", res[1], FFT_size);
+        printf("SOR             Mflops: %8.2f    (%d x %d)\n", 		
+				res[2], SOR_size, SOR_size);
+        printf("MonteCarlo:     Mflops: %8.2f\n", res[3]);
+        printf("Sparse matmult  Mflops: %8.2f    (N=%d, nz=%d)\n", res[4], 
+					Sparse_size_M, Sparse_size_nz);
+        printf("LU              Mflops: %8.2f    (M=%d, N=%d)\n", res[5],
+				LU_size, LU_size);
+
+
+        Random_delete(R);
+
+        return 0;
+  
+}
+
+void print_banner()
+{
+ printf("**                                                              **\n");
+ printf("** SciMark2 Numeric Benchmark, see http://math.nist.gov/scimark **\n");
+ printf("** for details. (Results can be submitted to pozo at nist.gov)     **\n");
+ printf("**                                                              **\n");
+}
diff --git a/talk/iwtc11/benchmarks/scimark/scimark2.h b/talk/iwtc11/benchmarks/scimark/scimark2.h
new file mode 100644
--- /dev/null
+++ b/talk/iwtc11/benchmarks/scimark/scimark2.h
@@ -0,0 +1,22 @@
+
+#ifndef SCIMARK2_H
+#define SCIMARK2_H
+
+#define VERSION 2.0
+
+#ifndef NULL 
+#define NULL 0
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+
+
+#endif
+
diff --git a/talk/vmil2012/Makefile b/talk/vmil2012/Makefile
--- a/talk/vmil2012/Makefile
+++ b/talk/vmil2012/Makefile
@@ -1,5 +1,5 @@
 
-jit-guards.pdf: paper.tex paper.bib figures/log.tex figures/example.tex figures/benchmarks_table.tex figures/backend_table.tex figures/ops_count_table.tex figures/loop_bridge.pdf figures/guard_table.tex figures/resume_data_table.tex
+jit-guards.pdf: paper.tex paper.bib figures/log.tex figures/example.tex figures/benchmarks_table.tex figures/backend_table.tex figures/ops_count_table.tex figures/loop_bridge.pdf figures/guard_table.tex figures/resume_data_table.tex figures/failing_guards_table.tex
 	pdflatex paper
 	bibtex paper
 	pdflatex paper
@@ -18,7 +18,7 @@
 %.tex: %.py
 	pygmentize -l python -o $@ $<
 
-figures/%_table.tex: tool/build_tables.py logs/backend_summary.csv logs/summary.csv tool/table_template.tex logs/bridge_summary.csv logs/resume_summary.csv
+figures/%_table.tex: tool/build_tables.py logs/backend_summary.csv logs/summary.csv tool/table_template.tex logs/bridge_summary.csv logs/resume_summary.csv logs/guard_summary.json
 	tool/setup.sh
 	paper_env/bin/python tool/build_tables.py $@
 
diff --git a/talk/vmil2012/figures/example.tex b/talk/vmil2012/figures/example.tex
--- a/talk/vmil2012/figures/example.tex
+++ b/talk/vmil2012/figures/example.tex
@@ -10,11 +10,11 @@
             return Odd(n)
 
 class Odd(Base):
-    def f(self):
+    def step(self):
         return Even(self.value * 3 + 1)
 
 class Even(Base):
-    def f(self):
+    def step(self):
         n = self.value >> 2
         if n == 1:
             return None
@@ -26,6 +26,6 @@
         j += 1
         if a is None:
             return True
-        a = a.f()
+        a = a.step()
     return False
 \end{lstlisting}
diff --git a/talk/vmil2012/figures/log.tex b/talk/vmil2012/figures/log.tex
--- a/talk/vmil2012/figures/log.tex
+++ b/talk/vmil2012/figures/log.tex
@@ -1,18 +1,4 @@
 \begin{lstlisting}[mathescape, numbers=right, escapechar=|, firstnumber=-1]
-[$j_1$, $a_1$]                              |\setcounter{lstnumber}{-2}|
-label($j_1$, $a_1$, descr=label0))          |\setcounter{lstnumber}{24}|
-$j_2$ = int_add($j_1$, 1)                   |\setcounter{lstnumber}{25}|
-guard_nonnull_class($a_1$, Even)            |\setcounter{lstnumber}{16}|
-$i_1$ = getfield_gc($a_1$, descr='value')   |\setcounter{lstnumber}{16}|
-$i_2$ = int_rshift($i_1$, 2)                |\setcounter{lstnumber}{17}|
-$b_1$ = int_eq($i_2$, 1)                    |\setcounter{lstnumber}{17}|
-guard_false($b_1$)                          |\setcounter{lstnumber}{5}|
-$i_3$ = int_and($i_2$, 1)                   |\setcounter{lstnumber}{5}|
-$i_4$ = int_is_zero($i_3$)                  |\setcounter{lstnumber}{5}|
-guard_true($i_4$)                           |\setcounter{lstnumber}{23}|
-$b_2$ = int_lt($j_2$, 100)                  |\setcounter{lstnumber}{23}|
-guard_true($b_2$)                           |\setcounter{lstnumber}{-2}|
-                                            |\setcounter{lstnumber}{-2}|
 label($j_2$, $i_2$, descr=label1)           |\setcounter{lstnumber}{24}|
 $j_3$ = int_add($j_2$, 1)                   |\setcounter{lstnumber}{16}|
 $i_5$ = int_rshift($i_2$, 2)                |\setcounter{lstnumber}{17}|
@@ -22,6 +8,6 @@
 $b_4$ = int_is_zero($i_6$)                  |\setcounter{lstnumber}{5}|
 guard_true($b_4$)                           |\setcounter{lstnumber}{23}|
 $b_5$ = int_lt($j_3$, 100)                  |\setcounter{lstnumber}{23}|
-guard_true($b_5$)                           |\setcounter{lstnumber}{-2}| 
+guard_true($b_5$)                           |\setcounter{lstnumber}{-2}|
 jump($j_3$, $i_5$, descr=label1)
 \end{lstlisting}
diff --git a/talk/vmil2012/figures/unopt-log.tex b/talk/vmil2012/figures/unopt-log.tex
new file mode 100644
--- /dev/null
+++ b/talk/vmil2012/figures/unopt-log.tex
@@ -0,0 +1,18 @@
+\begin{lstlisting}[mathescape, numbers=right, escapechar=|, firstnumber=-1]
+[$j_1$, $a_1$]                              |\setcounter{lstnumber}{24}|
+$j_2$ = int_add($j_1$, 1)                   |\setcounter{lstnumber}{25}|
+guard_nonnull($a_1$)                        |\setcounter{lstnumber}{27}|
+guard_class($a_1$, Even)                    |\setcounter{lstnumber}{16}|
+$i_1$ = getfield_gc($a_1$, descr='value')   |\setcounter{lstnumber}{16}|
+$i_2$ = int_rshift($i_1$, 2)                |\setcounter{lstnumber}{17}|
+$b_1$ = int_eq($i_2$, 1)                    |\setcounter{lstnumber}{17}|
+guard_false($b_1$)                          |\setcounter{lstnumber}{5}|
+$i_3$ = int_and($i_2$, 1)                   |\setcounter{lstnumber}{5}|
+$i_4$ = int_is_zero($i_3$)                  |\setcounter{lstnumber}{5}|
+guard_true($i_4$)                           |\setcounter{lstnumber}{6}|
+$a_2$ = new(Even)                           |\setcounter{lstnumber}{2}|
+setfield_gc($a_2$, descr='value')           |\setcounter{lstnumber}{23}|
+$b_2$ = int_lt($j_2$, 100)                  |\setcounter{lstnumber}{23}|
+guard_true($b_2$)                           |\setcounter{lstnumber}{-2}|
+jump($j_2$, $a_2$)
+\end{lstlisting}
diff --git a/talk/vmil2012/paper.tex b/talk/vmil2012/paper.tex
--- a/talk/vmil2012/paper.tex
+++ b/talk/vmil2012/paper.tex
@@ -237,9 +237,10 @@
 interpreter profiles the executed program and selects frequently executed code
 paths to be compiled to machine code. After profiling identified an interesting
 path, tracing is started, recording all operations that are executed on this
-path. Like in most compilers tracing JITs use an intermediate representation
-to store the recorded operations, which is typically in SSA form\todo{some ssa
-reference}. Since tracing follows actual execution the code that is recorded
+path. Like in most compilers, tracing JITs use an intermediate representation to
+store the recorded operations, which is typically in SSA
+form~\cite{cytron_efficiently_1991}. Since tracing follows actual execution, the
+code that is recorded
 represents only one possible path through the control flow graph. Points of
 divergence from the recorded path are marked with special operations called
 \emph{guards}, these operations ensure that assumptions valid during the
@@ -261,13 +262,26 @@
 approach is called \emph{meta-tracing}. For the purpose of this paper the fact
 that RPython's tracing JIT is a meta-tracing JIT can be ignored.
 
-\todo{explain example}
-%___________________________________________________________________________
-
 \begin{figure}
     \input{figures/example.tex}
     \caption{Example Program}
-    \label{fig:trace-log}
+    \label{fig:example}
+\end{figure}
+
+Figure~\ref{fig:example} shows an example RPython function that checks
+whether a number reduces to 1 in fewer than 100 steps of the Collatz process.
+It uses an \lstinline{Even} and an \lstinline{Odd} class to box the numbers, to
+make the example more interesting. If the loop in \lstinline{check_reduces} is
+traced when \lstinline{a} is a multiple of four, the unoptimized
+trace looks like the one in Figure~\ref{fig:unopt-trace}. The line numbers in the
+trace correspond to the line numbers in Figure~\ref{fig:example}. The resulting
+trace repeatedly halves the current value and checks whether it is equal to
+one or whether it is odd. In either of these cases the trace is left via a guard failure.
+
+\begin{figure}
+    \input{figures/unopt-log.tex}
+    \caption{Unoptimized trace}
+    \label{fig:unopt-trace}
 \end{figure}
 
 \section{Guards in the Frontend} %{Resume Data}
@@ -352,7 +366,6 @@
     \item For virtuals,
         the payload is an index into a list of virtuals, see next section.
 \end{itemize}
-\todo{figure showing linked resume-data}
 
 \subsection{Interaction With Optimization}
 \label{sub:optimization}
@@ -420,11 +433,17 @@
 So far no special compression is done with this information,
 compared to the other source of information delayed heap stores are quite rare.
 
-\begin{figure}
-\includegraphics[width=0.5\textwidth]{figures/resume_data.pdf}
-\caption{The resume data for Figure~\ref{fig:trace-log}}
-\label{fig:resume-data}
-\end{figure}
+Figure~\ref{fig:trace-log} shows the optimized version of the trace in
+Figure~\ref{fig:unopt-trace}. Allocation removal has removed the
+\lstinline{new} operation and other operations handling the boxes. The
+remaining operations now operate on unboxed numbers.
+
+Figure~\ref{fig:resume-data} sketches the symbolic frames of the first two
+guards in the trace. The frames for \lstinline{check_reduces} and
+\lstinline{Even.step} as well as the description of the allocation-removed
+virtual instance of \lstinline{Even} are shared between the two guards.
+
+\todo{fix labels in diagram}
 
 % section Resume Data (end)
 
@@ -437,6 +456,13 @@
 \section{Guards in the Backend}
 \label{sec:Guards in the Backend}
 
+\begin{figure}
+\includegraphics[width=0.5\textwidth]{figures/resume_data.pdf}
+\caption{The resume data for Figure~\ref{fig:trace-log}}
+\label{fig:resume-data}
+\end{figure}
+
+
 After optimization the resulting trace is handed over to the platform specific
 backend to be compiled to machine code. The compilation phase consists of two
 passes over the lists of instructions, a backwards pass to calculate live
@@ -456,7 +482,7 @@
 pseudo-assembler if the operation and the guard are compiled separated or if
 they are merged.
 
-\bivab{Figure needs better formatting}
+\todo{Figure needs better formatting}
 \begin{figure}[ht]
   \noindent
   \centering
@@ -567,6 +593,7 @@
 
 \section{Evaluation}
 \label{sec:evaluation}
+\todo{improve the table formatting}
 
 The results presented in this section are based on numbers gathered by running
 a subset of the standard PyPy benchmarks. The PyPy benchmarks are used to
@@ -608,73 +635,136 @@
 \end{description}
 
 From the mentioned benchmarks we collected different datasets to evaluate the
-Frequency, the overhead and overall behaviour of guards.
+frequency, the overhead and the overall behaviour of guards. The results are
+summarized in the remainder of this section. We want to point out three
+aspects of guards in particular:
+\begin{itemize}
+  \item Guards are very common operations in traces.
+  \item There is overhead associated with guards.
+  \item Guard failures are local and rare.
+\end{itemize}
+
+None of the figures in this section takes garbage collection of machine code into account. Pieces
+of machine code can be globally invalidated or just become cold again. In both
+cases the generated machine code and the related data are garbage collected. The
+figures show the total number of operations that are evaluated by the JIT and
+the total amount of code and data that is generated from the optimized traces.
+
+
+\subsection{Frequency of Guards}
+\label{sub:guard_frequency}
+\begin{figure*}
+    \include{figures/benchmarks_table}
+    \caption{Number of operations in the recorded traces and the relative amount of guards before and after optimizations}
+    \label{fig:benchmarks}
+\end{figure*}
+
 Figure~\ref{fig:benchmarks} summarizes the total number of operations that were
 recorded during tracing for each of the benchmarks and what percentage of these
 operations are guards. The number of operations was counted on the unoptimized
-and optimized traces. Showing that the overall optimization rate is between
-65.80\% and 86.23\% of all operations and that the optimization rate for guards
-is similar to the general one, as could be assumed based on
-Figure~\ref{fig:guard_percent}. These numbers show that guards are a rather
-common operation in the traces, which is a reason the put effort into
-optimizing them.
-\todo{some pie charts about operation distribution}
+and optimized traces. The figure shows that the overall optimization rate for
+operations, which is between 69.4\% and 83.89\% of the traced operations, and the
+optimization rate of guards, which is between 65.8\% and 86.2\% of the
+operations, are very similar, as could be assumed based on
+Figure~\ref{fig:guard_percent}. This indicates that the optimizer can remove
+most of the guards, but after the optimization pass guards still account for
+15.2\% to 20.2\% of the operations that are compiled and later executed.
+The frequency of guard operations makes it important to store the associated
+information efficiently and also to make sure that guard checks are executed
+quickly.
 
-\begin{figure*}
-    \include{figures/benchmarks_table}
-    \caption{Benchmark Results}
-    \label{fig:benchmarks}
-\end{figure*}
-
+\subsection{Overhead of Guards}
+\label{sub:guard_overhead}
 \begin{figure}
     \include{figures/resume_data_table}
-    \caption{Resume Data sizes in KiB}
+    \caption{Resume data sizes}
     \label{fig:resume_data_sizes}
 \end{figure}
 
-\todo{figure about failure counts of guards (histogram?)}
-\todo{add resume data sizes without sharing}
-\todo{add a footnote about why guards have a threshold of 100}
-
 The overhead that is incurred by the JIT to manage the \texttt{resume data},
 the \texttt{low-level resume data} as well as the generated machine code is
 shown in Figure~\ref{fig:backend_data}. It shows the total memory consumption
 of the code and of the data generated by the machine code backend and an
 approximation of the size of the \texttt{resume data} structures for the
-different benchmarks mentioned above. The size of the machine code is composed
-of the size of the compiled operations, the trampolines generated for the
-guards and a set of support functions that are generated when the JIT starts
-and are shared by all compiled traces. The size of the \texttt{low-level resume
+different benchmarks mentioned above. The machine code taken into account is
+composed of the compiled operations, the trampolines generated for the guards
+and a set of support functions that are generated when the JIT starts and which
+are shared by all compiled traces. The size of the \texttt{low-level resume
 data} is the size of the compressed mapping from registers and stack to
-IR-level variable and finally the size of the \texttt{resume data} is an
-approximation of the size of the compressed high-level resume data\todo{explain
-why it is an approximation}.
+IR-level variables, and finally the size of the \texttt{resume data} is an
+approximation of the size of the compressed high-level resume data as described
+in Section~\ref{sec:Resume Data}.\footnote{
+The size of the resume data is not measured at runtime, but reconstructed from
+log files.}
 
-Compared to the size of the generated machine code the compressed
-\texttt{low-level resume data} is about 15\% to 20\% of that size, depending on
-the benchmark. On the other hand the generated machine code has only a size
-ranging from 20.21\% to 37.98\% of the size of the high and low-level
-\texttt{resume data} being compressed as described before.
+For the different benchmarks the \texttt{low-level resume data} takes about
+15\% to 20\% of the memory required for the generated machine code. On the
+other hand the generated machine code has only a size ranging from 20.5\% to
+37.98\% of the combined size of the high- and low-level \texttt{resume data},
+compressed as described before.
 
 Tracing JIT compilers only compile the subset of the code executed in a program
 that is traced in a hot loop, for this reason the amount of generated machine
-code will be smaller than in other juts-in-time compilation approaches. Still
-the overhead associated to guards to resume execution from a side exit appears
-to be high.\bivab{put into relation to other JITs, compilers in general}
+code will be smaller than in other just-in-time compilation approaches.  This
+creates a larger discrepancy between the size of the \texttt{resume data} and that
+of the machine code, and illustrates why it is important to compress this information.
 
-\begin{figure*}
+\begin{figure}
     \include{figures/backend_table}
-    \caption{Total size of generated machine code and guard data}
+    \caption{Total size of generated machine code and resume data}
     \label{fig:backend_data}
-\end{figure*}
+\end{figure}
 
-Both figures do not take into account garbage collection. Pieces of machine
-code can be globally invalidated or just become cold again. In both cases the
-generated machine code and the related data is garbage collected. The figures
-show the total amount of operations that are evaluated by the JIT and the
-total amount of code and data that is generated from the optimized traces.
+Why efficiently storing the \texttt{resume data} is a central concern in the design
+of guards is illustrated by Figure~\ref{fig:resume_data_sizes}. This figure shows
+the size of the compressed \texttt{resume data}, the approximate size of
+storing the \texttt{resume data} without compression, and
+an approximation of the best possible compression of the resume data,
+obtained by compressing the data with the
+\texttt{xz} compression tool, which is a ``general-purpose data compression
+software with high compression ratio''.\footnote{\url{http://tukaani.org/xz/}}
 
-\todo{compare to naive variant of resume data}
+The results show that the current approach of compression and data sharing only
+requires 18.3\% to 31.1\% of the space compared to a naive approach. This
+shows that large parts of the resume data are redundant and can be stored more
+efficiently using the techniques described above. On the other hand,
+comparing the results to the xz compression, which only requires between 17.1\%
+and 21.1\% of the space required by our compression, shows that our compression
+is not optimal but a trade-off between the required space and the time needed
+to build a good compressed representation of the resume data for the
+large number of guards present in the traces.
+
+\subsection{Guard Failures}
+\label{sub:guard_failure}
+The last point in this discussion is the frequency of guard failures.
+Figure~\ref{fig:failing_guards} presents for each benchmark a list of the
+relative amounts of guards that ever fail and of guards that fail more than 200
+times.\footnote{
+    The threshold of 200 is rather high. It was picked experimentally to give
+    good results for long-running programs.
+}
+As described before, for guards that fail more than 200 times, a trace
+is recorded that starts from the guard. Afterwards the guard is patched so that later
+failures execute the new trace instead of taking the side-exit. Hence the
+numbers presented for guards that fail more than 200 times represent the 200
+failures up to the compilation of the bridge and all executions of the then
+attached bridge.
+
+\begin{figure}
+    \include{figures/failing_guards_table}
+    \caption{Failing guards relative to the total number of guards}
+    \label{fig:failing_guards}
+\end{figure}
+
+From Figure~\ref{fig:failing_guards} we can see that only a very small
+fraction of all the guards in the optimized traces ever fail. This fraction
+varies between 2.4\% and 5.7\% of all guards. As can be expected, even fewer
+guards fail often enough that a bridge is compiled for them: only 1.2\% to
+3.6\% of all guards fail more than 200 times. Furthermore, of all failing
+guards a few fail extremely often and most fail rarely. These results
+emphasize that, since most of the guards never fail, it is important to make
+sure that the successful execution of a guard does not have unnecessary
+overhead.
+
 
 \section{Related Work}
 \label{sec:Related Work}
@@ -696,15 +786,18 @@
 Mike Pall, the author of LuaJIT describes in a post to the lua-users mailing
 list different technologies and techniques used in the implementation of
 LuaJIT~\cite{Pall:2009}. Pall explains that guards in LuaJIT use a data structure
-called snapshots, similar to RPython's resume data, to store the information about
-how to rebuild the state from a side-exit using the information in the snapshot
-and the machine execution state. Pall also acknowledges that snapshot for
-guards are associated with a large memory footprint. The solution used in
-LuaJIT is to store sparse snapshots, avoiding the creation of snapshots for
-every guard to reduce memory pressure. Snapshots are only created for guards
-after updates to the global state, after control flow points from the original
-program and for guards that are likely to fail. As an outlook Pall mentions the
-plans to switch to compressed snapshots to further reduce redundancy.
+called snapshots, similar to RPython's resume data, to store the information
+about how to rebuild the state from a side-exit using the information in the
+snapshot and the machine execution state. According to Pall~\cite{Pall:2009}
+snapshots for guards in LuaJIT are associated with a large memory footprint.
+The solution used there is to store sparse snapshots, avoiding the creation
+of snapshots for every guard to reduce memory pressure. Snapshots are only
+created for guards after updates to the global state, after control flow points
+from the original program and for guards that are likely to fail. As an outlook
+Pall mentions plans to switch to compressed snapshots to further reduce
+redundancy. The approach of not creating a snapshot for every guard is
+orthogonal to the resume data compression presented in this paper and could be
+reused within RPython to further reduce memory usage.
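The sparse snapshot policy could be summarized by a predicate along the following lines (a sketch under the assumption that the tracer can answer these three questions; it is not LuaJIT's actual code):

    def needs_snapshot(guard, trace_state):
        # Only keep a snapshot for guards where the interpreter state cannot
        # be rebuilt cheaply, following the policy described by Pall.
        if trace_state.global_state_updated_since_last_snapshot:
            return True
        if guard.follows_control_flow_point:   # branch in the original program
            return True
        if guard.likely_to_fail:               # e.g. based on profiling feedback
            return True
        return False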
 
 Linking side exits to pieces of later compiled machine code was described first
 in the context of Dynamo~\cite{Bala:2000wv} under the name of Fragment Linking.
@@ -776,11 +869,38 @@
 
 \section{Conclusion}
 \label{sec:Conclusion}
+In this paper we have concentrated on guards, an operation typically found in
+tracing just-in-time compilers and used to denote points of possible control
+flow divergence in recorded traces.
+We described how, based on the observation that guards are a frequent operation
+in traces and that they do not fail often, guards have been implemented in the
+high- and low-level components of RPython's tracing JIT compiler.
 
-\todo{conclusion}
+Finally we have presented experimental data collected using the standard PyPy
+benchmark set to evaluate previous observations and assumptions. Our
+experiments showed that, as previously assumed, guards are a very common
+operation in traces. At the same time guards are associated with a high
+overhead, because for all compiled guards information needs to be
+stored to restore the execution state in case of a bail-out. The measurements
+showed that the compression techniques used in PyPy effectively reduce the
+overhead of guards, although a significant overhead remains. The results
+also showed that guard failure is a local event: there are few
+guards that fail at all, and even fewer that fail very often.
+These numbers validate the design decision of reducing the overhead of
+successful guard checks as much as possible while paying a higher price in the
+case of a bail-out, due to having to decode the compressed state
+representation. The compressed state representation reduces the memory
+footprint of rarely used data.
+
+Based on the observation that most guards do not fail very often or at all, it
+would be worth exploring whether a more aggressive compression scheme for
+guards would justify the memory savings despite the increased decoding
+overhead. Based on the same observation we would like to explore the concept
+of LuaJIT's sparse snapshots and their applicability to PyPy.
 
 \section*{Acknowledgements}
 \section*{Appendix}
+\todo{remove this section and the figures}
 \begin{figure*}
     \include{figures/ops_count_table}
     \caption{Relative numbers of operations in the traces generated for
diff --git a/talk/vmil2012/tool/bridgedata.py b/talk/vmil2012/tool/bridgedata.py
--- a/talk/vmil2012/tool/bridgedata.py
+++ b/talk/vmil2012/tool/bridgedata.py
@@ -20,6 +20,7 @@
         summary = logparser.extract_category(logfile, 'jit-summary')
         if len(summary) == 0:
             yield (exe, name, log, 'n/a', 'n/a')
+            continue
         summary = summary[0].splitlines()
         for line in summary:
             if line.startswith('Total # of bridges'):
diff --git a/talk/vmil2012/tool/build_tables.py b/talk/vmil2012/tool/build_tables.py
--- a/talk/vmil2012/tool/build_tables.py
+++ b/talk/vmil2012/tool/build_tables.py
@@ -1,9 +1,10 @@
 from __future__ import division
 import csv
 import django
-from django.template import Template, Context
+import json
 import os
 import sys
+from django.template import Template, Context
 
 # This line is required for Django configuration
 django.conf.settings.configure()
@@ -15,17 +16,47 @@
         return [l for l in reader]
 
 
+def build_failing_guards_table(files, texfile, template):
+    BRIDGE_THRESHOLD = 200
+    assert len(files) == 2
+    with open(files[1]) as f:
+        failures = json.load(f)
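+    # guard_summary.json maps each benchmark name to a dict whose 'results'
+    # entry maps individual guards to their failure counts; the total number
+    # of guards per benchmark is added from the CSV summary below.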
+    for l in getlines(files[0]):
+        failures[l['bench']]['nguards'] = float(l['number of guards'])
+
+    table = []
+    head = ['Benchmark',
+            'Failing guards',
+            'Over %d failures' % BRIDGE_THRESHOLD]
+
+    for bench, info in failures.iteritems():
+        total = failures[bench]['nguards']
+        total_failures = len(info['results'])
+        bridges = len([k for k,v in info['results'].iteritems() \
+                                            if v > BRIDGE_THRESHOLD])
+        res = [bench.replace('_', '\\_'),
+                "%.1f\\%%" % (100 * total_failures/total),
+                "%.1f\\%%" % (100 * bridges/total),
+        ]
+        table.append(res)
+    output = render_table(template, head, sorted(table))
+    write_table(output, texfile)
+
+
 def build_resume_data_table(csvfiles, texfile, template):
     assert len(csvfiles) == 1
     lines = getlines(csvfiles[0])
     table = []
-    head = ['Benchmark', 'compressed', 'naive', 'xz compressed']
+    head = ['Benchmark', 'Compressed', 'Naive', 'xz compressed']
 
     for bench in lines:
+        total = float(bench['total resume data size'])
+        naive = float(bench['naive resume data size'])
+        xz = float(bench['compressed resume data size'])
         res = [bench['bench'].replace('_', '\\_'),
-                "%.2f" % float(bench['total resume data size']),
-                "%.2f" % float(bench['naive resume data size']),
-                "%.2f" % float(bench['compressed resume data size']),
+                "%.2f {\scriptsize KiB}" %  (total,),# (100*total/naive)),
+                "%.2f {\scriptsize KiB}" % (naive),#, 100*naive/total),
+                "%.2f {\scriptsize KiB}" % (xz),#, 100*xz/total),
         ]
         table.append(res)
     output = render_table(template, head, sorted(table))
@@ -52,7 +83,7 @@
                 values.append(o / ops[t] * 100)
 
             assert 100.0 - sum(values) < 0.0001
-            res.extend(['%.2f ' % v for v in values])
+            res.extend(['%.1f\\%%' % v for v in values])
         table.append(res)
     output = render_table(template, head, sorted(table))
     write_table(output, texfile)
@@ -61,7 +92,7 @@
     assert len(csvfiles) == 1
     lines = getlines(csvfiles[0])
     table = []
-    head = ['Benchmark', 'guards b/o in \%', 'guards a/o in \%']
+    head = ['Benchmark', 'Guards before', 'Guards after']
 
     keys = 'numeric set get rest new guard '.split()
     for bench in lines:
@@ -71,7 +102,7 @@
         res = [bench['bench'].replace('_', '\\_'),]
         for t in ('before', 'after'):
             o = int(bench['guard %s' % t])
-            res.append('%.2f ' % (o / ops[t] * 100))
+            res.append('%.1f\\%%' % (o / ops[t] * 100))
         table.append(res)
     output = render_table(template, head, sorted(table))
     write_table(output, texfile)
@@ -82,17 +113,18 @@
     assert len(csvfiles) == 2
     lines = getlines(csvfiles[0])
     bridge_lines = getlines(csvfiles[1])
+            # keep this around for the assertion below
     bridgedata = {}
     for l in bridge_lines:
         bridgedata[l['bench']] = l
 
     head = ['Benchmark',
-            'ops b/o',
-            'guards b/o',
-            'ops a/o',
-            'guards a/o',
-            'opt. rate',
-            'guard opt. rate',
+            'Ops. before',
+            'Guards before',
+            'Ops. after',
+            'Guards after',
+            'Opt. rate',
+            'Guard opt. rate',
             ]
 
     table = []
@@ -110,11 +142,11 @@
         res = [
                 bench['bench'].replace('_', '\\_'),
                 ops_bo,
-                "%.2f \\%%" % (guards_bo / ops_bo * 100,),
+                "%.1f\\%%" % (guards_bo / ops_bo * 100,),
                 ops_ao,
-                "%.2f \\%%" % (guards_ao / ops_ao * 100,),
-                "%.2f \\%%" % ((1 - ops_ao / ops_bo) * 100,),
-                "%.2f \\%%" % ((1 - guards_ao / guards_bo) * 100,),
+                "%.1f\\%%" % (guards_ao / ops_ao * 100,),
+                "%.1f\\%%" % ((1 - ops_ao / ops_bo) * 100,),
+                "%.1f\\%%" % ((1 - guards_ao / guards_bo) * 100,),
               ]
         table.append(res)
     output = render_table(template, head, sorted(table))
@@ -128,11 +160,11 @@
     for l in resume_lines:
         resumedata[l['bench']] = l
 
-    head = ['Benchmark',
-            'Machine code size (kB)',
-            'hl resume data (kB)',
-            'll resume data (kB)',
-            'machine code resume data relation in \\%']
+    head = [r'Benchmark',
+            r'Code',
+            r'Resume data',
+            r'll data',
+            r'Relation']
 
     table = []
     # collect data
@@ -142,12 +174,12 @@
         gmsize = float(bench['guard map size'])
         asmsize = float(bench['asm size'])
         rdsize = float(resumedata[name]['total resume data size'])
-        rel = "%.2f" % (asmsize / (gmsize + rdsize) * 100,)
+        rel = r"%.1f{\scriptsize\%%}" % (asmsize / (gmsize + rdsize) * 100,)
         table.append([
-            bench['bench'],
-            "%.2f" % (asmsize,),
-            "%.2f" % (rdsize,),
-            "%.2f" % (gmsize,),
+            r"%s" % bench['bench'],
+            r"%.1f {\scriptsize KiB}" % (asmsize,),
+            r"%.1f {\scriptsize KiB}" % (rdsize,),
+            r"%.1f {\scriptsize KiB}" % (gmsize,),
             rel])
     output = render_table(template, head, sorted(table))
     write_table(output, texfile)
@@ -178,6 +210,8 @@
             (['summary.csv'], build_guard_table),
         'resume_data_table.tex':
             (['resume_summary.csv'], build_resume_data_table),
+        'failing_guards_table.tex':
+            (['resume_summary.csv', 'guard_summary.json'], build_failing_guards_table),
         }
 
 
diff --git a/talk/vmil2012/zotero.bib b/talk/vmil2012/zotero.bib
--- a/talk/vmil2012/zotero.bib
+++ b/talk/vmil2012/zotero.bib
@@ -116,6 +116,17 @@
 	pages = {32--43}
 },
 
+ at article{cytron_efficiently_1991,
+	title = {Efficiently Computing Static Single Assignment Form and the Control Dependence Graph},
+	volume = {13},
+	number = {4},
+	journal = {{ACM} Transactions on Programming Languages and Systems},
+	author = {Cytron, Ron and Ferrante, Jeanne and Rosen, Barry K. and Wegman, Mark N. and Zadeck, F. Kenneth},
+	month = oct,
+	year = {1991},
+	pages = {451--490}
+},
+
 @inproceedings{bolz_tracing_2009,
 	address = {Genova, Italy},
 	title = {Tracing the meta-level: {PyPy's} tracing {JIT} compiler},

