[pypy-commit] pypy numpy-data-buffer: merge in separate-applevel-numpy

Tue Oct 4 00:40:57 CEST 2011

Author: Timo Paulssen <timonator at perpetuum-immobile.de>
Branch: numpy-data-buffer
Changeset: r47814:0698ee9b73e2
Date: 2011-10-04 00:35 +0200
http://bitbucket.org/pypy/pypy/changeset/0698ee9b73e2/

Log:	merge in separate-applevel-numpy

diff --git a/lib_pypy/numpy/__init__.py b/lib_pypy/numpy/__init__.py
--- a/lib_pypy/numpy/__init__.py
+++ b/lib_pypy/numpy/__init__.py
@@ -13,6 +13,8 @@
         arccos,
         arcsin,
         arctan,
+        arcsinh,
+        arctanh,
         copysign,
         cos,
         divide,
@@ -36,6 +38,8 @@
         tan,
     )
 
+inf = float("inf")
+
 def average(a):
     # This implements a weighted average, for now we don't implement the
     # weighting, just the average part!
diff --git a/pypy/interpreter/executioncontext.py b/pypy/interpreter/executioncontext.py
--- a/pypy/interpreter/executioncontext.py
+++ b/pypy/interpreter/executioncontext.py
@@ -307,7 +307,11 @@
         self._nonperiodic_actions = []
         self.has_bytecode_counter = False
         self.fired_actions = None
-        self.checkinterval_scaled = 100 * TICK_COUNTER_STEP
+        # the default value is not 100, unlike CPython 2.7, but a much
+        # larger value, because we use a technique that not only allows
+        # but actually *forces* another thread to run whenever the counter
+        # reaches zero.
+        self.checkinterval_scaled = 10000 * TICK_COUNTER_STEP
         self._rebuild_action_dispatcher()
 
     def fire(self, action):
diff --git a/pypy/jit/codewriter/effectinfo.py b/pypy/jit/codewriter/effectinfo.py
--- a/pypy/jit/codewriter/effectinfo.py
+++ b/pypy/jit/codewriter/effectinfo.py
@@ -74,6 +74,7 @@
     OS_LLONG_UGE                = 91
     OS_LLONG_URSHIFT            = 92
     OS_LLONG_FROM_UINT          = 93
+    OS_LLONG_U_TO_FLOAT         = 94
     #
     OS_MATH_SQRT                = 100
 
diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -440,6 +440,7 @@
     rewrite_op_ullong_mod_zer      = _do_builtin_call
     rewrite_op_gc_identityhash = _do_builtin_call
     rewrite_op_gc_id           = _do_builtin_call
+    rewrite_op_uint_mod        = _do_builtin_call
 
     # ----------
     # getfield/setfield/mallocs etc.
@@ -883,9 +884,15 @@
                 v = v_arg
                 oplist = []
             if unsigned1:
-                opname = 'cast_uint_to_longlong'
+                if unsigned2:
+                    opname = 'cast_uint_to_ulonglong'
+                else:
+                    opname = 'cast_uint_to_longlong'
             else:
-                opname = 'cast_int_to_longlong'
+                if unsigned2:
+                    opname = 'cast_int_to_ulonglong'
+                else:
+                    opname = 'cast_int_to_longlong'
             op2 = self.rewrite_operation(
                 SpaceOperation(opname, [v], v_result)
             )
@@ -995,6 +1002,21 @@
                 return op2
         ''' % (_op, _oopspec.lower(), _oopspec, _oopspec)).compile()
 
+    for _op, _oopspec in [('cast_int_to_ulonglong',     'FROM_INT'),
+                          ('cast_uint_to_ulonglong',    'FROM_UINT'),
+                          ('cast_float_to_ulonglong',   'FROM_FLOAT'),
+                          ('cast_ulonglong_to_float',   'U_TO_FLOAT'),
+                         ]:
+        exec py.code.Source('''
+            def rewrite_op_%s(self, op):
+                args = op.args
+                op1 = self.prepare_builtin_call(op, "ullong_%s", args)
+                op2 = self._handle_oopspec_call(op1, args,
+                                                EffectInfo.OS_LLONG_%s,
+                                           EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
+                return op2
+        ''' % (_op, _oopspec.lower(), _oopspec)).compile()
+
     def _normalize(self, oplist):
         if isinstance(oplist, SpaceOperation):
             return [oplist]
diff --git a/pypy/jit/codewriter/support.py b/pypy/jit/codewriter/support.py
--- a/pypy/jit/codewriter/support.py
+++ b/pypy/jit/codewriter/support.py
@@ -315,18 +315,30 @@
 def _ll_1_llong_from_int(x):
     return r_longlong(intmask(x))
 
+def _ll_1_ullong_from_int(x):
+    return r_ulonglong(intmask(x))
+
 def _ll_1_llong_from_uint(x):
     return r_longlong(r_uint(x))
 
+def _ll_1_ullong_from_uint(x):
+    return r_ulonglong(r_uint(x))
+
 def _ll_1_llong_to_int(xll):
     return intmask(xll)
 
 def _ll_1_llong_from_float(xf):
     return r_longlong(xf)
 
+def _ll_1_ullong_from_float(xf):
+    return r_ulonglong(xf)
+
 def _ll_1_llong_to_float(xll):
     return float(rffi.cast(lltype.SignedLongLong, xll))
 
+def _ll_1_ullong_u_to_float(xull):
+    return float(rffi.cast(lltype.UnsignedLongLong, xull))
+
 
 def _ll_1_llong_abs(xll):
     if xll < 0:
@@ -351,20 +363,23 @@
     return llop.llong_mod(lltype.SignedLongLong, xll, yll)
 
 def _ll_2_ullong_floordiv(xll, yll):
-    return llop.ullong_floordiv(lltype.SignedLongLong, xll, yll)
+    return llop.ullong_floordiv(lltype.UnsignedLongLong, xll, yll)
 
 def _ll_2_ullong_floordiv_zer(xll, yll):
     if yll == 0:
         raise ZeroDivisionError
-    return llop.ullong_floordiv(lltype.SignedLongLong, xll, yll)
+    return llop.ullong_floordiv(lltype.UnsignedLongLong, xll, yll)
 
 def _ll_2_ullong_mod(xll, yll):
-    return llop.ullong_mod(lltype.SignedLongLong, xll, yll)
+    return llop.ullong_mod(lltype.UnsignedLongLong, xll, yll)
 
 def _ll_2_ullong_mod_zer(xll, yll):
     if yll == 0:
         raise ZeroDivisionError
-    return llop.ullong_mod(lltype.SignedLongLong, xll, yll)
+    return llop.ullong_mod(lltype.UnsignedLongLong, xll, yll)
+
+def _ll_2_uint_mod(xll, yll):
+    return llop.uint_mod(lltype.Unsigned, xll, yll)
 
 
 # libffi support
diff --git a/pypy/jit/codewriter/test/test_flatten.py b/pypy/jit/codewriter/test/test_flatten.py
--- a/pypy/jit/codewriter/test/test_flatten.py
+++ b/pypy/jit/codewriter/test/test_flatten.py
@@ -829,14 +829,15 @@
                     self.encoding_test(f, [rffi.cast(FROM, 42)], expectedstr,
                                        transform=True)
                 elif TO in (rffi.LONG, rffi.ULONG):
+                    if rffi.cast(FROM, -1) < 0:
+                        fnname = "llong_from_int"
+                    else:
+                        fnname = "llong_from_uint"
                     if TO == rffi.LONG:
                         TO = rffi.LONGLONG
                     else:
                         TO = rffi.ULONGLONG
-                    if rffi.cast(FROM, -1) < 0:
-                        fnname = "llong_from_int"
-                    else:
-                        fnname = "llong_from_uint"
+                        fnname = "u" + fnname
                     expected.pop()   # remove int_return
                     expected.append(
                         "residual_call_irf_f $<* fn %s>, <Descr>, I[%s], R[], F[] -> %%f0"
diff --git a/pypy/jit/codewriter/test/test_longlong.py b/pypy/jit/codewriter/test/test_longlong.py
--- a/pypy/jit/codewriter/test/test_longlong.py
+++ b/pypy/jit/codewriter/test/test_longlong.py
@@ -57,7 +57,8 @@
             assert op1.opname == 'residual_call_irf_f'
         else:
             assert op1.opname == 'residual_call_irf_i'
-        gotindex = getattr(EffectInfo, 'OS_' + op1.args[0].value.upper())
+        gotindex = getattr(EffectInfo,
+                           'OS_' + op1.args[0].value.upper().lstrip('U'))
         assert gotindex == oopspecindex
         assert op1.args[1] == 'calldescr-%d' % oopspecindex
         assert list(op1.args[2]) == [v for v in vlist
@@ -192,8 +193,12 @@
                       [lltype.SignedLongLong], lltype.Signed)
         self.do_check('cast_float_to_longlong', EffectInfo.OS_LLONG_FROM_FLOAT,
                       [lltype.Float], lltype.SignedLongLong)
+        self.do_check('cast_float_to_ulonglong', EffectInfo.OS_LLONG_FROM_FLOAT,
+                      [lltype.Float], lltype.UnsignedLongLong)
         self.do_check('cast_longlong_to_float', EffectInfo.OS_LLONG_TO_FLOAT,
                       [lltype.SignedLongLong], lltype.Float)
+        self.do_check('cast_ulonglong_to_float', EffectInfo.OS_LLONG_U_TO_FLOAT,
+                      [lltype.UnsignedLongLong], lltype.Float)
         for T1 in [lltype.SignedLongLong, lltype.UnsignedLongLong]:
             for T2 in [lltype.Signed, lltype.Unsigned]:
                 self.do_check('cast_primitive', EffectInfo.OS_LLONG_TO_INT,
diff --git a/pypy/jit/metainterp/optimizeopt/__init__.py b/pypy/jit/metainterp/optimizeopt/__init__.py
--- a/pypy/jit/metainterp/optimizeopt/__init__.py
+++ b/pypy/jit/metainterp/optimizeopt/__init__.py
@@ -7,6 +7,8 @@
 from pypy.jit.metainterp.optimizeopt.unroll import optimize_unroll, OptInlineShortPreamble
 from pypy.jit.metainterp.optimizeopt.fficall import OptFfiCall
 from pypy.jit.metainterp.optimizeopt.simplify import OptSimplify
+from pypy.jit.metainterp.optimizeopt.pure import OptPure
+from pypy.jit.metainterp.optimizeopt.earlyforce import OptEarlyForce
 from pypy.rlib.jit import PARAMETERS
 from pypy.rlib.unroll import unrolling_iterable
 
@@ -14,6 +16,8 @@
             ('rewrite', OptRewrite),
             ('virtualize', OptVirtualize),
             ('string', OptString),
+            ('earlyforce', OptEarlyForce),
+            ('pure', OptPure),
             ('heap', OptHeap),
             ('ffi', None),
             ('unroll', None)]
diff --git a/pypy/jit/metainterp/optimizeopt/earlyforce.py b/pypy/jit/metainterp/optimizeopt/earlyforce.py
new file mode 100644
--- /dev/null
+++ b/pypy/jit/metainterp/optimizeopt/earlyforce.py
@@ -0,0 +1,24 @@
+from pypy.jit.metainterp.optimizeopt.optimizer import Optimization
+from pypy.jit.metainterp.optimizeopt.vstring import VAbstractStringValue
+from pypy.jit.metainterp.resoperation import rop, ResOperation
+
+class OptEarlyForce(Optimization):
+    def propagate_forward(self, op):
+        opnum = op.getopnum()
+        if (opnum != rop.SETFIELD_GC and 
+            opnum != rop.SETARRAYITEM_GC and
+            opnum != rop.QUASIIMMUT_FIELD):
+               
+            for arg in op.getarglist():
+                if arg in self.optimizer.values:
+                    value = self.getvalue(arg)
+                    value.force_box(self)
+        self.emit_operation(op)
+
+    def new(self):
+        return OptEarlyForce()
+
+    def setup(self):
+        self.optimizer.optearlyforce = self
+
+    
diff --git a/pypy/jit/metainterp/optimizeopt/fficall.py b/pypy/jit/metainterp/optimizeopt/fficall.py
--- a/pypy/jit/metainterp/optimizeopt/fficall.py
+++ b/pypy/jit/metainterp/optimizeopt/fficall.py
@@ -177,10 +177,10 @@
             funcinfo.descr is None):
             return [op] # cannot optimize
         funcsymval = self.getvalue(op.getarg(2))
-        arglist = [funcsymval.force_box()]
+        arglist = [funcsymval.get_key_box()]
         for push_op in funcinfo.opargs:
             argval = self.getvalue(push_op.getarg(2))
-            arglist.append(argval.force_box())
+            arglist.append(argval.get_key_box())
         newop = ResOperation(rop.CALL_RELEASE_GIL, arglist, op.result,
                              descr=funcinfo.descr)
         self.commit_optimization()
diff --git a/pypy/jit/metainterp/optimizeopt/heap.py b/pypy/jit/metainterp/optimizeopt/heap.py
--- a/pypy/jit/metainterp/optimizeopt/heap.py
+++ b/pypy/jit/metainterp/optimizeopt/heap.py
@@ -50,6 +50,7 @@
             if not self._lazy_setfield_registered:
                 optheap._lazy_setfields_and_arrayitems.append(self)
                 self._lazy_setfield_registered = True
+            
         else:
             # this is the case where the pending setfield ends up
             # storing precisely the value that is already there,
@@ -158,12 +159,17 @@
         self._lazy_setfields_and_arrayitems = []
         self._remove_guard_not_invalidated = False
         self._seen_guard_not_invalidated = False
+        self.posponedop = None
 
     def force_at_end_of_preamble(self):
         self.force_all_lazy_setfields_and_arrayitems()
 
     def flush(self):
         self.force_all_lazy_setfields_and_arrayitems()
+        if self.posponedop:
+            posponedop = self.posponedop
+            self.posponedop = None
+            self.next_optimization.propagate_forward(posponedop)
 
     def new(self):
         return OptHeap()
@@ -211,7 +217,15 @@
 
     def emit_operation(self, op):
         self.emitting_operation(op)
-        self.next_optimization.propagate_forward(op)
+        if self.posponedop:
+            posponedop = self.posponedop
+            self.posponedop = None
+            self.next_optimization.propagate_forward(posponedop)
+        if (op.is_comparison() or op.getopnum() == rop.CALL_MAY_FORCE
+            or op.is_ovf()):
+            self.posponedop = op
+        else:
+            self.next_optimization.propagate_forward(op)
 
     def emitting_operation(self, op):
         if op.has_no_side_effect():
@@ -293,30 +307,6 @@
             if indexvalue is None or indexvalue.intbound.contains(idx):
                 cf.force_lazy_setfield(self, can_cache)
 
-    def fixup_guard_situation(self):
-        # hackish: reverse the order of the last two operations if it makes
-        # sense to avoid a situation like "int_eq/setfield_gc/guard_true",
-        # which the backend (at least the x86 backend) does not handle well.
-        newoperations = self.optimizer.newoperations
-        if len(newoperations) < 2:
-            return
-        lastop = newoperations[-1]
-        if (lastop.getopnum() != rop.SETFIELD_GC and
-            lastop.getopnum() != rop.SETARRAYITEM_GC):
-            return
-        # - is_comparison() for cases like "int_eq/setfield_gc/guard_true"
-        # - CALL_MAY_FORCE: "call_may_force/setfield_gc/guard_not_forced"
-        # - is_ovf(): "int_add_ovf/setfield_gc/guard_no_overflow"
-        prevop = newoperations[-2]
-        opnum = prevop.getopnum()
-        if not (prevop.is_comparison() or opnum == rop.CALL_MAY_FORCE
-                or prevop.is_ovf()):
-            return
-        if prevop.result in lastop.getarglist():
-            return
-        newoperations[-2] = lastop
-        newoperations[-1] = prevop
-
     def _assert_valid_cf(self, cf):
         # check that 'cf' is in cached_fields or cached_arrayitems
         if not we_are_translated():
@@ -362,7 +352,6 @@
                                       fieldvalue.get_key_box(), itemindex))
             else:
                 cf.force_lazy_setfield(self)
-                self.fixup_guard_situation()
         return pendingfields
 
     def optimize_GETFIELD_GC(self, op):
@@ -374,12 +363,22 @@
             return
         # default case: produce the operation
         structvalue.ensure_nonnull()
-        ###self.optimizer.optimize_default(op)
         self.emit_operation(op)
         # then remember the result of reading the field
         fieldvalue = self.getvalue(op.result)
         cf.remember_field_value(structvalue, fieldvalue, op)
 
+    def optimize_GETFIELD_GC_PURE(self, op):
+        structvalue = self.getvalue(op.getarg(0))
+        cf = self.field_cache(op.getdescr())
+        fieldvalue = cf.getfield_from_cache(self, structvalue)
+        if fieldvalue is not None:
+            self.make_equal_to(op.result, fieldvalue)
+            return
+        # default case: produce the operation
+        structvalue.ensure_nonnull()
+        self.emit_operation(op)
+
     def optimize_SETFIELD_GC(self, op):
         if self.has_pure_result(rop.GETFIELD_GC_PURE, [op.getarg(0)],
                                 op.getdescr()):
@@ -389,6 +388,7 @@
         #
         cf = self.field_cache(op.getdescr())
         cf.do_setfield(self, op)
+        
 
     def optimize_GETARRAYITEM_GC(self, op):
         arrayvalue = self.getvalue(op.getarg(0))
@@ -413,6 +413,25 @@
             fieldvalue = self.getvalue(op.result)
             cf.remember_field_value(arrayvalue, fieldvalue, op)
 
+    def optimize_GETARRAYITEM_GC_PURE(self, op):
+        arrayvalue = self.getvalue(op.getarg(0))
+        indexvalue = self.getvalue(op.getarg(1))
+        cf = None
+        if indexvalue.is_constant():
+            arrayvalue.make_len_gt(MODE_ARRAY, op.getdescr(), indexvalue.box.getint())
+            # use the cache on (arraydescr, index), which is a constant
+            cf = self.arrayitem_cache(op.getdescr(), indexvalue.box.getint())
+            fieldvalue = cf.getfield_from_cache(self, arrayvalue)
+            if fieldvalue is not None:
+                self.make_equal_to(op.result, fieldvalue)
+                return
+        else:
+            # variable index, so make sure the lazy setarrayitems are done
+            self.force_lazy_setarrayitem(op.getdescr(), indexvalue=indexvalue)
+        # default case: produce the operation
+        arrayvalue.ensure_nonnull()
+        self.emit_operation(op)
+
     def optimize_SETARRAYITEM_GC(self, op):
         if self.has_pure_result(rop.GETARRAYITEM_GC_PURE, [op.getarg(0),
                                                            op.getarg(1)],
diff --git a/pypy/jit/metainterp/optimizeopt/optimizer.py b/pypy/jit/metainterp/optimizeopt/optimizer.py
--- a/pypy/jit/metainterp/optimizeopt/optimizer.py
+++ b/pypy/jit/metainterp/optimizeopt/optimizer.py
@@ -31,8 +31,8 @@
 
 class OptValue(object):
     __metaclass__ = extendabletype
-    _attrs_ = ('box', 'known_class', 'last_guard_index', 'level', 'intbound', 'lenbound')
-    last_guard_index = -1
+    _attrs_ = ('box', 'known_class', 'last_guard', 'level', 'intbound', 'lenbound')
+    last_guard = None
 
     level = LEVEL_UNKNOWN
     known_class = None
@@ -100,7 +100,7 @@
             self.make_constant(other.get_key_box())
             optimizer.turned_constant(self)
         elif other.level == LEVEL_KNOWNCLASS:
-            self.make_constant_class(other.known_class, -1)
+            self.make_constant_class(other.known_class, None)
         else:
             if other.level == LEVEL_NONNULL:
                 self.ensure_nonnull()
@@ -114,13 +114,13 @@
                     self.lenbound = other.lenbound.clone()
 
 
-    def force_box(self):
+    def force_box(self, optforce):
         return self.box
 
     def get_key_box(self):
         return self.box
 
-    def force_at_end_of_preamble(self, already_forced):
+    def force_at_end_of_preamble(self, already_forced, optforce):
         return self
 
     def get_args_for_fail(self, modifier):
@@ -162,16 +162,16 @@
         else:
             return None
 
-    def make_constant_class(self, classbox, opindex):
+    def make_constant_class(self, classbox, guardop):
         assert self.level < LEVEL_KNOWNCLASS
         self.known_class = classbox
         self.level = LEVEL_KNOWNCLASS
-        self.last_guard_index = opindex
+        self.last_guard = guardop
 
-    def make_nonnull(self, opindex):
+    def make_nonnull(self, guardop):
         assert self.level < LEVEL_NONNULL
         self.level = LEVEL_NONNULL
-        self.last_guard_index = opindex
+        self.last_guard = guardop
 
     def is_nonnull(self):
         level = self.level
@@ -240,26 +240,12 @@
     def __init__(self):
         pass # make rpython happy
 
-    def propagate_begin_forward(self):
-        if self.next_optimization:
-            self.next_optimization.propagate_begin_forward()
-
-    def propagate_end_forward(self):
-        if self.next_optimization:
-            self.next_optimization.propagate_end_forward()
-
     def propagate_forward(self, op):
         raise NotImplementedError
 
     def emit_operation(self, op):
         self.next_optimization.propagate_forward(op)
 
-    def test_emittable(self, op):
-        return self.is_emittable(op)
-
-    def is_emittable(self, op):
-        return self.next_optimization.test_emittable(op)
-
     # FIXME: Move some of these here?
     def getvalue(self, box):
         return self.optimizer.getvalue(box)
@@ -289,19 +275,19 @@
         return self.optimizer.new_const_item(arraydescr)
 
     def pure(self, opnum, args, result):
-        op = ResOperation(opnum, args, result)
-        key = self.optimizer.make_args_key(op)
-        if key not in self.optimizer.pure_operations:
-            self.optimizer.pure_operations[key] = op
+        if self.optimizer.optpure:
+            self.optimizer.optpure.pure(opnum, args, result)
 
     def has_pure_result(self, opnum, args, descr):
-        op = ResOperation(opnum, args, None, descr)
-        key = self.optimizer.make_args_key(op)
-        op = self.optimizer.pure_operations.get(key, None)
-        if op is None:
-            return False
-        return op.getdescr() is descr
+        if self.optimizer.optpure:
+            return self.optimizer.optpure.has_pure_result(opnum, args, descr)
+        return False
 
+    def get_pure_result(self, key):    
+        if self.optimizer.optpure:
+            return self.optimizer.optpure.get_pure_result(key)
+        return None
+    
     def setup(self):
         pass
 
@@ -322,6 +308,9 @@
     def produce_potential_short_preamble_ops(self, potential_ops):
         pass
 
+    def forget_numberings(self, box):
+        self.optimizer.forget_numberings(box)
+
 class Optimizer(Optimization):
 
     def __init__(self, metainterp_sd, loop, optimizations=None, bridge=False):
@@ -333,14 +322,16 @@
         self.interned_refs = self.cpu.ts.new_ref_dict()
         self.resumedata_memo = resume.ResumeDataLoopMemo(metainterp_sd)
         self.bool_boxes = {}
-        self.pure_operations = args_dict()
         self.producer = {}
         self.pendingfields = []
-        self.posponedop = None
         self.exception_might_have_happened = False
         self.quasi_immutable_deps = None
         self.opaque_pointers = {}
-        self.newoperations = []
+        self.replaces_guard = {}
+        self._newoperations = []
+        self.optimizer = self
+        self.optpure = None
+        self.optearlyforce = None
         if loop is not None:
             self.call_pure_results = loop.call_pure_results
 
@@ -369,21 +360,20 @@
     def flush(self):
         for o in self.optimizations:
             o.flush()
-        assert self.posponedop is None
 
     def new(self):
         new = Optimizer(self.metainterp_sd, self.loop)
         return self._new(new)
 
     def _new(self, new):
-        assert self.posponedop is None
         optimizations = [o.new() for o in self.optimizations]
         new.set_optimizations(optimizations)
         new.quasi_immutable_deps = self.quasi_immutable_deps
         return new
 
     def produce_potential_short_preamble_ops(self, sb):
-        raise NotImplementedError('This is implemented in unroll.UnrollableOptimizer')
+        for opt in self.optimizations:
+            opt.produce_potential_short_preamble_ops(sb)
 
     def turned_constant(self, value):
         for o in self.optimizations:
@@ -433,6 +423,13 @@
             return constbox
         return None
 
+    def get_newoperations(self):
+        self.flush()
+        return self._newoperations
+
+    def clear_newoperations(self):
+        self._newoperations = []
+
     def make_equal_to(self, box, value, replace=False):
         assert isinstance(value, OptValue)
         assert replace or box not in self.values
@@ -481,15 +478,10 @@
 
     def propagate_all_forward(self):
         self.exception_might_have_happened = self.bridge
-        self.newoperations = []
-        self.first_optimization.propagate_begin_forward()
-        self.i = 0
-        while self.i < len(self.loop.operations):
-            op = self.loop.operations[self.i]
+        self.clear_newoperations()
+        for op in self.loop.operations:
             self.first_optimization.propagate_forward(op)
-            self.i += 1
-        self.first_optimization.propagate_end_forward()
-        self.loop.operations = self.newoperations
+        self.loop.operations = self.get_newoperations()
         self.loop.quasi_immutable_deps = self.quasi_immutable_deps
         # accumulate counters
         self.resumedata_memo.update_counters(self.metainterp_sd.profiler)
@@ -501,9 +493,6 @@
         self.producer[op.result] = op
         dispatch_opt(self, op)
 
-    def test_emittable(self, op):
-        return True
-
     def emit_operation(self, op):
         if op.returns_bool_result():
             self.bool_boxes[self.getvalue(op.result)] = None
@@ -519,14 +508,30 @@
                 pass
             else:
                 self.ensure_imported(value)
-                op.setarg(i, value.force_box())
+                op.setarg(i, value.force_box(self))
         self.metainterp_sd.profiler.count(jitprof.OPT_OPS)
         if op.is_guard():
             self.metainterp_sd.profiler.count(jitprof.OPT_GUARDS)
-            op = self.store_final_boxes_in_guard(op)
+            if self.replaces_guard and op in self.replaces_guard:
+                self.replace_op(self.replaces_guard[op], op)
+                del self.replaces_guard[op]
+                return
+            else:
+                op = self.store_final_boxes_in_guard(op)
         elif op.can_raise():
             self.exception_might_have_happened = True
-        self.newoperations.append(op)
+        self._newoperations.append(op)
+
+    def replace_op(self, old_op, new_op):
+        # XXX: Do we want to cache indexes to prevent search?
+        i = len(self._newoperations) 
+        while i > 0:
+            i -= 1
+            if self._newoperations[i] is old_op:
+                self._newoperations[i] = new_op
+                break
+        else:
+            assert False
 
     def store_final_boxes_in_guard(self, op):
         descr = op.getdescr()
@@ -574,51 +579,8 @@
         args[n + 1] = op.getdescr()
         return args
 
-    @specialize.argtype(0)
     def optimize_default(self, op):
-        canfold = op.is_always_pure()
-        if op.is_ovf():
-            self.posponedop = op
-            return
-        if self.posponedop:
-            nextop = op
-            op = self.posponedop
-            self.posponedop = None
-            canfold = nextop.getopnum() == rop.GUARD_NO_OVERFLOW
-        else:
-            nextop = None
-
-        if canfold:
-            for i in range(op.numargs()):
-                if self.get_constant_box(op.getarg(i)) is None:
-                    break
-            else:
-                # all constant arguments: constant-fold away
-                resbox = self.constant_fold(op)
-                # note that INT_xxx_OVF is not done from here, and the
-                # overflows in the INT_xxx operations are ignored
-                self.make_constant(op.result, resbox)
-                return
-
-            # did we do the exact same operation already?
-            args = self.make_args_key(op)
-            oldop = self.pure_operations.get(args, None)
-            if oldop is not None and oldop.getdescr() is op.getdescr():
-                assert oldop.getopnum() == op.getopnum()
-                self.make_equal_to(op.result, self.getvalue(oldop.result),
-                                   True)
-                return
-            else:
-                self.pure_operations[args] = op
-                self.remember_emitting_pure(op)
-
-        # otherwise, the operation remains
         self.emit_operation(op)
-        if nextop:
-            self.emit_operation(nextop)
-
-    def remember_emitting_pure(self, op):
-        pass
 
     def constant_fold(self, op):
         argboxes = [self.get_constant_box(op.getarg(i))
@@ -636,11 +598,6 @@
     def optimize_DEBUG_MERGE_POINT(self, op):
         self.emit_operation(op)
 
-    def optimize_CAST_OPAQUE_PTR(self, op):
-        value = self.getvalue(op.getarg(0))
-        self.opaque_pointers[value] = True
-        self.make_equal_to(op.result, value)
-
     def optimize_GETARRAYITEM_GC_PURE(self, op):
         indexvalue = self.getvalue(op.getarg(1))
         if indexvalue.is_constant():
diff --git a/pypy/jit/metainterp/optimizeopt/pure.py b/pypy/jit/metainterp/optimizeopt/pure.py
new file mode 100644
--- /dev/null
+++ b/pypy/jit/metainterp/optimizeopt/pure.py
@@ -0,0 +1,115 @@
+from pypy.jit.metainterp.optimizeopt.optimizer import Optimization
+from pypy.jit.metainterp.resoperation import rop, ResOperation
+from pypy.jit.metainterp.optimizeopt.util import (make_dispatcher_method,
+    args_dict)
+
+class OptPure(Optimization):
+    def __init__(self):
+        self.posponedop = None
+        self.pure_operations = args_dict()
+        self.emitted_pure_operations = {}
+
+    def propagate_forward(self, op):
+        dispatch_opt(self, op)
+
+    def optimize_default(self, op):
+        canfold = op.is_always_pure()
+        if op.is_ovf():
+            self.posponedop = op
+            return
+        if self.posponedop:
+            nextop = op
+            op = self.posponedop
+            self.posponedop = None
+            canfold = nextop.getopnum() == rop.GUARD_NO_OVERFLOW
+        else:
+            nextop = None
+
+        if canfold:
+            for i in range(op.numargs()):
+                if self.get_constant_box(op.getarg(i)) is None:
+                    break
+            else:
+                # all constant arguments: constant-fold away
+                resbox = self.optimizer.constant_fold(op)
+                # note that INT_xxx_OVF is not done from here, and the
+                # overflows in the INT_xxx operations are ignored
+                self.optimizer.make_constant(op.result, resbox)
+                return
+
+            # did we do the exact same operation already?
+            args = self.optimizer.make_args_key(op)
+            oldop = self.pure_operations.get(args, None)
+            if oldop is not None and oldop.getdescr() is op.getdescr():
+                assert oldop.getopnum() == op.getopnum()
+                self.optimizer.make_equal_to(op.result, self.getvalue(oldop.result),
+                                   True)
+                return
+            else:
+                self.pure_operations[args] = op
+                self.remember_emitting_pure(op)
+
+        # otherwise, the operation remains
+        self.emit_operation(op)
+        if op.returns_bool_result():
+            self.optimizer.bool_boxes[self.getvalue(op.result)] = None        
+        if nextop:
+            self.emit_operation(nextop)
+
+    def optimize_CALL_PURE(self, op):
+        args = self.optimizer.make_args_key(op)
+        oldop = self.pure_operations.get(args, None)
+        if oldop is not None and oldop.getdescr() is op.getdescr():
+            assert oldop.getopnum() == op.getopnum()
+            self.make_equal_to(op.result, self.getvalue(oldop.result))
+            return
+        else:
+            self.pure_operations[args] = op
+            self.remember_emitting_pure(op)
+
+        # replace CALL_PURE with just CALL
+        args = op.getarglist()
+        self.emit_operation(ResOperation(rop.CALL, args, op.result,
+                                         op.getdescr()))
+
+    def flush(self):
+        assert self.posponedop is None
+
+    def new(self):
+        assert self.posponedop is None
+        return OptPure()
+
+    def setup(self):
+        self.optimizer.optpure = self
+
+    def pure(self, opnum, args, result):
+        op = ResOperation(opnum, args, result)
+        key = self.optimizer.make_args_key(op)
+        if key not in self.pure_operations:
+            self.pure_operations[key] = op
+
+    def has_pure_result(self, opnum, args, descr):
+        op = ResOperation(opnum, args, None, descr)
+        key = self.optimizer.make_args_key(op)
+        op = self.pure_operations.get(key, None)
+        if op is None:
+            return False
+        return op.getdescr() is descr
+
+    def get_pure_result(self, key):
+        return self.pure_operations.get(key, None)
+
+    def remember_emitting_pure(self, op):
+        self.emitted_pure_operations[op] = True
+
+    def produce_potential_short_preamble_ops(self, sb):
+        for op in self.emitted_pure_operations:
+            if op.getopnum() == rop.GETARRAYITEM_GC_PURE or \
+               op.getopnum() == rop.STRGETITEM or \
+               op.getopnum() == rop.UNICODEGETITEM:
+                if not self.getvalue(op.getarg(1)).is_constant():
+                    continue
+            sb.add_potential(op)
+
+dispatch_opt = make_dispatcher_method(OptPure, 'optimize_',
+                                      default=OptPure.optimize_default)
diff --git a/pypy/jit/metainterp/optimizeopt/rewrite.py b/pypy/jit/metainterp/optimizeopt/rewrite.py
--- a/pypy/jit/metainterp/optimizeopt/rewrite.py
+++ b/pypy/jit/metainterp/optimizeopt/rewrite.py
@@ -31,21 +31,8 @@
 
         dispatch_opt(self, op)
 
-    def test_emittable(self, op):
-        opnum = op.getopnum()
-        for value, cls, func in optimize_guards:
-            if opnum == value:
-                assert isinstance(op, cls)
-                try:
-                    func(self, op, dryrun=True)
-                    return self.is_emittable(op)
-                except InvalidLoop:
-                    return False
-        return self.is_emittable(op)
-
-
     def try_boolinvers(self, op, targs):
-        oldop = self.optimizer.pure_operations.get(targs, None)
+        oldop = self.get_pure_result(targs)
         if oldop is not None and oldop.getdescr() is op.getdescr():
             value = self.getvalue(oldop.result)
             if value.is_constant():
@@ -73,7 +60,7 @@
             oldopnum = opboolreflex[op.getopnum()] # FIXME: add INT_ADD, INT_MUL
             targs = self.optimizer.make_args_key(ResOperation(oldopnum, [args[1], args[0]],
                                                               None))
-            oldop = self.optimizer.pure_operations.get(targs, None)
+            oldop = self.get_pure_result(targs)
             if oldop is not None and oldop.getdescr() is op.getdescr():
                 self.make_equal_to(op.result, self.getvalue(oldop.result))
                 return True
@@ -214,40 +201,7 @@
         self.emit_operation(op)
         self.pure(rop.FLOAT_NEG, [op.result], v1)
 
-    def optimize_CALL_PURE(self, op):
-        arg_consts = []
-        for i in range(op.numargs()):
-            arg = op.getarg(i)
-            const = self.get_constant_box(arg)
-            if const is None:
-                break
-            arg_consts.append(const)
-        else:
-            # all constant arguments: check if we already know the result
-            try:
-                result = self.optimizer.call_pure_results[arg_consts]
-            except KeyError:
-                pass
-            else:
-                self.make_constant(op.result, result)
-                return
-
-        args = self.optimizer.make_args_key(op)
-        oldop = self.optimizer.pure_operations.get(args, None)
-        if oldop is not None and oldop.getdescr() is op.getdescr():
-            assert oldop.getopnum() == op.getopnum()
-            self.make_equal_to(op.result, self.getvalue(oldop.result))
-            return
-        else:
-            self.optimizer.pure_operations[args] = op
-            self.optimizer.remember_emitting_pure(op)
-
-        # replace CALL_PURE with just CALL
-        args = op.getarglist()
-        self.emit_operation(ResOperation(rop.CALL, args, op.result,
-                                         op.getdescr()))
-
-    def optimize_guard(self, op, constbox, emit_operation=True, dryrun=False):
+    def optimize_guard(self, op, constbox, emit_operation=True):
         value = self.getvalue(op.getarg(0))
         if value.is_constant():
             box = value.box
@@ -255,62 +209,57 @@
             if not box.same_constant(constbox):
                 raise InvalidLoop
             return
-        if dryrun: return
         if emit_operation:
             self.emit_operation(op)
         value.make_constant(constbox)
         self.optimizer.turned_constant(value)
 
-    def optimize_GUARD_ISNULL(self, op, dryrun=False):
+    def optimize_GUARD_ISNULL(self, op):
         value = self.getvalue(op.getarg(0))
         if value.is_null():
             return
         elif value.is_nonnull():
             raise InvalidLoop
-        if dryrun: return
         self.emit_operation(op)
         value.make_constant(self.optimizer.cpu.ts.CONST_NULL)
 
-    def optimize_GUARD_NONNULL(self, op, dryrun=False):
+    def optimize_GUARD_NONNULL(self, op):
         value = self.getvalue(op.getarg(0))
         if value.is_nonnull():
             return
         elif value.is_null():
             raise InvalidLoop
-        if dryrun: return
         self.emit_operation(op)
-        value.make_nonnull(len(self.optimizer.newoperations) - 1)
+        value.make_nonnull(op)
 
-    def optimize_GUARD_VALUE(self, op, dryrun=False):
+    def optimize_GUARD_VALUE(self, op):
         value = self.getvalue(op.getarg(0))
-        emit_operation = True
-        if not dryrun and value.last_guard_index != -1:
+        if value.last_guard:
             # there already has been a guard_nonnull or guard_class or
             # guard_nonnull_class on this value, which is rather silly.
             # replace the original guard with a guard_value
-            old_guard_op = self.optimizer.newoperations[value.last_guard_index]
-            new_guard_op = old_guard_op.copy_and_change(rop.GUARD_VALUE,
-                                             args = [old_guard_op.getarg(0), op.getarg(1)])
-            self.optimizer.newoperations[value.last_guard_index] = new_guard_op
+            old_guard_op = value.last_guard
+            op = old_guard_op.copy_and_change(rop.GUARD_VALUE,
+                                      args = [old_guard_op.getarg(0), op.getarg(1)])
+            self.optimizer.replaces_guard[op] = old_guard_op
             # hack hack hack.  Change the guard_opnum on
             # new_guard_op.getdescr() so that when resuming,
             # the operation is not skipped by pyjitpl.py.
-            descr = new_guard_op.getdescr()
+            descr = op.getdescr()
             assert isinstance(descr, compile.ResumeGuardDescr)
             descr.guard_opnum = rop.GUARD_VALUE
-            descr.make_a_counter_per_value(new_guard_op)
-            emit_operation = False
+            descr.make_a_counter_per_value(op)
         constbox = op.getarg(1)
         assert isinstance(constbox, Const)
-        self.optimize_guard(op, constbox, emit_operation, dryrun)
+        self.optimize_guard(op, constbox)
 
-    def optimize_GUARD_TRUE(self, op, dryrun=False):
-        self.optimize_guard(op, CONST_1, dryrun=dryrun)
+    def optimize_GUARD_TRUE(self, op):
+        self.optimize_guard(op, CONST_1)
 
-    def optimize_GUARD_FALSE(self, op, dryrun=False):
-        self.optimize_guard(op, CONST_0, dryrun=dryrun)
+    def optimize_GUARD_FALSE(self, op):
+        self.optimize_guard(op, CONST_0)
 
-    def optimize_GUARD_CLASS(self, op, dryrun=False):
+    def optimize_GUARD_CLASS(self, op):
         value = self.getvalue(op.getarg(0))
         expectedclassbox = op.getarg(1)
         assert isinstance(expectedclassbox, Const)
@@ -319,38 +268,32 @@
             if realclassbox.same_constant(expectedclassbox):
                 return
             raise InvalidLoop
-        if dryrun: return
-        emit_operation = True
-        if value.last_guard_index != -1:
+        if value.last_guard:
             # there already has been a guard_nonnull or guard_class or
             # guard_nonnull_class on this value.
-            old_guard_op = self.optimizer.newoperations[value.last_guard_index]
+            old_guard_op = value.last_guard
             if old_guard_op.getopnum() == rop.GUARD_NONNULL:
                 # it was a guard_nonnull, which we replace with a
                 # guard_nonnull_class.
-                new_guard_op = old_guard_op.copy_and_change (rop.GUARD_NONNULL_CLASS,
+                op = old_guard_op.copy_and_change (rop.GUARD_NONNULL_CLASS,
                                          args = [old_guard_op.getarg(0), op.getarg(1)])
-                self.optimizer.newoperations[value.last_guard_index] = new_guard_op
+                self.optimizer.replaces_guard[op] = old_guard_op
                 # hack hack hack.  Change the guard_opnum on
                 # new_guard_op.getdescr() so that when resuming,
                 # the operation is not skipped by pyjitpl.py.
-                descr = new_guard_op.getdescr()
+                descr = op.getdescr()
                 assert isinstance(descr, compile.ResumeGuardDescr)
                 descr.guard_opnum = rop.GUARD_NONNULL_CLASS
-                emit_operation = False
-        if emit_operation:
-            self.emit_operation(op)
-            last_guard_index = len(self.optimizer.newoperations) - 1
-        else:
-            last_guard_index = value.last_guard_index
-        value.make_constant_class(expectedclassbox, last_guard_index)
+        self.emit_operation(op)
+        value.make_constant_class(expectedclassbox, op)
 
-    def optimize_GUARD_NONNULL_CLASS(self, op, dryrun=False):
-        self.optimize_GUARD_NONNULL(op, True)
-        self.optimize_GUARD_CLASS(op, dryrun)
+    def optimize_GUARD_NONNULL_CLASS(self, op):
+        value = self.getvalue(op.getarg(0))
+        if value.is_null():
+            raise InvalidLoop
+        self.optimize_GUARD_CLASS(op)
 
-    def optimize_GUARD_NO_EXCEPTION(self, op, dryrun=False):
-        if dryrun: return
+    def optimize_GUARD_NO_EXCEPTION(self, op):
         if not self.optimizer.exception_might_have_happened:
             return
         self.emit_operation(op)
@@ -470,7 +413,7 @@
                     newop = ResOperation(rop.SETARRAYITEM_GC,
                                          [op.getarg(2),
                                           ConstInt(index + dest_start),
-                                          val.force_box()], None,
+                                          val.get_key_box()], None,
                                          descr=source_value.arraydescr)
                     self.emit_operation(newop)
             return True
@@ -478,6 +421,25 @@
             return True # 0-length arraycopy
         return False
 
+    def optimize_CALL_PURE(self, op):
+        arg_consts = []
+        for i in range(op.numargs()):
+            arg = op.getarg(i)
+            const = self.get_constant_box(arg)
+            if const is None:
+                break
+            arg_consts.append(const)
+        else:
+            # all constant arguments: check if we already know the result
+            try:
+                result = self.optimizer.call_pure_results[arg_consts]
+            except KeyError:
+                pass
+            else:
+                self.make_constant(op.result, result)
+                return
+        self.emit_operation(op)
+
     def optimize_INT_FLOORDIV(self, op):
         v1 = self.getvalue(op.getarg(0))
         v2 = self.getvalue(op.getarg(1))
@@ -492,6 +454,10 @@
                                         args = [op.getarg(0), ConstInt(highest_bit(val))])
         self.emit_operation(op)
 
+    def optimize_CAST_OPAQUE_PTR(self, op):
+        value = self.getvalue(op.getarg(0))
+        self.optimizer.opaque_pointers[value] = True
+        self.make_equal_to(op.result, value)
 
 dispatch_opt = make_dispatcher_method(OptRewrite, 'optimize_',
         default=OptRewrite.emit_operation)
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
@@ -39,8 +39,9 @@
 
 def test_sharing_field_lists_of_virtual():
     class FakeOptimizer(object):
-        class cpu(object):
-            pass
+        class optimizer(object):
+            class cpu(object):
+                pass
     opt = FakeOptimizer()
     virt1 = virtualize.AbstractVirtualStructValue(opt, None)
     lst1 = virt1._get_field_descr_list()
@@ -69,7 +70,7 @@
     class FakeVirtualValue(virtualize.AbstractVirtualValue):
         def _make_virtual(self, *args):
             return FakeVInfo()
-    v1 = FakeVirtualValue(None, None, None)
+    v1 = FakeVirtualValue(None, None)
     vinfo1 = v1.make_virtual_info(None, [1, 2, 4])
     vinfo2 = v1.make_virtual_info(None, [1, 2, 4])
     assert vinfo1 is vinfo2
@@ -111,7 +112,7 @@
 
 class BaseTestBasic(BaseTest):
 
-    enable_opts = "intbounds:rewrite:virtualize:string:heap"
+    enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap"
 
     def optimize_loop(self, ops, optops, call_pure_results=None):
 
@@ -651,8 +652,8 @@
         i3 = getfield_gc(p3, descr=valuedescr)
         escape(i3)
         p1 = new_with_vtable(ConstClass(node_vtable))
+        p1sub = new_with_vtable(ConstClass(node_vtable2))
         setfield_gc(p1, i1, descr=valuedescr)
-        p1sub = new_with_vtable(ConstClass(node_vtable2))
         setfield_gc(p1sub, i1, descr=valuedescr)
         setfield_gc(p1, p1sub, descr=nextdescr)
         jump(i1, p1, p2)
@@ -667,10 +668,10 @@
         p3sub = getfield_gc(p3, descr=nextdescr)
         i3 = getfield_gc(p3sub, descr=valuedescr)
         escape(i3)
+        p1 = new_with_vtable(ConstClass(node_vtable))
         p2sub = new_with_vtable(ConstClass(node_vtable2))
         setfield_gc(p2sub, i1, descr=valuedescr)
         setfield_gc(p2, p2sub, descr=nextdescr)
-        p1 = new_with_vtable(ConstClass(node_vtable))
         jump(i1, p1, p2)
         """
         # The same as test_p123_simple, but in the end the "old" p2 contains
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
@@ -36,7 +36,7 @@
 
 class TestFfiCall(BaseTestBasic, LLtypeMixin):
 
-    enable_opts = "intbounds:rewrite:virtualize:string:heap:ffi"
+    enable_opts = "intbounds:rewrite:virtualize:string:pure:earlyforce:heap:ffi"
 
     class namespace:
         cpu = LLtypeMixin.cpu
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
@@ -68,7 +68,7 @@
 
 class BaseTestWithUnroll(BaseTest):
 
-    enable_opts = "intbounds:rewrite:virtualize:string:heap:unroll"
+    enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll"
 
     def optimize_loop(self, ops, expected, expected_preamble=None,
                       call_pure_results=None, expected_short=None):
@@ -825,8 +825,8 @@
         i3 = getfield_gc(p2, descr=valuedescr)
         escape(i3)
         p4 = new_with_vtable(ConstClass(node_vtable))
+        p1sub = new_with_vtable(ConstClass(node_vtable2))
         setfield_gc(p4, i1, descr=valuedescr)
-        p1sub = new_with_vtable(ConstClass(node_vtable2))
         setfield_gc(p1sub, i1, descr=valuedescr)
         setfield_gc(p4, p1sub, descr=nextdescr)
         jump(i1, p4)
@@ -865,13 +865,7 @@
         p3sub = new_with_vtable(ConstClass(node_vtable2))
         setfield_gc(p3sub, i1, descr=valuedescr)
         setfield_gc(p1, p3sub, descr=nextdescr)
-        # XXX: We get two extra operations here because the setfield
-        #      above is the result of forcing p1 and thus not
-        #      registered with the heap optimizer. I've makred tests
-        #      below with VIRTUALHEAP if they suffer from this issue
-        p3sub2 = getfield_gc(p1, descr=nextdescr)
-        guard_nonnull_class(p3sub2, ConstClass(node_vtable2)) []
-        jump(i1, p1, p3sub2)
+        jump(i1, p1, p3sub)
         """
         self.optimize_loop(ops, expected, preamble)
 
@@ -902,9 +896,7 @@
         guard_true(i2b) []
         p3 = new_with_vtable(ConstClass(node_vtable))
         setfield_gc(p3, i2, descr=nextdescr)
-        # XXX: VIRTUALHEAP (see above)
-        i3 = getfield_gc(p3, descr=nextdescr)
-        jump(p3, i3)
+        jump(p3, i2)
         """
         self.optimize_loop(ops, expected, preamble)
 
@@ -1219,7 +1211,15 @@
         setfield_gc(p3, p30, descr=valuedescr)
         jump(i29, p30, p3)
         """
-        expected = preamble
+        expected = """
+        [i0, p1, p3]
+        i28 = int_add(i0, 1)
+        i29 = int_add(i28, 1)
+        p30 = new_with_vtable(ConstClass(node_vtable))
+        setfield_gc(p3, p30, descr=valuedescr)
+        setfield_gc(p30, i28, descr=nextdescr)
+        jump(i29, p30, p3)
+        """
         self.optimize_loop(ops, expected, preamble)
 
     def test_nonvirtual_1(self):
@@ -2408,8 +2408,8 @@
         guard_class(p2, ConstClass(node_vtable)) []
         p3 = getfield_gc(p1, descr=otherdescr)
         guard_class(p3, ConstClass(node_vtable)) []
+        p3a = new_with_vtable(ConstClass(node_vtable))
         setfield_gc(p3, p2, descr=otherdescr)
-        p3a = new_with_vtable(ConstClass(node_vtable))
         escape(p3a)
         jump(p3a)
         """
@@ -2421,9 +2421,9 @@
         # setfield_gc(p3, p2, descr=otherdescr) # p3a.other = p2a
         # p1a = new_with_vtable(ConstClass(node_vtable2))
         # p2a = new_with_vtable(ConstClass(node_vtable))
+        p3anew = new_with_vtable(ConstClass(node_vtable))
         p2 = new_with_vtable(ConstClass(node_vtable))
         setfield_gc(p3a, p2, descr=otherdescr) # p3a.other = p2a
-        p3anew = new_with_vtable(ConstClass(node_vtable))
         escape(p3anew)
         jump(p3anew)
         """
@@ -2458,9 +2458,9 @@
         p3 = getfield_gc(p1, descr=otherdescr)
         guard_class(p3, ConstClass(node_vtable)) []
         # p1a = new_with_vtable(ConstClass(node_vtable2))
+        p3a = new_with_vtable(ConstClass(node_vtable))
         p2a = new_with_vtable(ConstClass(node_vtable))
         setfield_gc(p3, p2a, descr=otherdescr)
-        p3a = new_with_vtable(ConstClass(node_vtable))
         escape(p3a)
         # setfield_gc(p1a, p2a, descr=nextdescr)
         # setfield_gc(p1a, p3a, descr=otherdescr)
@@ -2468,9 +2468,9 @@
         """
         expected = """
         [p2, p3]
+        p3a = new_with_vtable(ConstClass(node_vtable))
         p2a = new_with_vtable(ConstClass(node_vtable))
         setfield_gc(p3, p2a, descr=otherdescr)
-        p3a = new_with_vtable(ConstClass(node_vtable))
         escape(p3a)
         jump(p2a, p3a)
         """
@@ -2790,7 +2790,6 @@
         self.optimize_loop(ops, expected, preamble)
 
     def test_remove_duplicate_pure_op_ovf_with_lazy_setfield(self):
-        py.test.skip('this optimization is not yet supprted')
         ops = """
         [i1, p1]
         i3 = int_add_ovf(i1, 1)
@@ -5961,13 +5960,18 @@
         escape(i0)
         jump(p1)
         """
-        expected = """
+        preamble = """
         [p1]
         i0 = ptr_eq(p1, NULL)
         escape(i0)
-        jump(p1)
-        """
-        self.optimize_strunicode_loop_extradescrs(ops, expected, expected)
+        jump(p1, i0)
+        """
+        expected = """
+        [p1, i0]
+        escape(i0)
+        jump(p1, i0)
+        """
+        self.optimize_strunicode_loop_extradescrs(ops, expected, preamble)
 
     def test_str_equal_none2(self):
         ops = """
@@ -5976,13 +5980,18 @@
         escape(i0)
         jump(p1)
         """
-        expected = """
+        preamble = """
         [p1]
         i0 = ptr_eq(p1, NULL)
         escape(i0)
-        jump(p1)
-        """
-        self.optimize_strunicode_loop_extradescrs(ops, expected, expected)
+        jump(p1, i0)
+        """
+        expected = """
+        [p1, i0]
+        escape(i0)
+        jump(p1, i0)
+        """
+        self.optimize_strunicode_loop_extradescrs(ops, expected, preamble)
 
     def test_str_equal_nonnull1(self):
         ops = """
@@ -7220,6 +7229,29 @@
         """
         self.optimize_loop(ops, expected)
 
+    def test_heap_cache_forced_virtuals(self):
+        ops = """
+        [i1, i2, p0]
+        p1 = new(descr=ssize)
+        setfield_gc(p1, i1, descr=adescr)
+        setfield_gc(p1, i2, descr=bdescr)
+        call(p0, p1, descr=writeadescr)
+        i3 = getfield_gc(p1, descr=adescr)
+        i4 = getfield_gc(p1, descr=bdescr)
+        jump(i3, i4, p0)
+        """
+        expected = """
+        [i1, i2, p0]
+        p1 = new(descr=ssize)
+        setfield_gc(p1, i1, descr=adescr)
+        call(p0, p1, descr=writeadescr)
+        i3 = getfield_gc(p1, descr=adescr)
+        setfield_gc(p1, i2, descr=bdescr)
+        jump(i3, i2, p0)
+        """
+        self.optimize_loop(ops, expected)
+
+
     def test_setarrayitem_followed_by_arraycopy(self):
         ops = """
         [p1, p2]
@@ -7230,6 +7262,27 @@
         """
         self.optimize_loop(ops, ops)
 
+    def test_heap_cache_virtuals_forced_by_delayed_setfield(self):
+        py.test.skip('not yet supoprted')
+        ops = """
+        [i1, p0]
+        p1 = new(descr=ssize)
+        setfield_gc(p1, i1, descr=valuedescr)
+        setfield_gc(p0, p1, descr=adescr)
+        call(p0, descr=writeadescr)
+        i2 = getfield_gc(p1, descr=valuedescr)
+        jump(i2, p0)
+        """
+        expected = """
+        [i1, p0]
+        p1 = new(descr=ssize)
+        setfield_gc(p1, i1, descr=valuedescr)
+        setfield_gc(p0, p1, descr=adescr)
+        call(p0, descr=writeadescr)
+        jump(i1, p0)
+        """
+        self.optimize_loop(ops, expected)
+
 class TestLLtype(OptimizeOptTest, LLtypeMixin):
     pass
 
diff --git a/pypy/jit/metainterp/optimizeopt/unroll.py b/pypy/jit/metainterp/optimizeopt/unroll.py
--- a/pypy/jit/metainterp/optimizeopt/unroll.py
+++ b/pypy/jit/metainterp/optimizeopt/unroll.py
@@ -75,7 +75,6 @@
         self.importable_values = {}
         self.emitting_dissabled = False
         self.emitted_guards = 0
-        self.emitted_pure_operations = {}
 
     def ensure_imported(self, value):
         if not self.emitting_dissabled and value in self.importable_values:
@@ -96,21 +95,6 @@
         new = UnrollableOptimizer(self.metainterp_sd, self.loop)
         return self._new(new)
 
-    def remember_emitting_pure(self, op):
-        self.emitted_pure_operations[op] = True
-
-    def produce_potential_short_preamble_ops(self, sb):
-        for op in self.emitted_pure_operations:
-            if op.getopnum() == rop.GETARRAYITEM_GC_PURE or \
-               op.getopnum() == rop.STRGETITEM or \
-               op.getopnum() == rop.UNICODEGETITEM:
-                if not self.getvalue(op.getarg(1)).is_constant():
-                    continue
-            sb.add_potential(op)
-        for opt in self.optimizations:
-            opt.produce_potential_short_preamble_ops(sb)
-
-
 
 class UnrollOptimizer(Optimization):
     """Unroll the loop into two iterations. The first one will
@@ -154,7 +138,7 @@
 
             KillHugeIntBounds(self.optimizer).apply()
             
-            loop.preamble.operations = self.optimizer.newoperations
+            loop.preamble.operations = self.optimizer.get_newoperations()
             jump_args = [self.getvalue(a).get_key_box() for a in jump_args]
 
             start_resumedescr = loop.preamble.start_resumedescr.clone_if_mutable()
@@ -167,8 +151,9 @@
             virtual_state = modifier.get_virtual_state(jump_args)
             
             values = [self.getvalue(arg) for arg in jump_args]
-            inputargs = virtual_state.make_inputargs(values)
-            short_inputargs = virtual_state.make_inputargs(values, keyboxes=True)
+            inputargs = virtual_state.make_inputargs(values, self.optimizer)
+            short_inputargs = virtual_state.make_inputargs(values, self.optimizer,
+                                                           keyboxes=True)
 
             self.constant_inputargs = {}
             for box in jump_args: 
@@ -197,7 +182,7 @@
             # operations needed to setup the proper state of those virtuals
             # in the peeled loop
             inputarg_setup_ops = []
-            preamble_optimizer.newoperations = []
+            preamble_optimizer.clear_newoperations()
             seen = {}
             for box in inputargs:
                 if box in seen:
@@ -211,9 +196,8 @@
                     continue
                 seen[box] = True
                 value = preamble_optimizer.getvalue(box)
-                value.force_box()
-            preamble_optimizer.flush()
-            inputarg_setup_ops += preamble_optimizer.newoperations
+                value.force_box(preamble_optimizer)
+            inputarg_setup_ops += preamble_optimizer.get_newoperations()
 
             # Setup the state of the new optimizer by emiting the
             # short preamble operations and discarding the result
@@ -244,13 +228,13 @@
                                 virtual_state)
             
             loop.inputargs = inputargs
-            args = [preamble_optimizer.getvalue(self.short_boxes.original(a)).force_box()\
+            args = [preamble_optimizer.getvalue(self.short_boxes.original(a)).force_box(preamble_optimizer)\
                     for a in inputargs]
             jmp = ResOperation(rop.JUMP, args, None)
             jmp.setdescr(loop.token)
             loop.preamble.operations.append(jmp)
 
-            loop.operations = self.optimizer.newoperations
+            loop.operations = self.optimizer.get_newoperations()
             maxguards = self.optimizer.metainterp_sd.warmrunnerdesc.memory_manager.max_retrace_guards
             
             if self.optimizer.emitted_guards > maxguards:
@@ -335,9 +319,10 @@
         assert jumpop
         original_jumpargs = jumpop.getarglist()[:]
         values = [self.getvalue(arg) for arg in jumpop.getarglist()]
-        jumpargs = virtual_state.make_inputargs(values)
+        jumpargs = virtual_state.make_inputargs(values, self.optimizer)
         jumpop.initarglist(jumpargs)
-        jmp_to_short_args = virtual_state.make_inputargs(values, keyboxes=True)
+        jmp_to_short_args = virtual_state.make_inputargs(values, self.optimizer,
+                                                         keyboxes=True)
         self.short_inliner = Inliner(short_inputargs, jmp_to_short_args)
         
         for box, const in self.constant_inputargs.items():
@@ -347,11 +332,11 @@
             newop = self.short_inliner.inline_op(op)
             self.optimizer.send_extra_operation(newop)
         
-        self.optimizer.flush()
+        newoperations = self.optimizer.get_newoperations()
 
         i = j = 0
-        while i < len(self.optimizer.newoperations) or j < len(jumpargs):
-            if i == len(self.optimizer.newoperations):
+        while i < len(newoperations) or j < len(jumpargs):
+            if i == len(newoperations):
                 while j < len(jumpargs):
                     a = jumpargs[j]
                     if self.optimizer.loop.logops:
@@ -360,7 +345,7 @@
                                     jumpargs, short_seen)
                     j += 1
             else:
-                op = self.optimizer.newoperations[i]
+                op = newoperations[i]
 
                 self.boxes_created_this_iteration[op.result] = True
                 args = op.getarglist()
@@ -375,6 +360,7 @@
                     self.import_box(a, inputargs, short, short_jumpargs,
                                     jumpargs, short_seen)
                 i += 1
+            newoperations = self.optimizer.get_newoperations()
 
         jumpop.initarglist(jumpargs)
         self.optimizer.send_extra_operation(jumpop)
@@ -468,7 +454,7 @@
         inputargs.append(box)
         box = newresult
         if box in self.optimizer.values:
-            box = self.optimizer.values[box].force_box()
+            box = self.optimizer.values[box].force_box(self.optimizer)
         jumpargs.append(box)
         
 
@@ -483,11 +469,6 @@
         if op.getopnum() == rop.JUMP:
             loop_token = op.getdescr()
             assert isinstance(loop_token, LoopToken)
-            # FIXME: Use a tree, similar to the tree formed by the full
-            # preamble and it's bridges, instead of a list to save time and
-            # memory. This should also allow better behaviour in
-            # situations that the is_emittable() chain currently cant
-            # handle and the inlining fails unexpectedly belwo.
             short = loop_token.short_preamble
             if short:
                 args = op.getarglist()
@@ -523,7 +504,7 @@
 
                         values = [self.getvalue(arg)
                                   for arg in op.getarglist()]
-                        args = sh.virtual_state.make_inputargs(values,
+                        args = sh.virtual_state.make_inputargs(values, self.optimizer,
                                                                keyboxes=True)
                         inliner = Inliner(sh.inputargs, args)
                         
diff --git a/pypy/jit/metainterp/optimizeopt/virtualize.py b/pypy/jit/metainterp/optimizeopt/virtualize.py
--- a/pypy/jit/metainterp/optimizeopt/virtualize.py
+++ b/pypy/jit/metainterp/optimizeopt/virtualize.py
@@ -10,13 +10,12 @@
 
 
 class AbstractVirtualValue(optimizer.OptValue):
-    _attrs_ = ('optimizer', 'keybox', 'source_op', '_cached_vinfo')
+    _attrs_ = ('keybox', 'source_op', '_cached_vinfo')
     box = None
     level = optimizer.LEVEL_NONNULL
     _cached_vinfo = None
 
-    def __init__(self, optimizer, keybox, source_op=None):
-        self.optimizer = optimizer
+    def __init__(self, keybox, source_op=None):
         self.keybox = keybox   # only used as a key in dictionaries
         self.source_op = source_op  # the NEW_WITH_VTABLE/NEW_ARRAY operation
                                     # that builds this box
@@ -29,17 +28,17 @@
             return self.keybox
         return self.box
 
-    def force_box(self):
+    def force_box(self, optforce):
         if self.box is None:
-            self.optimizer.forget_numberings(self.keybox)
-            self._really_force()
+            optforce.forget_numberings(self.keybox)
+            self._really_force(optforce)
         return self.box
 
-    def force_at_end_of_preamble(self, already_forced):
+    def force_at_end_of_preamble(self, already_forced, optforce):
         value = already_forced.get(self, None)
         if value:
             return value
-        return OptValue(self.force_box())
+        return OptValue(self.force_box(optforce))
 
     def make_virtual_info(self, modifier, fieldnums):
         if fieldnums is None:
@@ -55,7 +54,7 @@
     def _make_virtual(self, modifier):
         raise NotImplementedError("abstract base")
 
-    def _really_force(self):
+    def _really_force(self, optforce):
         raise NotImplementedError("abstract base")
 
     def import_from(self, other, optimizer):
@@ -70,10 +69,11 @@
 get_fielddescrlist_cache._annspecialcase_ = "specialize:memo"
 
 class AbstractVirtualStructValue(AbstractVirtualValue):
-    _attrs_ = ('_fields', '_cached_sorted_fields')
+    _attrs_ = ('_fields', 'cpu', '_cached_sorted_fields')
 
-    def __init__(self, optimizer, keybox, source_op=None):
-        AbstractVirtualValue.__init__(self, optimizer, keybox, source_op)
+    def __init__(self, cpu, keybox, source_op=None):
+        AbstractVirtualValue.__init__(self, keybox, source_op)
+        self.cpu = cpu
         self._fields = {}
         self._cached_sorted_fields = None
 
@@ -87,45 +87,44 @@
     def _get_descr(self):
         raise NotImplementedError
 
-    def _is_immutable_and_filled_with_constants(self):
+    def _is_immutable_and_filled_with_constants(self, optforce):
         count = self._get_descr().count_fields_if_immutable()
         if count != len(self._fields):    # always the case if count == -1
             return False
         for value in self._fields.itervalues():
-            subbox = value.force_box()
+            subbox = value.force_box(optforce)
             if not isinstance(subbox, Const):
                 return False
         return True
 
-    def force_at_end_of_preamble(self, already_forced):
+    def force_at_end_of_preamble(self, already_forced, optforce):
         if self in already_forced:
             return self
         already_forced[self] = self
         if self._fields:
             for ofs in self._fields.keys():
-                self._fields[ofs] = self._fields[ofs].force_at_end_of_preamble(already_forced)
+                self._fields[ofs] = self._fields[ofs].force_at_end_of_preamble(already_forced, optforce)
         return self
 
-    def _really_force(self):
+    def _really_force(self, optforce):
         op = self.source_op
         assert op is not None
         # ^^^ This case should not occur any more (see test_bug_3).
         #
         if not we_are_translated():
             op.name = 'FORCE ' + self.source_op.name
-
-        if self._is_immutable_and_filled_with_constants():
-            box = self.optimizer.constant_fold(op)
+            
+        if self._is_immutable_and_filled_with_constants(optforce):
+            box = optforce.optimizer.constant_fold(op)
             self.make_constant(box)
             for ofs, value in self._fields.iteritems():
-                subbox = value.force_box()
+                subbox = value.force_box(optforce)
                 assert isinstance(subbox, Const)
-                execute(self.optimizer.cpu, None, rop.SETFIELD_GC,
+                execute(optforce.optimizer.cpu, None, rop.SETFIELD_GC,
                         ofs, box, subbox)
             # keep self._fields, because it's all immutable anyway
         else:
-            newoperations = self.optimizer.newoperations
-            newoperations.append(op)
+            optforce.emit_operation(op)
             self.box = box = op.result
             #
             iteritems = self._fields.iteritems()
@@ -135,10 +134,11 @@
             for ofs, value in iteritems:
                 if value.is_null():
                     continue
-                subbox = value.force_box()
+                subbox = value.force_box(optforce)
                 op = ResOperation(rop.SETFIELD_GC, [box, subbox], None,
                                   descr=ofs)
-                newoperations.append(op)
+
+                optforce.emit_operation(op)
 
     def _get_field_descr_list(self):
         _cached_sorted_fields = self._cached_sorted_fields
@@ -155,7 +155,7 @@
             else:
                 lst = self._fields.keys()
             sort_descrs(lst)
-            cache = get_fielddescrlist_cache(self.optimizer.cpu)
+            cache = get_fielddescrlist_cache(self.cpu)
             result = cache.get(lst, None)
             if result is None:
                 cache[lst] = lst
@@ -180,8 +180,8 @@
 class VirtualValue(AbstractVirtualStructValue):
     level = optimizer.LEVEL_KNOWNCLASS
 
-    def __init__(self, optimizer, known_class, keybox, source_op=None):
-        AbstractVirtualStructValue.__init__(self, optimizer, keybox, source_op)
+    def __init__(self, cpu, known_class, keybox, source_op=None):
+        AbstractVirtualStructValue.__init__(self, cpu, keybox, source_op)
         assert isinstance(known_class, Const)
         self.known_class = known_class
 
@@ -190,7 +190,7 @@
         return modifier.make_virtual(self.known_class, fielddescrs)
 
     def _get_descr(self):
-        return vtable2descr(self.optimizer.cpu, self.known_class.getint())
+        return vtable2descr(self.cpu, self.known_class.getint())
 
     def __repr__(self):
         cls_name = self.known_class.value.adr.ptr._obj._TYPE._name
@@ -201,8 +201,8 @@
 
 class VStructValue(AbstractVirtualStructValue):
 
-    def __init__(self, optimizer, structdescr, keybox, source_op=None):
-        AbstractVirtualStructValue.__init__(self, optimizer, keybox, source_op)
+    def __init__(self, cpu, structdescr, keybox, source_op=None):
+        AbstractVirtualStructValue.__init__(self, cpu, keybox, source_op)
         self.structdescr = structdescr
 
     def _make_virtual(self, modifier):
@@ -215,10 +215,10 @@
 
 class VArrayValue(AbstractVirtualValue):
 
-    def __init__(self, optimizer, arraydescr, size, keybox, source_op=None):
-        AbstractVirtualValue.__init__(self, optimizer, keybox, source_op)
+    def __init__(self, arraydescr, constvalue, size, keybox, source_op=None):
+        AbstractVirtualValue.__init__(self, keybox, source_op)
         self.arraydescr = arraydescr
-        self.constvalue = optimizer.new_const_item(arraydescr)
+        self.constvalue = constvalue
         self._items = [self.constvalue] * size
 
     def getlength(self):
@@ -232,31 +232,30 @@
         assert isinstance(itemvalue, optimizer.OptValue)
         self._items[index] = itemvalue
 
-    def force_at_end_of_preamble(self, already_forced):
+    def force_at_end_of_preamble(self, already_forced, optforce):
         if self in already_forced:
             return self
         already_forced[self] = self
         for index in range(len(self._items)):
-            self._items[index] = self._items[index].force_at_end_of_preamble(already_forced)
+            self._items[index] = self._items[index].force_at_end_of_preamble(already_forced, optforce)
         return self
     
-    def _really_force(self):
+    def _really_force(self, optforce):
         assert self.source_op is not None
         if not we_are_translated():
             self.source_op.name = 'FORCE ' + self.source_op.name
-        newoperations = self.optimizer.newoperations
-        newoperations.append(self.source_op)
+        optforce.emit_operation(self.source_op)        
         self.box = box = self.source_op.result
         for index in range(len(self._items)):
             subvalue = self._items[index]
             if subvalue is not self.constvalue:
                 if subvalue.is_null():
                     continue
-                subbox = subvalue.force_box()
+                subbox = subvalue.force_box(optforce)
                 op = ResOperation(rop.SETARRAYITEM_GC,
                                   [box, ConstInt(index), subbox], None,
                                   descr=self.arraydescr)
-                newoperations.append(op)
+                optforce.emit_operation(op)
 
     def get_args_for_fail(self, modifier):
         if self.box is None and not modifier.already_seen_virtual(self.keybox):
@@ -279,17 +278,18 @@
         return OptVirtualize()
         
     def make_virtual(self, known_class, box, source_op=None):
-        vvalue = VirtualValue(self.optimizer, known_class, box, source_op)
+        vvalue = VirtualValue(self.optimizer.cpu, known_class, box, source_op)
         self.make_equal_to(box, vvalue)
         return vvalue
 
     def make_varray(self, arraydescr, size, box, source_op=None):
-        vvalue = VArrayValue(self.optimizer, arraydescr, size, box, source_op)
+        constvalue = self.new_const_item(arraydescr)
+        vvalue = VArrayValue(arraydescr, constvalue, size, box, source_op)
         self.make_equal_to(box, vvalue)
         return vvalue
 
     def make_vstruct(self, structdescr, box, source_op=None):
-        vvalue = VStructValue(self.optimizer, structdescr, box, source_op)
+        vvalue = VStructValue(self.optimizer.cpu, structdescr, box, source_op)
         self.make_equal_to(box, vvalue)
         return vvalue
 
@@ -362,7 +362,6 @@
             self.make_equal_to(op.result, fieldvalue)
         else:
             value.ensure_nonnull()
-            ###self.heap_op_optimizer.optimize_GETFIELD_GC(op, value)
             self.emit_operation(op)
 
     # note: the following line does not mean that the two operations are
@@ -377,7 +376,6 @@
             value.setfield(op.getdescr(), fieldvalue)
         else:
             value.ensure_nonnull()
-            ###self.heap_op_optimizer.optimize_SETFIELD_GC(op, value, fieldvalue)
             self.emit_operation(op)
 
     def optimize_NEW_WITH_VTABLE(self, op):
@@ -417,7 +415,6 @@
                 self.make_equal_to(op.result, itemvalue)
                 return
         value.ensure_nonnull()
-        ###self.heap_op_optimizer.optimize_GETARRAYITEM_GC(op, value)
         self.emit_operation(op)
 
     # note: the following line does not mean that the two operations are
@@ -432,7 +429,6 @@
                 value.setitem(indexbox.getint(), self.getvalue(op.getarg(2)))
                 return
         value.ensure_nonnull()
-        ###self.heap_op_optimizer.optimize_SETARRAYITEM_GC(op, value, fieldvalue)
         self.emit_operation(op)
 
 
diff --git a/pypy/jit/metainterp/optimizeopt/virtualstate.py b/pypy/jit/metainterp/optimizeopt/virtualstate.py
--- a/pypy/jit/metainterp/optimizeopt/virtualstate.py
+++ b/pypy/jit/metainterp/optimizeopt/virtualstate.py
@@ -30,7 +30,7 @@
     def _generate_guards(self, other, box, cpu, extra_guards):
         raise InvalidLoop
 
-    def enum_forced_boxes(self, boxes, value):
+    def enum_forced_boxes(self, boxes, value, optimizer):
         raise NotImplementedError
 
     def enum(self, virtual_state):
@@ -100,14 +100,14 @@
     def _generalization_of(self, other):
         raise NotImplementedError
 
-    def enum_forced_boxes(self, boxes, value):
+    def enum_forced_boxes(self, boxes, value, optimizer):
         assert isinstance(value, virtualize.AbstractVirtualStructValue)
         assert value.is_virtual()
         for i in range(len(self.fielddescrs)):
             v = value._fields[self.fielddescrs[i]]
             s = self.fieldstate[i]
             if s.position > self.position:
-                s.enum_forced_boxes(boxes, v)
+                s.enum_forced_boxes(boxes, v, optimizer)
 
     def _enum(self, virtual_state):
         for s in self.fieldstate:
@@ -177,14 +177,14 @@
                 return False
         return True
 
-    def enum_forced_boxes(self, boxes, value):
+    def enum_forced_boxes(self, boxes, value, optimizer):
         assert isinstance(value, virtualize.VArrayValue)
         assert value.is_virtual()
         for i in range(len(self.fieldstate)):
             v = value._items[i]
             s = self.fieldstate[i]
             if s.position > self.position:
-                s.enum_forced_boxes(boxes, v)
+                s.enum_forced_boxes(boxes, v, optimizer)
 
     def _enum(self, virtual_state):
         for s in self.fieldstate:
@@ -316,11 +316,11 @@
             import pdb; pdb.set_trace()
             raise NotImplementedError
 
-    def enum_forced_boxes(self, boxes, value):
+    def enum_forced_boxes(self, boxes, value, optimizer):
         if self.level == LEVEL_CONSTANT:
             return
         assert 0 <= self.position_in_notvirtuals 
-        boxes[self.position_in_notvirtuals] = value.force_box()
+        boxes[self.position_in_notvirtuals] = value.force_box(optimizer)
 
     def _enum(self, virtual_state):
         if self.level == LEVEL_CONSTANT:
@@ -377,11 +377,13 @@
             self.state[i].generate_guards(other.state[i], args[i],
                                           cpu, extra_guards, renum)
 
-    def make_inputargs(self, values, keyboxes=False):
+    def make_inputargs(self, values, optimizer, keyboxes=False):
+        if optimizer.optearlyforce:
+            optimizer = optimizer.optearlyforce
         assert len(values) == len(self.state)
         inputargs = [None] * len(self.notvirtuals)
         for i in range(len(values)):
-            self.state[i].enum_forced_boxes(inputargs, values[i])
+            self.state[i].enum_forced_boxes(inputargs, values[i], optimizer)
 
         if keyboxes:
             for i in range(len(values)):
@@ -434,7 +436,12 @@
     def get_virtual_state(self, jump_args):
         self.optimizer.force_at_end_of_preamble()
         already_forced = {}
-        values = [self.getvalue(box).force_at_end_of_preamble(already_forced)
+        if self.optimizer.optearlyforce:
+            opt = self.optimizer.optearlyforce
+        else:
+            opt = self.optimizer
+        values = [self.getvalue(box).force_at_end_of_preamble(already_forced,
+                                                              opt)
                   for box in jump_args]
 
         for value in values:
diff --git a/pypy/jit/metainterp/optimizeopt/vstring.py b/pypy/jit/metainterp/optimizeopt/vstring.py
--- a/pypy/jit/metainterp/optimizeopt/vstring.py
+++ b/pypy/jit/metainterp/optimizeopt/vstring.py
@@ -43,7 +43,7 @@
 class __extend__(optimizer.OptValue):
     """New methods added to the base class OptValue for this file."""
 
-    def getstrlen(self, optimization, mode):
+    def getstrlen(self, string_optimizer, mode):
         if mode is mode_string:
             s = self.get_constant_string_spec(mode_string)
             if s is not None:
@@ -52,12 +52,12 @@
             s = self.get_constant_string_spec(mode_unicode)
             if s is not None:
                 return ConstInt(len(s))
-        if optimization is None:
+        if string_optimizer is None:
             return None
         self.ensure_nonnull()
-        box = self.force_box()
+        box = self.force_box(string_optimizer)
         lengthbox = BoxInt()
-        optimization.propagate_forward(ResOperation(mode.STRLEN, [box], lengthbox))
+        string_optimizer.emit_operation(ResOperation(mode.STRLEN, [box], lengthbox))
         return lengthbox
 
     @specialize.arg(1)
@@ -68,25 +68,25 @@
         else:
             return None
 
-    def string_copy_parts(self, optimizer, targetbox, offsetbox, mode):
+    def string_copy_parts(self, string_optimizer, targetbox, offsetbox, mode):
         # Copies the pointer-to-string 'self' into the target string
         # given by 'targetbox', at the specified offset.  Returns the offset
         # at the end of the copy.
-        lengthbox = self.getstrlen(optimizer, mode)
-        srcbox = self.force_box()
-        return copy_str_content(optimizer, srcbox, targetbox,
+        lengthbox = self.getstrlen(string_optimizer, mode)
+        srcbox = self.force_box(string_optimizer)
+        return copy_str_content(string_optimizer, srcbox, targetbox,
                                 CONST_0, offsetbox, lengthbox, mode)
 
 
 class VAbstractStringValue(virtualize.AbstractVirtualValue):
     _attrs_ = ('mode',)
 
-    def __init__(self, optimizer, keybox, source_op, mode):
-        virtualize.AbstractVirtualValue.__init__(self, optimizer, keybox,
+    def __init__(self, keybox, source_op, mode):
+        virtualize.AbstractVirtualValue.__init__(self, keybox,
                                                  source_op)
         self.mode = mode
 
-    def _really_force(self):
+    def _really_force(self, optforce):
         if self.mode is mode_string:
             s = self.get_constant_string_spec(mode_string)
             if s is not None:
@@ -101,12 +101,12 @@
                 return
         assert self.source_op is not None
         self.box = box = self.source_op.result
-        lengthbox = self.getstrlen(self.optimizer, self.mode)
+        lengthbox = self.getstrlen(optforce, self.mode)
         op = ResOperation(self.mode.NEWSTR, [lengthbox], box)
         if not we_are_translated():
             op.name = 'FORCE'
-        self.optimizer.emit_operation(op)
-        self.string_copy_parts(self.optimizer, box, CONST_0, self.mode)
+        optforce.emit_operation(op)
+        self.string_copy_parts(optforce, box, CONST_0, self.mode)
 
 
 class VStringPlainValue(VAbstractStringValue):
@@ -140,20 +140,20 @@
         return mode.emptystr.join([mode.chr(c.box.getint())
                                    for c in self._chars])
 
-    def string_copy_parts(self, optimizer, targetbox, offsetbox, mode):
+    def string_copy_parts(self, string_optimizer, targetbox, offsetbox, mode):
         if not self.is_virtual() and targetbox is not self.box:
-            lengthbox = self.getstrlen(optimizer, mode)
-            srcbox = self.force_box()
-            return copy_str_content(optimizer, srcbox, targetbox,
+            lengthbox = self.getstrlen(string_optimizer, mode)
+            srcbox = self.force_box(string_optimizer)
+            return copy_str_content(string_optimizer, srcbox, targetbox,
                                 CONST_0, offsetbox, lengthbox, mode)
         for i in range(len(self._chars)):
-            charbox = self._chars[i].force_box()
+            charbox = self._chars[i].force_box(string_optimizer)
             if not (isinstance(charbox, Const) and charbox.same_constant(CONST_0)):
-                optimizer.emit_operation(ResOperation(mode.STRSETITEM, [targetbox,
-                                                                    offsetbox,
-                                                                    charbox],
+                string_optimizer.emit_operation(ResOperation(mode.STRSETITEM, [targetbox,
+                                                                               offsetbox,
+                                                                               charbox],
                                                   None))
-            offsetbox = _int_add(optimizer, offsetbox, CONST_1)
+            offsetbox = _int_add(string_optimizer, offsetbox, CONST_1)
         return offsetbox
 
     def get_args_for_fail(self, modifier):
@@ -187,16 +187,16 @@
         self.left = left
         self.right = right
 
-    def getstrlen(self, optimizer, mode):
+    def getstrlen(self, string_optimizer, mode):
         if self.lengthbox is None:
-            len1box = self.left.getstrlen(optimizer, mode)
+            len1box = self.left.getstrlen(string_optimizer, mode)
             if len1box is None:
                 return None
-            len2box = self.right.getstrlen(optimizer, mode)
+            len2box = self.right.getstrlen(string_optimizer, mode)
             if len2box is None:
                 return None
-            self.lengthbox = _int_add(optimizer, len1box, len2box)
-            # ^^^ may still be None, if optimizer is None
+            self.lengthbox = _int_add(string_optimizer, len1box, len2box)
+            # ^^^ may still be None, if string_optimizer is None
         return self.lengthbox
 
     @specialize.arg(1)
@@ -209,10 +209,10 @@
             return None
         return s1 + s2
 
-    def string_copy_parts(self, optimizer, targetbox, offsetbox, mode):
-        offsetbox = self.left.string_copy_parts(optimizer, targetbox,
+    def string_copy_parts(self, string_optimizer, targetbox, offsetbox, mode):
+        offsetbox = self.left.string_copy_parts(string_optimizer, targetbox,
                                                 offsetbox, mode)
-        offsetbox = self.right.string_copy_parts(optimizer, targetbox,
+        offsetbox = self.right.string_copy_parts(string_optimizer, targetbox,
                                                  offsetbox, mode)
         return offsetbox
 
@@ -251,8 +251,8 @@
         self.vstart = vstart
         self.vlength = vlength
 
-    def getstrlen(self, _, mode):
-        return self.vlength.force_box()
+    def getstrlen(self, optforce, mode):
+        return self.vlength.force_box(optforce)
 
     @specialize.arg(1)
     def get_constant_string_spec(self, mode):
@@ -267,11 +267,11 @@
             return s1[start : start + length]
         return None
 
-    def string_copy_parts(self, optimizer, targetbox, offsetbox, mode):
-        lengthbox = self.getstrlen(optimizer, mode)
-        return copy_str_content(optimizer,
-                                self.vstr.force_box(), targetbox,
-                                self.vstart.force_box(), offsetbox,
+    def string_copy_parts(self, string_optimizer, targetbox, offsetbox, mode):
+        lengthbox = self.getstrlen(string_optimizer, mode)
+        return copy_str_content(string_optimizer,
+                                self.vstr.force_box(string_optimizer), targetbox,
+                                self.vstart.force_box(string_optimizer), offsetbox,
                                 lengthbox, mode)
 
     def get_args_for_fail(self, modifier):
@@ -300,7 +300,7 @@
         return modifier.make_vstrslice(self.mode is mode_unicode)
 
 
-def copy_str_content(optimizer, srcbox, targetbox,
+def copy_str_content(string_optimizer, srcbox, targetbox,
                      srcoffsetbox, offsetbox, lengthbox, mode, need_next_offset=True):
     if isinstance(srcbox, ConstPtr) and isinstance(srcoffsetbox, Const):
         M = 5
@@ -310,26 +310,26 @@
         # up to M characters are done "inline", i.e. with STRGETITEM/STRSETITEM
         # instead of just a COPYSTRCONTENT.
         for i in range(lengthbox.value):
-            charbox = _strgetitem(optimizer, srcbox, srcoffsetbox, mode)
-            srcoffsetbox = _int_add(optimizer, srcoffsetbox, CONST_1)
-            optimizer.emit_operation(ResOperation(mode.STRSETITEM, [targetbox,
-                                                                offsetbox,
-                                                                charbox],
+            charbox = _strgetitem(string_optimizer, srcbox, srcoffsetbox, mode)
+            srcoffsetbox = _int_add(string_optimizer, srcoffsetbox, CONST_1)
+            string_optimizer.emit_operation(ResOperation(mode.STRSETITEM, [targetbox,
+                                                                           offsetbox,
+                                                                           charbox],
                                               None))
-            offsetbox = _int_add(optimizer, offsetbox, CONST_1)
+            offsetbox = _int_add(string_optimizer, offsetbox, CONST_1)
     else:
         if need_next_offset:
-            nextoffsetbox = _int_add(optimizer, offsetbox, lengthbox)
+            nextoffsetbox = _int_add(string_optimizer, offsetbox, lengthbox)
         else:
             nextoffsetbox = None
         op = ResOperation(mode.COPYSTRCONTENT, [srcbox, targetbox,
                                                 srcoffsetbox, offsetbox,
                                                 lengthbox], None)
-        optimizer.emit_operation(op)
+        string_optimizer.emit_operation(op)
         offsetbox = nextoffsetbox
     return offsetbox
 
-def _int_add(optimizer, box1, box2):
+def _int_add(string_optimizer, box1, box2):
     if isinstance(box1, ConstInt):
         if box1.value == 0:
             return box2
@@ -337,23 +337,23 @@
             return ConstInt(box1.value + box2.value)
     elif isinstance(box2, ConstInt) and box2.value == 0:
         return box1
-    if optimizer is None:
+    if string_optimizer is None:
         return None
     resbox = BoxInt()
-    optimizer.propagate_forward(ResOperation(rop.INT_ADD, [box1, box2], resbox))
+    string_optimizer.emit_operation(ResOperation(rop.INT_ADD, [box1, box2], resbox))
     return resbox
 
-def _int_sub(optimizer, box1, box2):
+def _int_sub(string_optimizer, box1, box2):
     if isinstance(box2, ConstInt):
         if box2.value == 0:
             return box1
         if isinstance(box1, ConstInt):
             return ConstInt(box1.value - box2.value)
     resbox = BoxInt()
-    optimizer.propagate_forward(ResOperation(rop.INT_SUB, [box1, box2], resbox))
+    string_optimizer.emit_operation(ResOperation(rop.INT_SUB, [box1, box2], resbox))
     return resbox
 
-def _strgetitem(optimizer, strbox, indexbox, mode):
+def _strgetitem(string_optimizer, strbox, indexbox, mode):
     if isinstance(strbox, ConstPtr) and isinstance(indexbox, ConstInt):
         if mode is mode_string:
             s = strbox.getref(lltype.Ptr(rstr.STR))
@@ -362,30 +362,28 @@
             s = strbox.getref(lltype.Ptr(rstr.UNICODE))
             return ConstInt(ord(s.chars[indexbox.getint()]))
     resbox = BoxInt()
-    optimizer.propagate_forward(ResOperation(mode.STRGETITEM, [strbox, indexbox],
-                                      resbox))
+    string_optimizer.emit_operation(ResOperation(mode.STRGETITEM, [strbox, indexbox],
+                                                 resbox))
     return resbox
 
 
 class OptString(optimizer.Optimization):
     "Handling of strings and unicodes."
-    enabled = True
-
     def new(self):
         return OptString()
 
     def make_vstring_plain(self, box, source_op, mode):
-        vvalue = VStringPlainValue(self.optimizer, box, source_op, mode)
+        vvalue = VStringPlainValue(box, source_op, mode)
         self.make_equal_to(box, vvalue)
         return vvalue
 
     def make_vstring_concat(self, box, source_op, mode):
-        vvalue = VStringConcatValue(self.optimizer, box, source_op, mode)
+        vvalue = VStringConcatValue(box, source_op, mode)
         self.make_equal_to(box, vvalue)
         return vvalue
 
     def make_vstring_slice(self, box, source_op, mode):
-        vvalue = VStringSliceValue(self.optimizer, box, source_op, mode)
+        vvalue = VStringSliceValue(box, source_op, mode)
         self.make_equal_to(box, vvalue)
         return vvalue
 
@@ -435,9 +433,9 @@
         value.ensure_nonnull()
         #
         if value.is_virtual() and isinstance(value, VStringSliceValue):
-            fullindexbox = _int_add(self.optimizer,
-                                    value.vstart.force_box(),
-                                    vindex.force_box())
+            fullindexbox = _int_add(self,
+                                    value.vstart.force_box(self),
+                                    vindex.force_box(self))
             value = value.vstr
             vindex = self.getvalue(fullindexbox)
         #
@@ -449,7 +447,7 @@
                 if res is not optimizer.CVAL_UNINITIALIZED_ZERO:
                     return res
         #
-        resbox = _strgetitem(self.optimizer, value.force_box(), vindex.force_box(), mode)
+        resbox = _strgetitem(self, value.force_box(self), vindex.force_box(self), mode)
         return self.getvalue(resbox)
 
     def optimize_STRLEN(self, op):
@@ -459,7 +457,7 @@
 
     def _optimize_STRLEN(self, op, mode):
         value = self.getvalue(op.getarg(0))
-        lengthbox = value.getstrlen(self.optimizer, mode)
+        lengthbox = value.getstrlen(self, mode)
         self.make_equal_to(op.result, self.getvalue(lengthbox))
 
     def optimize_COPYSTRCONTENT(self, op):
@@ -477,12 +475,12 @@
 
         if length.is_constant() and length.box.getint() == 0:
             return
-        copy_str_content(self.optimizer,
-            src.force_box(),
-            dst.force_box(),
-            srcstart.force_box(),
-            dststart.force_box(),
-            length.force_box(),
+        copy_str_content(self,
+            src.force_box(self),
+            dst.force_box(self),
+            srcstart.force_box(self),
+            dststart.force_box(self),
+            length.force_box(self),
             mode, need_next_offset=False
         )
 
@@ -508,6 +506,8 @@
                     return
         self.emit_operation(op)
 
+    optimize_CALL_PURE = optimize_CALL
+
     def opt_call_str_STR2UNICODE(self, op):
         # Constant-fold unicode("constant string").
         # More generally, supporting non-constant but virtual cases is
@@ -547,16 +547,16 @@
             return True
         #
         vstr.ensure_nonnull()
-        lengthbox = _int_sub(self.optimizer, vstop.force_box(),
-                                            vstart.force_box())
+        lengthbox = _int_sub(self, vstop.force_box(self),
+                                   vstart.force_box(self))
         #
         if isinstance(vstr, VStringSliceValue):
             # double slicing  s[i:j][k:l]
             vintermediate = vstr
             vstr = vintermediate.vstr
-            startbox = _int_add(self.optimizer,
-                                vintermediate.vstart.force_box(),
-                                vstart.force_box())
+            startbox = _int_add(self,
+                                vintermediate.vstart.force_box(self),
+                                vstart.force_box(self))
             vstart = self.getvalue(startbox)
         #
         value = self.make_vstring_slice(op.result, op, mode)
@@ -594,8 +594,8 @@
                 do = EffectInfo.OS_STREQ_LENGTHOK
             else:
                 do = EffectInfo.OS_STREQ_NONNULL
-            self.generate_modified_call(do, [v1.force_box(),
-                                             v2.force_box()], op.result, mode)
+            self.generate_modified_call(do, [v1.force_box(self),
+                                             v2.force_box(self)], op.result, mode)
             return True
         return False
 
@@ -603,7 +603,7 @@
         l2box = v2.getstrlen(None, mode)
         if isinstance(l2box, ConstInt):
             if l2box.value == 0:
-                lengthbox = v1.getstrlen(self.optimizer, mode)
+                lengthbox = v1.getstrlen(self, mode)
                 seo = self.optimizer.send_extra_operation
                 seo(ResOperation(rop.INT_EQ, [lengthbox, CONST_0], resultbox))
                 return True
@@ -614,17 +614,17 @@
                     vchar1 = self.strgetitem(v1, optimizer.CVAL_ZERO, mode)
                     vchar2 = self.strgetitem(v2, optimizer.CVAL_ZERO, mode)
                     seo = self.optimizer.send_extra_operation
-                    seo(ResOperation(rop.INT_EQ, [vchar1.force_box(),
-                                                  vchar2.force_box()],
+                    seo(ResOperation(rop.INT_EQ, [vchar1.force_box(self),
+                                                  vchar2.force_box(self)],
                                      resultbox))
                     return True
                 if isinstance(v1, VStringSliceValue):
                     vchar = self.strgetitem(v2, optimizer.CVAL_ZERO, mode)
                     do = EffectInfo.OS_STREQ_SLICE_CHAR
-                    self.generate_modified_call(do, [v1.vstr.force_box(),
-                                                     v1.vstart.force_box(),
-                                                     v1.vlength.force_box(),
-                                                     vchar.force_box()],
+                    self.generate_modified_call(do, [v1.vstr.force_box(self),
+                                                     v1.vstart.force_box(self),
+                                                     v1.vlength.force_box(self),
+                                                     vchar.force_box(self)],
                                                 resultbox, mode)
                     return True
         #
@@ -635,10 +635,10 @@
             if v1.is_null():
                 self.make_constant(resultbox, CONST_1)
                 return True
-            op = ResOperation(rop.PTR_EQ, [v1.force_box(),
+            op = ResOperation(rop.PTR_EQ, [v1.force_box(self),
                                            llhelper.CONST_NULL],
                               resultbox)
-            self.optimizer.emit_operation(op)
+            self.emit_operation(op)
             return True
         #
         return False
@@ -652,8 +652,8 @@
                     do = EffectInfo.OS_STREQ_NONNULL_CHAR
                 else:
                     do = EffectInfo.OS_STREQ_CHECKNULL_CHAR
-                self.generate_modified_call(do, [v1.force_box(),
-                                                 vchar.force_box()], resultbox,
+                self.generate_modified_call(do, [v1.force_box(self),
+                                                 vchar.force_box(self)], resultbox,
                                             mode)
                 return True
         #
@@ -662,10 +662,10 @@
                 do = EffectInfo.OS_STREQ_SLICE_NONNULL
             else:
                 do = EffectInfo.OS_STREQ_SLICE_CHECKNULL
-            self.generate_modified_call(do, [v1.vstr.force_box(),
-                                             v1.vstart.force_box(),
-                                             v1.vlength.force_box(),
-                                             v2.force_box()], resultbox, mode)
+            self.generate_modified_call(do, [v1.vstr.force_box(self),
+                                             v1.vstart.force_box(self),
+                                             v1.vlength.force_box(self),
+                                             v2.force_box(self)], resultbox, mode)
             return True
         return False
 
@@ -675,16 +675,11 @@
         calldescr, func = cic.callinfo_for_oopspec(oopspecindex)
         op = ResOperation(rop.CALL, [ConstInt(func)] + args, result,
                           descr=calldescr)
-        self.optimizer.emit_operation(op)
+        self.emit_operation(op)
 
     def propagate_forward(self, op):
-        if not self.enabled:
-            self.emit_operation(op)
-            return
-
         dispatch_opt(self, op)
 
-
 dispatch_opt = make_dispatcher_method(OptString, 'optimize_',
         default=OptString.emit_operation)
 
diff --git a/pypy/jit/metainterp/resoperation.py b/pypy/jit/metainterp/resoperation.py
--- a/pypy/jit/metainterp/resoperation.py
+++ b/pypy/jit/metainterp/resoperation.py
@@ -221,7 +221,6 @@
         newop.setfailargs(self.getfailargs())
         return newop
 
-
 # ============
 # arity mixins
 # ============
diff --git a/pypy/jit/metainterp/test/test_ajit.py b/pypy/jit/metainterp/test/test_ajit.py
--- a/pypy/jit/metainterp/test/test_ajit.py
+++ b/pypy/jit/metainterp/test/test_ajit.py
@@ -2956,6 +2956,18 @@
         assert res == f(32)
         self.check_loops(arraylen_gc=2)
 
+    def test_ulonglong_mod(self):
+        myjitdriver = JitDriver(greens = [], reds = ['n', 'sa', 'i'])
+        def f(n):
+            sa = i = rffi.cast(rffi.ULONGLONG, 1)
+            while i < rffi.cast(rffi.ULONGLONG, n):
+                myjitdriver.jit_merge_point(sa=sa, n=n, i=i)
+                sa += sa % i
+                i += 1
+        res = self.meta_interp(f, [32])
+        assert res == f(32)
+
+
 class TestOOtype(BasicTests, OOJitMixin):
 
     def test_oohash(self):
diff --git a/pypy/jit/metainterp/test/test_longlong.py b/pypy/jit/metainterp/test/test_longlong.py
--- a/pypy/jit/metainterp/test/test_longlong.py
+++ b/pypy/jit/metainterp/test/test_longlong.py
@@ -118,6 +118,26 @@
         res = self.interp_operations(f, [1000000000])
         assert res == 123500000000.0
 
+    def test_floats_negative(self):
+        def f(i):
+            # i == 1000000000
+            f = i * -123.5
+            n = r_longlong(f)
+            compare(n, -29, 1054051584)
+            return float(n)
+        res = self.interp_operations(f, [1000000000])
+        assert res == -123500000000.0
+
+    def test_floats_ulonglong(self):
+        def f(i):
+            # i == 1000000000
+            f = i * 12350000000.0
+            n = r_ulonglong(f)
+            compare(n, -1419508847, 538116096)
+            return float(n)
+        res = self.interp_operations(f, [1000000000])
+        assert res == 12350000000000000000.0
+
     def test_unsigned_compare_ops(self):
         def f(n1, n2):
             # n == 30002000000000
diff --git a/pypy/jit/metainterp/test/test_resume.py b/pypy/jit/metainterp/test/test_resume.py
--- a/pypy/jit/metainterp/test/test_resume.py
+++ b/pypy/jit/metainterp/test/test_resume.py
@@ -576,8 +576,9 @@
 
 
 class FakeOptimizer_VirtualValue(object):
-    class cpu:
-        pass
+    class optimizer:
+        class cpu:
+            pass
 fakeoptimizer = FakeOptimizer_VirtualValue()
 
 def ConstAddr(addr, cpu):   # compatibility
@@ -1135,12 +1136,7 @@
     modifier.liveboxes = {}
     modifier.vfieldboxes = {}
 
-    class FakeOptimizer(object):
-        class cpu:
-            pass
-        def new_const_item(self, descr):
-            return None
-    v2 = VArrayValue(FakeOptimizer(), LLtypeMixin.arraydescr, 2, b2s)
+    v2 = VArrayValue(LLtypeMixin.arraydescr, None, 2, b2s)
     v2._items = [b4s, c1s]
     modifier.register_virtual_fields(b2s, [b4s, c1s])
     liveboxes = []
diff --git a/pypy/jit/metainterp/test/test_string.py b/pypy/jit/metainterp/test/test_string.py
--- a/pypy/jit/metainterp/test/test_string.py
+++ b/pypy/jit/metainterp/test/test_string.py
@@ -484,6 +484,30 @@
             return len(sa.val)
         assert self.meta_interp(f, ['a']) == f('a')
 
+    def test_string_comepare_quasiimmutable(self):
+        class Sys(object):
+            _immutable_fields_ = ["defaultencoding?"]
+            def __init__(self, s):
+                self.defaultencoding = s
+        _str = self._str
+        sys = Sys(_str('ascii'))        
+        mydriver = JitDriver(reds = ['n', 'sa'], greens = [])
+        def f(n):
+            sa = 0
+            sys.defaultencoding = _str('ascii')
+            while n:
+                mydriver.jit_merge_point(n=n, sa=sa)
+                if sys.defaultencoding == _str('ascii'):
+                    sa += 1
+                n -= 1
+            sys.defaultencoding = _str('utf-8')
+            return sa
+        assert self.meta_interp(f, [8]) == f(8)
+        self.check_loops({'int_add': 1, 'guard_true': 1, 'int_sub': 1,
+                          'jump': 1, 'int_is_true': 1,
+                          'guard_not_invalidated': 1})
+
+
 #class TestOOtype(StringTests, OOJitMixin):
 #    CALL = "oosend"
 #    CALL_PURE = "oosend_pure"
diff --git a/pypy/jit/metainterp/test/test_virtualstate.py b/pypy/jit/metainterp/test/test_virtualstate.py
--- a/pypy/jit/metainterp/test/test_virtualstate.py
+++ b/pypy/jit/metainterp/test/test_virtualstate.py
@@ -431,7 +431,7 @@
         
 
 class BaseTestBridges(BaseTest):
-    enable_opts = "intbounds:rewrite:virtualize:string:heap:unroll"
+    enable_opts = "intbounds:rewrite:virtualize:string:pure:heap:unroll"
 
     def _do_optimize_bridge(self, bridge, call_pure_results):
         from pypy.jit.metainterp.optimizeopt import optimize_bridge_1, build_opt_chain
diff --git a/pypy/module/_numpy/__init__.py b/pypy/module/_numpy/__init__.py
--- a/pypy/module/_numpy/__init__.py
+++ b/pypy/module/_numpy/__init__.py
@@ -23,6 +23,7 @@
         ("arcsin", "arcsin"),
         ("arctan", "arctan"),
         ("arcsinh", "arcsinh"),
+        ("arctanh", "arctanh"),
         ("copysign", "copysign"),
         ("cos", "cos"),
         ("divide", "divide"),
diff --git a/pypy/module/_numpy/app_numpy.py b/pypy/module/_numpy/app_numpy.py
deleted file mode 100644
diff --git a/pypy/module/_numpy/interp_dtype.py b/pypy/module/_numpy/interp_dtype.py
--- a/pypy/module/_numpy/interp_dtype.py
+++ b/pypy/module/_numpy/interp_dtype.py
@@ -7,13 +7,14 @@
 from pypy.interpreter.typedef import TypeDef, interp_attrproperty, GetSetProperty
 from pypy.module._numpy import signature
 from pypy.objspace.std.floatobject import float2string
-from pypy.rlib import rfloat
-from pypy.rlib.rarithmetic import widen
+from pypy.rlib import rarithmetic, rfloat
+from pypy.rlib.rarithmetic import LONG_BIT, widen
 from pypy.rlib.objectmodel import specialize, enforceargs
 from pypy.rlib.unroll import unrolling_iterable
 from pypy.rpython.lltypesystem import lltype, rffi
 
 
+UNSIGNEDLTR = "u"
 SIGNEDLTR = "i"
 BOOLLTR = "b"
 FLOATINGLTR = "f"
@@ -61,7 +62,10 @@
             self.val = val
 
         def wrap(self, space):
-            return space.wrap(self.val)
+            val = self.val
+            if valtype is rarithmetic.r_singlefloat:
+                val = float(val)
+            return space.wrap(val)
 
         def convert_to(self, dtype):
             return dtype.adapt_val(self.val)
@@ -145,7 +149,7 @@
         return self.adapt_val(func(self, self.for_computation(self.unbox(v))))
     return impl
 
-class ArithmaticTypeMixin(object):
+class ArithmeticTypeMixin(object):
     _mixin_ = True
 
     @binop
@@ -200,11 +204,17 @@
         return v1 >= v2
 
 
-class FloatArithmeticDtype(ArithmaticTypeMixin):
+class FloatArithmeticDtype(ArithmeticTypeMixin):
     _mixin_ = True
 
+    def unwrap(self, space, w_item):
+        return self.adapt_val(space.float_w(space.float(w_item)))
+
     def for_computation(self, v):
-        return v
+        return float(v)
+
+    def str_format(self, item):
+        return float2string(self.for_computation(self.unbox(item)), 'g', rfloat.DTSF_STR_PRECISION)
 
     @binop
     def mod(self, v1, v2):
@@ -250,12 +260,12 @@
         return math.tan(v)
     @unaryop
     def arcsin(self, v):
-        if v < -1.0 or v > 1.0:
+        if not -1.0 <= v <= 1.0:
             return rfloat.NAN
         return math.asin(v)
     @unaryop
     def arccos(self, v):
-        if v < -1.0 or v > 1.0:
+        if not -1.0 <= v <= 1.0:
             return rfloat.NAN
         return math.acos(v)
     @unaryop
@@ -266,7 +276,15 @@
     def arcsinh(self, v):
         return math.asinh(v)
 
-class IntegerArithmeticDtype(ArithmaticTypeMixin):
+    @unaryop
+    def arctanh(self, v):
+        if v == 1.0 or v == -1.0:
+            return math.copysign(rfloat.INFINITY, v)
+        if not -1.0 < v < 1.0:
+            return rfloat.NAN
+        return math.atanh(v)
+
+class IntegerArithmeticDtype(ArithmeticTypeMixin):
     _mixin_ = True
 
     def unwrap(self, space, w_item):
@@ -275,10 +293,16 @@
     def for_computation(self, v):
         return widen(v)
 
+    def str_format(self, item):
+        return str(widen(self.unbox(item)))
+
     @binop
     def mod(self, v1, v2):
         return v1 % v2
 
+class SignedIntegerArithmeticDtype(IntegerArithmeticDtype):
+    _mixin_ = True
+
     @unaryop
     def sign(self, v):
         if v > 0:
@@ -289,17 +313,21 @@
             assert v == 0
             return 0
 
-    def str_format(self, item):
-        return str(widen(self.unbox(item)))
+class UnsignedIntegerArithmeticDtype(IntegerArithmeticDtype):
+    _mixin_ = True
+
+    @unaryop
+    def sign(self, v):
+        return int(v != 0)
 
 W_BoolDtype = create_low_level_dtype(
     num = 0, kind = BOOLLTR, name = "bool",
-    aliases = ["?"],
+    aliases = ["?", "bool", "bool8"],
     applevel_types = ["bool"],
     T = lltype.Bool,
     valtype = bool,
 )
-class W_BoolDtype(IntegerArithmeticDtype, W_BoolDtype):
+class W_BoolDtype(SignedIntegerArithmeticDtype, W_BoolDtype):
     def unwrap(self, space, w_item):
         return self.adapt_val(space.is_true(w_item))
 
@@ -312,69 +340,140 @@
 
 W_Int8Dtype = create_low_level_dtype(
     num = 1, kind = SIGNEDLTR, name = "int8",
-    aliases = ["int8"],
+    aliases = ["b", "int8", "i1"],
     applevel_types = [],
     T = rffi.SIGNEDCHAR,
     valtype = rffi.SIGNEDCHAR._type,
     expected_size = 1,
 )
-class W_Int8Dtype(IntegerArithmeticDtype, W_Int8Dtype):
+class W_Int8Dtype(SignedIntegerArithmeticDtype, W_Int8Dtype):
+    pass
+
+W_UInt8Dtype = create_low_level_dtype(
+    num = 2, kind = UNSIGNEDLTR, name = "uint8",
+    aliases = ["B", "uint8", "I1"],
+    applevel_types = [],
+    T = rffi.UCHAR,
+    valtype = rffi.UCHAR._type,
+    expected_size = 1,
+)
+class W_UInt8Dtype(UnsignedIntegerArithmeticDtype, W_UInt8Dtype):
     pass
 
 W_Int16Dtype = create_low_level_dtype(
     num = 3, kind = SIGNEDLTR, name = "int16",
-    aliases = ["int16"],
+    aliases = ["h", "int16", "i2"],
     applevel_types = [],
     T = rffi.SHORT,
     valtype = rffi.SHORT._type,
     expected_size = 2,
 )
-class W_Int16Dtype(IntegerArithmeticDtype, W_Int16Dtype):
+class W_Int16Dtype(SignedIntegerArithmeticDtype, W_Int16Dtype):
+    pass
+
+W_UInt16Dtype = create_low_level_dtype(
+    num = 4, kind = UNSIGNEDLTR, name = "uint16",
+    aliases = ["H", "uint16", "I2"],
+    applevel_types = [],
+    T = rffi.USHORT,
+    valtype = rffi.USHORT._type,
+    expected_size = 2,
+)
+class W_UInt16Dtype(UnsignedIntegerArithmeticDtype, W_UInt16Dtype):
     pass
 
 W_Int32Dtype = create_low_level_dtype(
     num = 5, kind = SIGNEDLTR, name = "int32",
-    aliases = ["i"],
+    aliases = ["i", "int32", "i4"],
     applevel_types = [],
     T = rffi.INT,
     valtype = rffi.INT._type,
     expected_size = 4,
 )
-class W_Int32Dtype(IntegerArithmeticDtype, W_Int32Dtype):
+class W_Int32Dtype(SignedIntegerArithmeticDtype, W_Int32Dtype):
+    pass
+
+W_UInt32Dtype = create_low_level_dtype(
+    num = 6, kind = UNSIGNEDLTR, name = "uint32",
+    aliases = ["I", "uint32", "I4"],
+    applevel_types = [],
+    T = rffi.UINT,
+    valtype = rffi.UINT._type,
+    expected_size = 4,
+)
+class W_UInt32Dtype(UnsignedIntegerArithmeticDtype, W_UInt32Dtype):
     pass
 
 W_Int64Dtype = create_low_level_dtype(
     num = 9, kind = SIGNEDLTR, name = "int64",
-    aliases = [],
+    aliases = ["q", "int64", "i8"],
     applevel_types = ["long"],
     T = rffi.LONGLONG,
     valtype = rffi.LONGLONG._type,
     expected_size = 8,
 )
-class W_Int64Dtype(IntegerArithmeticDtype, W_Int64Dtype):
+class W_Int64Dtype(SignedIntegerArithmeticDtype, W_Int64Dtype):
+    pass
+
+W_UInt64Dtype = create_low_level_dtype(
+    num = 10, kind = UNSIGNEDLTR, name = "uint64",
+    aliases = ["Q", "uint64", "I8"],
+    applevel_types = [],
+    T = rffi.ULONGLONG,
+    valtype = rffi.ULONGLONG._type,
+    expected_size = 8,
+)
+class W_UInt64Dtype(UnsignedIntegerArithmeticDtype, W_UInt64Dtype):
+    pass
+
+if LONG_BIT == 32:
+    class W_LongDtype(W_Int32Dtype):
+        pass
+
+    class W_ULongDtype(W_UInt32Dtype):
+        pass
+else:
+    class W_LongDtype(W_Int64Dtype):
+        pass
+
+    class W_ULongDtype(W_UInt64Dtype):
+        pass
+
+W_LongDtype.num = 7
+W_LongDtype.aliases = ["l"]
+W_LongDtype.applevel_types = ["int"]
+W_ULongDtype.num = 8
+W_ULongDtype.aliases = ["L"]
+
+W_Float32Dtype = create_low_level_dtype(
+    num = 11, kind = FLOATINGLTR, name = "float32",
+    aliases = ["f", "float32", "f4"],
+    applevel_types = [],
+    T = lltype.SingleFloat,
+    valtype = rarithmetic.r_singlefloat,
+    expected_size = 4,
+)
+class W_Float32Dtype(FloatArithmeticDtype, W_Float32Dtype):
     pass
 
 W_Float64Dtype = create_low_level_dtype(
     num = 12, kind = FLOATINGLTR, name = "float64",
-    aliases = [],
+    aliases = ["d", "float64", "f8"],
     applevel_types = ["float"],
     T = lltype.Float,
     valtype = float,
     expected_size = 8,
 )
 class W_Float64Dtype(FloatArithmeticDtype, W_Float64Dtype):
-    def unwrap(self, space, w_item):
-        return self.adapt_val(space.float_w(space.float(w_item)))
-
-    def str_format(self, item):
-        return float2string(self.unbox(item), 'g', rfloat.DTSF_STR_PRECISION)
+    pass
 
 ALL_DTYPES = [
     W_BoolDtype,
-    W_Int8Dtype, W_Int16Dtype, W_Int32Dtype, W_Int64Dtype,
-    W_Float64Dtype
+    W_Int8Dtype, W_UInt8Dtype, W_Int16Dtype, W_UInt16Dtype,
+    W_Int32Dtype, W_UInt32Dtype, W_LongDtype, W_ULongDtype,
+    W_Int64Dtype, W_UInt64Dtype,
+    W_Float32Dtype, W_Float64Dtype,
 ]
-
 dtypes_by_alias = unrolling_iterable([
     (alias, dtype)
     for dtype in ALL_DTYPES
@@ -399,7 +498,7 @@
 
     num = interp_attrproperty("num", cls=W_Dtype),
     kind = interp_attrproperty("kind", cls=W_Dtype),
+    itemsize = interp_attrproperty("num_bytes", cls=W_Dtype),
     shape = GetSetProperty(W_Dtype.descr_get_shape),
-    itemsize = interp_attrproperty("num_bytes", cls=W_Dtype),
 )
 W_Dtype.typedef.acceptable_as_base_class = False
diff --git a/pypy/module/_numpy/interp_ufuncs.py b/pypy/module/_numpy/interp_ufuncs.py
--- a/pypy/module/_numpy/interp_ufuncs.py
+++ b/pypy/module/_numpy/interp_ufuncs.py
@@ -4,6 +4,7 @@
 from pypy.interpreter.typedef import TypeDef, GetSetProperty, interp_attrproperty
 from pypy.module._numpy import interp_dtype, signature
 from pypy.rlib import jit
+from pypy.rlib.rarithmetic import LONG_BIT
 from pypy.tool.sourcetools import func_with_new_name
 
 
@@ -180,23 +181,56 @@
 
     # Everything promotes to float, and bool promotes to everything.
     if dt2.kind == interp_dtype.FLOATINGLTR or dt1.kind == interp_dtype.BOOLLTR:
+        # Float32 + 8-bit int = Float64
+        if dt2.num == 11 and dt1.num_bytes >= 4:
+            return space.fromcache(interp_dtype.W_Float64Dtype)
         return dt2
 
-    assert False
+    # for now this means mixing signed and unsigned
+    if dt2.kind == interp_dtype.SIGNEDLTR:
+        # if dt2 has a greater number of bytes, then just go with it
+        if dt1.num_bytes < dt2.num_bytes:
+            return dt2
+        # we need to promote both dtypes
+        dtypenum = dt2.num + 2
+    else:
+        # increase to the next signed type (or to float)
+        dtypenum = dt2.num + 1
+        # UInt64 + signed = Float64
+        if dt2.num == 10:
+            dtypenum += 1
+    newdtype = interp_dtype.ALL_DTYPES[dtypenum]
+
+    if newdtype.num_bytes > dt2.num_bytes or newdtype.kind == interp_dtype.FLOATINGLTR:
+        return space.fromcache(newdtype)
+    else:
+        # we only promoted to long on 32-bit or to longlong on 64-bit
+        # this is really for dealing with the Long and Ulong dtypes
+        if LONG_BIT == 32:
+            dtypenum += 2
+        else:
+            dtypenum += 3
+        return space.fromcache(interp_dtype.ALL_DTYPES[dtypenum])
 
 def find_unaryop_result_dtype(space, dt, promote_to_float=False,
     promote_bools=False, promote_to_largest=False):
     if promote_bools and (dt.kind == interp_dtype.BOOLLTR):
         return space.fromcache(interp_dtype.W_Int8Dtype)
     if promote_to_float:
+        if dt.kind == interp_dtype.FLOATINGLTR:
+            return dt
+        if dt.num >= 5:
+            return space.fromcache(interp_dtype.W_Float64Dtype)
         for bytes, dtype in interp_dtype.dtypes_by_num_bytes:
-            if dtype.kind == interp_dtype.FLOATINGLTR and dtype.num_bytes >= dt.num_bytes:
+            if dtype.kind == interp_dtype.FLOATINGLTR and dtype.num_bytes > dt.num_bytes:
                 return space.fromcache(dtype)
     if promote_to_largest:
         if dt.kind == interp_dtype.BOOLLTR or dt.kind == interp_dtype.SIGNEDLTR:
             return space.fromcache(interp_dtype.W_Int64Dtype)
         elif dt.kind == interp_dtype.FLOATINGLTR:
             return space.fromcache(interp_dtype.W_Float64Dtype)
+        elif dt.kind == interp_dtype.UNSIGNEDLTR:
+            return space.fromcache(interp_dtype.W_UInt64Dtype)
         else:
             assert False
     return dt
@@ -205,15 +239,23 @@
     w_type = space.type(w_obj)
 
     bool_dtype = space.fromcache(interp_dtype.W_BoolDtype)
+    long_dtype = space.fromcache(interp_dtype.W_LongDtype)
     int64_dtype = space.fromcache(interp_dtype.W_Int64Dtype)
 
     if space.is_w(w_type, space.w_bool):
-        if current_guess is None:
+        if current_guess is None or current_guess is bool_dtype:
             return bool_dtype
+        return current_guess
     elif space.is_w(w_type, space.w_int):
         if (current_guess is None or current_guess is bool_dtype or
-            current_guess is int64_dtype):
-            return int64_dtype
+            current_guess is long_dtype):
+            return long_dtype
+        return current_guess
+    elif space.is_w(w_type, space.w_long):
+        if (current_guess is None or current_guess is bool_dtype or
+            current_guess is long_dtype or current_guess is int64_dtype):
+             return int64_dtype
+        return current_guess
     return space.fromcache(interp_dtype.W_Float64Dtype)
 
 
@@ -225,7 +267,9 @@
         def impl(res_dtype, lvalue, rvalue):
             res = getattr(res_dtype, op_name)(lvalue, rvalue)
             if comparison_func:
-                res = space.fromcache(interp_dtype.W_BoolDtype).box(res)
+                booldtype = space.fromcache(interp_dtype.W_BoolDtype)
+                assert isinstance(booldtype, interp_dtype.W_BoolDtype)
+                res = booldtype.box(res)
             return res
     return func_with_new_name(impl, ufunc_name)
 
@@ -269,6 +313,7 @@
             ("arccos", "arccos", 1, {"promote_to_float": True}),
             ("arctan", "arctan", 1, {"promote_to_float": True}),
             ("arcsinh", "arcsinh", 1, {"promote_to_float": True}),
+            ("arctanh", "arctanh", 1, {"promote_to_float": True}),
         ]:
             self.add_ufunc(space, *ufunc_def)
 
@@ -278,7 +323,7 @@
 
         identity = extra_kwargs.get("identity")
         if identity is not None:
-            identity = space.fromcache(interp_dtype.W_Int64Dtype).adapt_val(identity)
+            identity = space.fromcache(interp_dtype.W_LongDtype).adapt_val(identity)
         extra_kwargs["identity"] = identity
 
         func = ufunc_dtype_caller(space, ufunc_name, op_name, argcount,
diff --git a/pypy/module/_numpy/test/test_base.py b/pypy/module/_numpy/test/test_base.py
--- a/pypy/module/_numpy/test/test_base.py
+++ b/pypy/module/_numpy/test/test_base.py
@@ -64,18 +64,46 @@
     def test_unaryops(self, space):
         bool_dtype = space.fromcache(interp_dtype.W_BoolDtype)
         int8_dtype = space.fromcache(interp_dtype.W_Int8Dtype)
+        uint8_dtype = space.fromcache(interp_dtype.W_UInt8Dtype)
+        int16_dtype = space.fromcache(interp_dtype.W_Int16Dtype)
+        uint16_dtype = space.fromcache(interp_dtype.W_UInt16Dtype)
         int32_dtype = space.fromcache(interp_dtype.W_Int32Dtype)
+        uint32_dtype = space.fromcache(interp_dtype.W_UInt32Dtype)
+        long_dtype = space.fromcache(interp_dtype.W_LongDtype)
+        ulong_dtype = space.fromcache(interp_dtype.W_ULongDtype)
+        int64_dtype = space.fromcache(interp_dtype.W_Int64Dtype)
+        uint64_dtype = space.fromcache(interp_dtype.W_UInt64Dtype)
+        float32_dtype = space.fromcache(interp_dtype.W_Float32Dtype)
         float64_dtype = space.fromcache(interp_dtype.W_Float64Dtype)
 
-        # Normal rules, everythign returns itself
+        # Normal rules, everything returns itself
         assert find_unaryop_result_dtype(space, bool_dtype) is bool_dtype
         assert find_unaryop_result_dtype(space, int8_dtype) is int8_dtype
+        assert find_unaryop_result_dtype(space, uint8_dtype) is uint8_dtype
+        assert find_unaryop_result_dtype(space, int16_dtype) is int16_dtype
+        assert find_unaryop_result_dtype(space, uint16_dtype) is uint16_dtype
         assert find_unaryop_result_dtype(space, int32_dtype) is int32_dtype
+        assert find_unaryop_result_dtype(space, uint32_dtype) is uint32_dtype
+        assert find_unaryop_result_dtype(space, long_dtype) is long_dtype
+        assert find_unaryop_result_dtype(space, ulong_dtype) is ulong_dtype
+        assert find_unaryop_result_dtype(space, int64_dtype) is int64_dtype
+        assert find_unaryop_result_dtype(space, uint64_dtype) is uint64_dtype
+        assert find_unaryop_result_dtype(space, float32_dtype) is float32_dtype
         assert find_unaryop_result_dtype(space, float64_dtype) is float64_dtype
 
         # Coerce to floats, some of these will eventually be float16, or
         # whatever our smallest float type is.
-        assert find_unaryop_result_dtype(space, bool_dtype, promote_to_float=True) is float64_dtype
-        assert find_unaryop_result_dtype(space, int8_dtype, promote_to_float=True) is float64_dtype
+        assert find_unaryop_result_dtype(space, bool_dtype, promote_to_float=True) is float32_dtype # will be float16 if we ever put that in
+        assert find_unaryop_result_dtype(space, int8_dtype, promote_to_float=True) is float32_dtype # will be float16 if we ever put that in
+        assert find_unaryop_result_dtype(space, uint8_dtype, promote_to_float=True) is float32_dtype # will be float16 if we ever put that in
+        assert find_unaryop_result_dtype(space, int16_dtype, promote_to_float=True) is float32_dtype
+        assert find_unaryop_result_dtype(space, uint16_dtype, promote_to_float=True) is float32_dtype
         assert find_unaryop_result_dtype(space, int32_dtype, promote_to_float=True) is float64_dtype
+        assert find_unaryop_result_dtype(space, uint32_dtype, promote_to_float=True) is float64_dtype
+        assert find_unaryop_result_dtype(space, int64_dtype, promote_to_float=True) is float64_dtype
+        assert find_unaryop_result_dtype(space, uint64_dtype, promote_to_float=True) is float64_dtype
+        assert find_unaryop_result_dtype(space, float32_dtype, promote_to_float=True) is float32_dtype
         assert find_unaryop_result_dtype(space, float64_dtype, promote_to_float=True) is float64_dtype
+
+        # promote bools, happens with sign ufunc
+        assert find_unaryop_result_dtype(space, bool_dtype, promote_bools=True) is int8_dtype
diff --git a/pypy/module/_numpy/test/test_dtypes.py b/pypy/module/_numpy/test/test_dtypes.py
--- a/pypy/module/_numpy/test/test_dtypes.py
+++ b/pypy/module/_numpy/test/test_dtypes.py
@@ -17,6 +17,7 @@
         from _numpy import dtype
 
         assert dtype(bool).num == 0
+        assert dtype(int).num == 7
         assert dtype(long).num == 9
         assert dtype(float).num == 12
 
@@ -81,6 +82,48 @@
             assert isinstance(a[i], (int, long))
             assert a[1] == 1
 
+    def test_overflow(self):
+        from _numpy import array, dtype
+        assert array([128], 'b')[0] == -128
+        assert array([256], 'B')[0] == 0
+        assert array([32768], 'h')[0] == -32768
+        assert array([65536], 'H')[0] == 0
+        if dtype('l').itemsize == 4: # 32-bit
+            raises(OverflowError, "array([2**32/2], 'i')")
+            raises(OverflowError, "array([2**32], 'I')")
+        raises(OverflowError, "array([2**64/2], 'q')")
+        raises(OverflowError, "array([2**64], 'Q')")
+
+    def test_bool_binop_types(self):
+        from _numpy import array, dtype
+        types = ('?','b','B','h','H','i','I','l','L','q','Q','f','d')
+        N = len(types)
+        a = array([True], '?')
+        for t in types:
+            assert (a + array([0], t)).dtype is dtype(t)
+
+    def test_binop_types(self):
+        from _numpy import array, dtype
+        tests = [('b','B','h'), ('b','h','h'), ('b','H','i'), ('b','i','i'),
+                 ('b','l','l'), ('b','q','q'), ('b','Q','d'), ('B','h','h'),
+                 ('B','H','H'), ('B','i','i'), ('B','I','I'), ('B','l','l'),
+                 ('B','L','L'), ('B','q','q'), ('B','Q','Q'), ('h','H','i'),
+                 ('h','i','i'), ('h','l','l'), ('h','q','q'), ('h','Q','d'),
+                 ('H','i','i'), ('H','I','I'), ('H','l','l'), ('H','L','L'),
+                 ('H','q','q'), ('H','Q','Q'), ('i','l','l'), ('i','q','q'),
+                 ('i','Q','d'), ('I','L','L'), ('I','q','q'), ('I','Q','Q'),
+                 ('q','Q','d'), ('b','f','f'), ('B','f','f'), ('h','f','f'),
+                 ('H','f','f'), ('i','f','d'), ('I','f','d'), ('l','f','d'),
+                 ('L','f','d'), ('q','f','d'), ('Q','f','d'), ('q','d','d')]
+        if dtype('i').itemsize == dtype('l').itemsize: # 32-bit
+            tests.extend([('b','I','q'), ('b','L','q'), ('h','I','q'),
+                          ('h','L','q'), ('i','I','q'), ('i','L','q')])
+        else:
+            tests.extend([('b','I','l'), ('b','L','d'), ('h','I','l'),
+                          ('h','L','d'), ('i','I','l'), ('i','L','d')])
+        for d1, d2, dout in tests:
+            assert (array([1], d1) + array([1], d2)).dtype is dtype(dout)
+
     def test_add_int8(self):
         from _numpy import array, dtype
 
@@ -99,6 +142,15 @@
         for i in range(5):
             assert b[i] == i * 2
 
+    def test_add_uint32(self):
+        from _numpy import array, dtype
+
+        a = array(range(5), dtype="I")
+        b = a + a
+        assert b.dtype is dtype("I")
+        for i in range(5):
+            assert b[i] == i * 2
+
     def test_shape(self):
         from _numpy import dtype
 
diff --git a/pypy/module/_numpy/test/test_numarray.py b/pypy/module/_numpy/test/test_numarray.py
--- a/pypy/module/_numpy/test/test_numarray.py
+++ b/pypy/module/_numpy/test/test_numarray.py
@@ -18,7 +18,7 @@
         assert a[13] == 5.3
 
     def test_zeros_tuple_shape(self):
-        from numpy import zeros, ones, empty
+        from _numpy import zeros, ones, empty
         a = zeros((15,))
         assert len(a) == 15
         b = ones((3,))
@@ -560,8 +560,10 @@
         from _numpy import array, dtype
 
         assert array([True]).dtype is dtype(bool)
-        assert array([True, 1]).dtype is dtype(long)
-        assert array([1, 2, 3]).dtype is dtype(long)
+        assert array([True, False]).dtype is dtype(bool)
+        assert array([True, 1]).dtype is dtype(int)
+        assert array([1, 2, 3]).dtype is dtype(int)
+        assert array([1L, 2, 3]).dtype is dtype(long)
         assert array([1.2, True]).dtype is dtype(float)
         assert array([1.2, 5]).dtype is dtype(float)
         assert array([]).dtype is dtype(float)
diff --git a/pypy/module/_numpy/test/test_ufuncs.py b/pypy/module/_numpy/test/test_ufuncs.py
--- a/pypy/module/_numpy/test/test_ufuncs.py
+++ b/pypy/module/_numpy/test/test_ufuncs.py
@@ -234,7 +234,7 @@
             assert b[i] == math.sin(a[i])
 
         a = sin(array([True, False], dtype=bool))
-        assert a[0] == sin(1)
+        assert abs(a[0] - sin(1)) < 1e-7 # a[0] will be less precise
         assert a[1] == 0.0
 
     def test_cos(self):
@@ -300,12 +300,24 @@
 
     def test_arcsinh(self):
         import math
-        from numpy import arcsinh, inf
+        from _numpy import arcsinh
+        from numpy import inf
 
         for v in [inf, -inf, 1.0, math.e]:
             assert math.asinh(v) == arcsinh(v)
         assert math.isnan(arcsinh(float("nan")))
 
+    def test_arctanh(self):
+        import math
+        from numpy import arctanh
+
+        for v in [.99, .5, 0, -.5, -.99]:
+            assert math.atanh(v) == arctanh(v)
+        for v in [2.0, -2.0]:
+            assert math.isnan(arctanh(v))
+        for v in [1.0, -1.0]:
+            assert arctanh(v) == math.copysign(float("inf"), v)
+
     def test_reduce_errors(self):
         from _numpy import sin, add
 
diff --git a/pypy/module/_numpy/test/test_zjit.py b/pypy/module/_numpy/test/test_zjit.py
--- a/pypy/module/_numpy/test/test_zjit.py
+++ b/pypy/module/_numpy/test/test_zjit.py
@@ -1,20 +1,24 @@
 from pypy.jit.metainterp.test.support import LLJitMixin
 from pypy.module._numpy import interp_ufuncs, signature
 from pypy.module._numpy.compile import (numpy_compile, FakeSpace,
-    FloatObject)
-from pypy.module._numpy.interp_dtype import W_Float64Dtype, W_Int64Dtype
+    FloatObject, IntObject)
+from pypy.module._numpy.interp_dtype import W_Int32Dtype, W_Float64Dtype, W_Int64Dtype, W_UInt64Dtype
 from pypy.module._numpy.interp_numarray import (BaseArray, SingleDimArray,
     SingleDimSlice, scalar_w)
 from pypy.rlib.nonconst import NonConstant
 from pypy.rpython.annlowlevel import llstr
 from pypy.rpython.test.test_llinterp import interpret
 
+import py
+
 
 class TestNumpyJIt(LLJitMixin):
     def setup_class(cls):
         cls.space = FakeSpace()
         cls.float64_dtype = cls.space.fromcache(W_Float64Dtype)
         cls.int64_dtype = cls.space.fromcache(W_Int64Dtype)
+        cls.uint64_dtype = cls.space.fromcache(W_UInt64Dtype)
+        cls.int32_dtype = cls.space.fromcache(W_Int32Dtype)
 
     def test_add(self):
         def f(i):
@@ -303,6 +307,31 @@
                           'int_lt': 1, 'guard_true': 1, 'jump': 1})
         assert result == 11.0
 
+    def test_int32_sum(self):
+        py.test.skip("pypy/jit/backend/llimpl.py needs to be changed to "
+                     "deal correctly with int dtypes for this test to "
+                     "work. skip for now until someone feels up to the task")
+        space = self.space
+        float64_dtype = self.float64_dtype
+        int32_dtype = self.int32_dtype
+
+        def f(n):
+            if NonConstant(False):
+                dtype = float64_dtype
+            else:
+                dtype = int32_dtype
+            ar = SingleDimArray(n, dtype=dtype)
+            i = 0
+            while i < n:
+                ar.get_concrete().setitem(i, int32_dtype.box(7))
+                i += 1
+            v = ar.descr_add(space, ar).descr_sum(space)
+            assert isinstance(v, IntObject)
+            return v.intval
+
+        result = self.meta_interp(f, [5], listops=True, backendopt=True)
+        assert result == f(5)
+
 class TestTranslation(object):
     def test_compile(self):
         x = numpy_compile('aa+f*f/a-', 10)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_00_model.py b/pypy/module/pypyjit/test_pypy_c/test_00_model.py
--- a/pypy/module/pypyjit/test_pypy_c/test_00_model.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_00_model.py
@@ -50,8 +50,7 @@
         cmdline.append(str(self.filepath))
         #
         print cmdline, logfile
-        env={'PYPYLOG': 'jit-log-opt,jit-summary:' + str(logfile)}
-        #env={'PYPYLOG': ':' + str(logfile)}
+        env={'PYPYLOG': 'jit-log-opt,jit-log-noopt,jit-summary:' + str(logfile)}
         pipe = subprocess.Popen(cmdline,
                                 env=env,
                                 stdout=subprocess.PIPE,
diff --git a/pypy/module/pypyjit/test_pypy_c/test_call.py b/pypy/module/pypyjit/test_pypy_c/test_call.py
--- a/pypy/module/pypyjit/test_pypy_c/test_call.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_call.py
@@ -366,12 +366,12 @@
             # make sure that the "block" is not allocated
             ...
             i20 = force_token()
-            setfield_gc(p0, i20, descr=<SignedFieldDescr .*PyFrame.vable_token .*>)
             p22 = new_with_vtable(19511408)
             p24 = new_array(1, descr=<GcPtrArrayDescr>)
             p26 = new_with_vtable(ConstClass(W_ListObject))
             p27 = new(descr=<SizeDescr .*>)
             p29 = new_array(0, descr=<GcPtrArrayDescr>)
+            setfield_gc(p0, i20, descr=<SignedFieldDescr .*PyFrame.vable_token .*>)
             setfield_gc(p27, p29, descr=<GcPtrFieldDescr list.items .*>)
             setfield_gc(p26, p27, descr=<.* .*W_ListObject.inst_wrappeditems .*>)
             setarrayitem_gc(p24, 0, p26, descr=<GcPtrArrayDescr>)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_generators.py b/pypy/module/pypyjit/test_pypy_c/test_generators.py
--- a/pypy/module/pypyjit/test_pypy_c/test_generators.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_generators.py
@@ -19,8 +19,8 @@
         assert loop.match_by_id("generator", """
             i16 = force_token()
             p45 = new_with_vtable(ConstClass(W_IntObject))
-            setfield_gc(p45, i29, descr=<SignedFieldDescr .*>)
             i47 = arraylen_gc(p8, descr=<GcPtrArrayDescr>) # Should be removed by backend
             setarrayitem_gc(p8, 0, p45, descr=<GcPtrArrayDescr>)
+            setfield_gc(p45, i29, descr=<SignedFieldDescr .*>)
             jump(..., descr=...)
             """)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_instance.py b/pypy/module/pypyjit/test_pypy_c/test_instance.py
--- a/pypy/module/pypyjit/test_pypy_c/test_instance.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_instance.py
@@ -125,8 +125,8 @@
             i12 = force_token()
             --TICK--
             p20 = new_with_vtable(ConstClass(W_IntObject))
+            setfield_gc(ConstPtr(ptr21), p20, descr=<GcPtrFieldDescr .*TypeCell.inst_w_value .*>)
             setfield_gc(p20, i11, descr=<SignedFieldDescr.*W_IntObject.inst_intval .*>)
-            setfield_gc(ConstPtr(ptr21), p20, descr=<GcPtrFieldDescr .*TypeCell.inst_w_value .*>)
             jump(p0, p1, p2, p3, p4, p20, p6, i7, p20, descr=<Loop.>)
         """)
 
diff --git a/pypy/module/thread/gil.py b/pypy/module/thread/gil.py
--- a/pypy/module/thread/gil.py
+++ b/pypy/module/thread/gil.py
@@ -16,7 +16,7 @@
 
 class GILThreadLocals(OSThreadLocals):
     """A version of OSThreadLocals that enforces a GIL."""
-    ll_GIL = thread.null_ll_lock
+    gil_ready = False
 
     def initialize(self, space):
         # add the GIL-releasing callback as an action on the space
@@ -25,12 +25,10 @@
 
     def setup_threads(self, space):
         """Enable threads in the object space, if they haven't already been."""
-        if not self.ll_GIL:
-            try:
-                self.ll_GIL = thread.allocate_ll_lock()
-            except thread.error:
+        if not self.gil_ready:
+            if not thread.gil_allocate():
                 raise wrap_thread_error(space, "can't allocate GIL")
-            thread.acquire_NOAUTO(self.ll_GIL, True)
+            self.gil_ready = True
             self.enter_thread(space)   # setup the main thread
             result = True
         else:
@@ -44,19 +42,16 @@
         # test_lock_again after the global state was cleared by
         # test_compile_lock.  As a workaround, we repatch these global
         # fields systematically.
-        spacestate.ll_GIL = self.ll_GIL
         invoke_around_extcall(before_external_call, after_external_call)
         return result
 
     def reinit_threads(self, space):
-        if self.ll_GIL:
-            self.ll_GIL = thread.allocate_ll_lock()
-            thread.acquire_NOAUTO(self.ll_GIL, True)
-            self.enter_thread(space)
+        if self.gil_ready:
+            self.gil_ready = False
+            self.setup_threads(space)
 
     def yield_thread(self):
-        thread.yield_thread()  # explicitly release the gil (used by test_gil)
-
+        do_yield_thread()
 
 class GILReleaseAction(PeriodicAsyncAction):
     """An action called every sys.checkinterval bytecodes.  It releases
@@ -64,16 +59,12 @@
     """
 
     def perform(self, executioncontext, frame):
-        # Other threads can run between the release() and the acquire()
-        # implicit in the following external function call (which has
-        # otherwise no effect).
-        thread.yield_thread()
+        do_yield_thread()
 
 
 class SpaceState:
 
     def _freeze_(self):
-        self.ll_GIL = thread.null_ll_lock
         self.action_after_thread_switch = None
         # ^^^ set by AsyncAction.fire_after_thread_switch()
         return False
@@ -95,14 +86,14 @@
     # this function must not raise, in such a way that the exception
     # transformer knows that it cannot raise!
     e = get_errno()
-    thread.release_NOAUTO(spacestate.ll_GIL)
+    thread.gil_release()
     set_errno(e)
 before_external_call._gctransformer_hint_cannot_collect_ = True
 before_external_call._dont_reach_me_in_del_ = True
 
 def after_external_call():
     e = get_errno()
-    thread.acquire_NOAUTO(spacestate.ll_GIL, True)
+    thread.gil_acquire()
     thread.gc_thread_run()
     spacestate.after_thread_switch()
     set_errno(e)
@@ -115,3 +106,18 @@
 # pointers in the shadow stack.  This is necessary because the GIL is
 # not held after the call to before_external_call() or before the call
 # to after_external_call().
+
+def do_yield_thread():
+    # explicitly release the gil, in a way that tries to give more
+    # priority to other threads (as opposed to continuing to run in
+    # the same thread).
+    if thread.gil_yield_thread():
+        thread.gc_thread_run()
+        spacestate.after_thread_switch()
+do_yield_thread._gctransformer_hint_close_stack_ = True
+do_yield_thread._dont_reach_me_in_del_ = True
+do_yield_thread._dont_inline_ = True
+
+# do_yield_thread() needs a different hint: _gctransformer_hint_close_stack_.
+# The *_external_call() functions are themselves called only from the rffi
+# module from a helper function that also has this hint.
diff --git a/pypy/module/thread/ll_thread.py b/pypy/module/thread/ll_thread.py
--- a/pypy/module/thread/ll_thread.py
+++ b/pypy/module/thread/ll_thread.py
@@ -17,7 +17,8 @@
     include_dirs = [str(py.path.local(autopath.pypydir).join('translator', 'c'))],
     export_symbols = ['RPyThreadGetIdent', 'RPyThreadLockInit',
                       'RPyThreadAcquireLock', 'RPyThreadReleaseLock',
-                      'RPyThreadYield',
+                      'RPyGilAllocate', 'RPyGilYieldThread',
+                      'RPyGilRelease', 'RPyGilAcquire',
                       'RPyThreadGetStackSize', 'RPyThreadSetStackSize',
                       'RPyOpaqueDealloc_ThreadLock',
                       'RPyThreadAfterFork']
@@ -69,8 +70,16 @@
                                          [TLOCKP], lltype.Void,
                                          _nowrapper=True)
 
-# this function does nothing apart from releasing the GIL temporarily.
-yield_thread = llexternal('RPyThreadYield', [], lltype.Void, threadsafe=True)
+# these functions manipulate directly the GIL, whose definition does not
+# escape the C code itself
+gil_allocate     = llexternal('RPyGilAllocate', [], lltype.Signed,
+                              _nowrapper=True)
+gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Signed,
+                              _nowrapper=True)
+gil_release      = llexternal('RPyGilRelease', [], lltype.Void,
+                              _nowrapper=True)
+gil_acquire      = llexternal('RPyGilAcquire', [], lltype.Void,
+                              _nowrapper=True)
 
 def allocate_lock():
     return Lock(allocate_ll_lock())
diff --git a/pypy/module/thread/test/test_gil.py b/pypy/module/thread/test/test_gil.py
--- a/pypy/module/thread/test/test_gil.py
+++ b/pypy/module/thread/test/test_gil.py
@@ -30,19 +30,34 @@
     use_threads = True
     bigtest = False
 
-    def test_one_thread(self):
+    def test_one_thread(self, skew=+1):
+        from pypy.rlib.debug import debug_print
         if self.bigtest:
-            N = 1000000
+            N = 100000
+            skew *= 25000
         else:
             N = 100
+            skew *= 25
         space = FakeSpace()
         class State:
             pass
         state = State()
-        def runme():
-            for i in range(N):
+        def runme(main=False):
+            j = 0
+            for i in range(N + [-skew, skew][main]):
+                state.datalen1 += 1   # try to crash if the GIL is not
+                state.datalen2 += 1   # correctly acquired
                 state.data.append((thread.get_ident(), i))
+                state.datalen3 += 1
+                state.datalen4 += 1
+                assert state.datalen1 == len(state.data)
+                assert state.datalen2 == len(state.data)
+                assert state.datalen3 == len(state.data)
+                assert state.datalen4 == len(state.data)
+                debug_print(main, i, state.datalen4)
                 state.threadlocals.yield_thread()
+                assert i == j
+                j += 1
         def bootstrap():
             try:
                 runme()
@@ -50,20 +65,26 @@
                 thread.gc_thread_die()
         def f():
             state.data = []
+            state.datalen1 = 0
+            state.datalen2 = 0
+            state.datalen3 = 0
+            state.datalen4 = 0
             state.threadlocals = gil.GILThreadLocals()
             state.threadlocals.setup_threads(space)
             thread.gc_thread_prepare()
             subident = thread.start_new_thread(bootstrap, ())
             mainident = thread.get_ident()
-            runme()
+            runme(True)
             still_waiting = 3000
             while len(state.data) < 2*N:
+                debug_print(len(state.data))
                 if not still_waiting:
                     raise ValueError("time out")
                 still_waiting -= 1
                 if not we_are_translated(): gil.before_external_call()
                 time.sleep(0.01)
                 if not we_are_translated(): gil.after_external_call()
+            debug_print("leaving!")
             i1 = i2 = 0
             for tid, i in state.data:
                 if tid == mainident:
@@ -72,14 +93,17 @@
                     assert i == i2; i2 += 1
                 else:
                     assert 0
-            assert i1 == N
-            assert i2 == N
+            assert i1 == N + skew
+            assert i2 == N - skew
             return len(state.data)
 
         fn = self.getcompiled(f, [])
         res = fn()
         assert res == 2*N
 
+    def test_one_thread_rev(self):
+        self.test_one_thread(skew=-1)
+
 
 class TestRunDirectly(GILTests):
     def getcompiled(self, f, argtypes):
diff --git a/pypy/module/thread/test/test_thread.py b/pypy/module/thread/test/test_thread.py
--- a/pypy/module/thread/test/test_thread.py
+++ b/pypy/module/thread/test/test_thread.py
@@ -225,7 +225,8 @@
 
         def busy_wait():
             for x in range(1000):
-                time.sleep(0.01)
+                print 'tick...', x  # <-force the GIL to be released, as
+                time.sleep(0.01)    #   time.sleep doesn't do non-translated
 
         # This is normally called by app_main.py
         signal.signal(signal.SIGINT, signal.default_int_handler)
diff --git a/pypy/rpython/lltypesystem/opimpl.py b/pypy/rpython/lltypesystem/opimpl.py
--- a/pypy/rpython/lltypesystem/opimpl.py
+++ b/pypy/rpython/lltypesystem/opimpl.py
@@ -357,7 +357,7 @@
 
 def op_cast_float_to_uint(f):
     assert type(f) is float
-    return r_uint(int(f))
+    return r_uint(long(f))
 
 def op_cast_float_to_longlong(f):
     assert type(f) is float
@@ -369,7 +369,7 @@
 
 def op_cast_float_to_ulonglong(f):
     assert type(f) is float
-    return r_ulonglong(r_longlong(f))
+    return r_ulonglong(long(f))
 
 def op_cast_char_to_int(b):
     assert type(b) is str and len(b) == 1
diff --git a/pypy/rpython/lltypesystem/test/test_lloperation.py b/pypy/rpython/lltypesystem/test/test_lloperation.py
--- a/pypy/rpython/lltypesystem/test/test_lloperation.py
+++ b/pypy/rpython/lltypesystem/test/test_lloperation.py
@@ -5,6 +5,7 @@
 from pypy.rpython.llinterp import LLFrame
 from pypy.rpython.test.test_llinterp import interpret
 from pypy.rpython import rclass
+from pypy.rlib.rarithmetic import LONGLONG_MASK, r_longlong, r_ulonglong
 
 LL_INTERP_OPERATIONS = [name[3:] for name in LLFrame.__dict__.keys()
                                  if name.startswith('op_')]
@@ -133,6 +134,14 @@
         py.test.raises(TypeError, llop.getinteriorfield,
                        lltype.Signed, s3, 'y')
 
+def test_cast_float_to_ulonglong():
+    f = 12350000000000000000.0
+    py.test.raises(OverflowError, r_longlong, f)
+    r_longlong(f / 2)   # does not raise OverflowError
+    #
+    x = llop.cast_float_to_ulonglong(lltype.UnsignedLongLong, f)
+    assert x == r_ulonglong(f)
+
 # ___________________________________________________________________________
 # This tests that the LLInterpreter and the LL_OPERATIONS tables are in sync.
 
diff --git a/pypy/translator/c/gcc/trackgcroot.py b/pypy/translator/c/gcc/trackgcroot.py
--- a/pypy/translator/c/gcc/trackgcroot.py
+++ b/pypy/translator/c/gcc/trackgcroot.py
@@ -486,6 +486,8 @@
         'paddq', 'pinsr',
         # zero-extending moves should not produce GC pointers
         'movz', 
+        # locked operations should not move GC pointers, at least so far
+        'lock',
         ])
 
     # a partial list is hopefully good enough for now; it's all to support
diff --git a/pypy/translator/c/src/thread.h b/pypy/translator/c/src/thread.h
--- a/pypy/translator/c/src/thread.h
+++ b/pypy/translator/c/src/thread.h
@@ -37,14 +37,9 @@
 
 #endif
 
-/* common helper: this does nothing, but is called with the GIL released.
-   This gives other threads a chance to grab the GIL and run. */
-void RPyThreadYield(void);
-
-#ifndef PYPY_NOT_MAIN_FILE
-void RPyThreadYield(void)
-{
-}
-#endif
+long RPyGilAllocate(void);
+long RPyGilYieldThread(void);
+void RPyGilRelease(void);
+void RPyGilAcquire(void);
 
 #endif
diff --git a/pypy/translator/c/src/thread_nt.h b/pypy/translator/c/src/thread_nt.h
--- a/pypy/translator/c/src/thread_nt.h
+++ b/pypy/translator/c/src/thread_nt.h
@@ -221,4 +221,57 @@
 #define RPyThreadTLS_Set(key, value)	TlsSetValue(key, value)
 
 
+/************************************************************/
+/* GIL code                                                 */
+/************************************************************/
+
+static volatile LONG pending_acquires = -1;
+static CRITICAL_SECTION mutex_gil;
+static HANDLE cond_gil;
+
+long RPyGilAllocate(void)
+{
+    pending_acquires = 0;
+    InitializeCriticalSection(&mutex_gil);
+    EnterCriticalSection(&mutex_gil);
+    cond_gil = CreateEvent (NULL, FALSE, FALSE, NULL);
+    return 1;
+}
+
+long RPyGilYieldThread(void)
+{
+    /* can be called even before RPyGilAllocate(), but in this case,
+       pending_acquires will be -1 */
+    if (pending_acquires <= 0)
+        return 0;
+    InterlockedIncrement(&pending_acquires);
+    PulseEvent(&cond_gil);
+
+    /* hack: the three following lines do a pthread_cond_wait(), and
+       normally specifying a timeout of INFINITE would be fine.  But the
+       first and second operations are not done atomically, so there is a
+       (small) risk that PulseEvent misses the WaitForSingleObject().
+       In this case the process will just sleep a few milliseconds. */
+    LeaveCriticalSection(&mutex_gil);
+    WaitForSingleObject(&cond_gil, 15);
+    EnterCriticalSection(&mutex_gil);
+
+    InterlockedDecrement(&pending_acquires);
+    return 1;
+}
+
+void RPyGilRelease(void)
+{
+    LeaveCriticalSection(&mutex_gil);
+    PulseEvent(&cond_gil);
+}
+
+void RPyGilAcquire(void)
+{
+    InterlockedIncrement(&pending_acquires);
+    EnterCriticalSection(&mutex_gil);
+    InterlockedDecrement(&pending_acquires);
+}
+
+
 #endif /* PYPY_NOT_MAIN_FILE */
diff --git a/pypy/translator/c/src/thread_pthread.h b/pypy/translator/c/src/thread_pthread.h
--- a/pypy/translator/c/src/thread_pthread.h
+++ b/pypy/translator/c/src/thread_pthread.h
@@ -12,6 +12,7 @@
 #include <signal.h>
 #include <stdio.h>
 #include <errno.h>
+#include <assert.h>
 
 /* The following is hopefully equivalent to what CPython does
    (which is trying to compile a snippet of code using it) */
@@ -459,4 +460,113 @@
 #define RPyThreadTLS_Set(key, value)	pthread_setspecific(key, value)
 
 
+/************************************************************/
+/* GIL code                                                 */
+/************************************************************/
+
+#ifdef __llvm__
+#  define HAS_ATOMIC_ADD
+#endif
+
+#ifdef __GNUC__
+#  if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+#    define HAS_ATOMIC_ADD
+#  endif
+#endif
+
+#ifdef HAS_ATOMIC_ADD
+#  define atomic_add __sync_fetch_and_add
+#else
+#  if defined(__amd64__)
+#    define atomic_add(ptr, value)  asm volatile ("lock addq %0, %1"        \
+                                 : : "ri"(value), "m"(*(ptr)) : "memory")
+#  elif defined(__i386__)
+#    define atomic_add(ptr, value)  asm volatile ("lock addl %0, %1"        \
+                                 : : "ri"(value), "m"(*(ptr)) : "memory")
+#  else
+#    error "Please use gcc >= 4.1 or write a custom 'asm' for your CPU."
+#  endif
+#endif
+
+#define ASSERT_STATUS(call)                             \
+    if (call != 0) {                                    \
+        fprintf(stderr, "Fatal error: " #call "\n");    \
+        abort();                                        \
+    }
+
+static void _debug_print(const char *msg)
+{
+#if 0
+    int col = (int)pthread_self();
+    col = 31 + ((col / 8) % 8);
+    fprintf(stderr, "\033[%dm%s\033[0m", col, msg);
+#endif
+}
+
+static volatile long pending_acquires = -1;
+static pthread_mutex_t mutex_gil = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cond_gil = PTHREAD_COND_INITIALIZER;
+
+static void assert_has_the_gil(void)
+{
+#ifdef RPY_ASSERT
+    assert(pthread_mutex_trylock(&mutex_gil) != 0);
+    assert(pending_acquires >= 0);
+#endif
+}
+
+long RPyGilAllocate(void)
+{
+    _debug_print("RPyGilAllocate\n");
+    pending_acquires = 0;
+    pthread_mutex_trylock(&mutex_gil);
+    assert_has_the_gil();
+    return 1;
+}
+
+long RPyGilYieldThread(void)
+{
+    /* can be called even before RPyGilAllocate(), but in this case,
+       pending_acquires will be -1 */
+#ifdef RPY_ASSERT
+    if (pending_acquires >= 0)
+        assert_has_the_gil();
+#endif
+    if (pending_acquires <= 0)
+        return 0;
+    atomic_add(&pending_acquires, 1L);
+    _debug_print("{");
+    ASSERT_STATUS(pthread_cond_signal(&cond_gil));
+    ASSERT_STATUS(pthread_cond_wait(&cond_gil, &mutex_gil));
+    _debug_print("}");
+    atomic_add(&pending_acquires, -1L);
+    assert_has_the_gil();
+    return 1;
+}
+
+void RPyGilRelease(void)
+{
+    _debug_print("RPyGilRelease\n");
+#ifdef RPY_ASSERT
+    assert(pending_acquires >= 0);
+#endif
+    assert_has_the_gil();
+    ASSERT_STATUS(pthread_mutex_unlock(&mutex_gil));
+    ASSERT_STATUS(pthread_cond_signal(&cond_gil));
+}
+
+void RPyGilAcquire(void)
+{
+    _debug_print("about to RPyGilAcquire...\n");
+#ifdef RPY_ASSERT
+    assert(pending_acquires >= 0);
+#endif
+    atomic_add(&pending_acquires, 1L);
+    ASSERT_STATUS(pthread_mutex_lock(&mutex_gil));
+    atomic_add(&pending_acquires, -1L);
+    assert_has_the_gil();
+    _debug_print("RPyGilAcquire\n");
+}
+
+
 #endif /* PYPY_NOT_MAIN_FILE */