[pypy-commit] pypy jit-duplicated_short_boxes: hg merge default

hakanardo noreply at buildbot.pypy.org
Mon Aug 29 21:21:51 CEST 2011


Author: Hakan Ardo <hakan at debian.org>
Branch: jit-duplicated_short_boxes
Changeset: r46904:2f08d7f8b5bf
Date: 2011-08-29 21:17 +0200
http://bitbucket.org/pypy/pypy/changeset/2f08d7f8b5bf/

Log:	hg merge default

diff too long, truncating to 10000 out of 17774 lines

diff --git a/lib_pypy/_ctypes/function.py b/lib_pypy/_ctypes/function.py
--- a/lib_pypy/_ctypes/function.py
+++ b/lib_pypy/_ctypes/function.py
@@ -469,7 +469,8 @@
         newargs = []
         for argtype, arg in zip(argtypes, args):
             param = argtype.from_param(arg)
-            if argtype._type_ == 'P': # special-case for c_void_p
+            _type_ = getattr(argtype, '_type_', None)
+            if _type_ == 'P': # special-case for c_void_p
                 param = param._get_buffer_value()
             elif self._is_primitive(argtype):
                 param = param.value
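
For context, only the simple ctypes types carry a one-letter ``_type_`` code;
``Structure`` subclasses used as ``argtypes`` do not, so the unconditional
``argtype._type_`` lookup could raise ``AttributeError``.  A minimal sketch of
the distinction (``Point`` is just a made-up example type)::

    import ctypes

    class Point(ctypes.Structure):
        _fields_ = [("x", ctypes.c_int), ("y", ctypes.c_int)]

    # simple types have a type code, structures do not
    print getattr(ctypes.c_void_p, '_type_', None)   # 'P'  -> c_void_p special case
    print getattr(Point, '_type_', None)             # None -> no special case
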
diff --git a/lib_pypy/_ctypes/structure.py b/lib_pypy/_ctypes/structure.py
--- a/lib_pypy/_ctypes/structure.py
+++ b/lib_pypy/_ctypes/structure.py
@@ -169,6 +169,8 @@
 
     def from_address(self, address):
         instance = StructOrUnion.__new__(self)
+        if isinstance(address, _rawffi.StructureInstance):
+            address = address.buffer
         instance.__dict__['_buffer'] = self._ffistruct.fromaddress(address)
         return instance
 
diff --git a/lib_pypy/greenlet.py b/lib_pypy/greenlet.py
--- a/lib_pypy/greenlet.py
+++ b/lib_pypy/greenlet.py
@@ -1,1 +1,138 @@
-from _stackless import greenlet
+import _continuation, sys
+
+
+# ____________________________________________________________
+# Exceptions
+
+class GreenletExit(Exception):
+    """This special exception does not propagate to the parent greenlet; it
+can be used to kill a single greenlet."""
+
+error = _continuation.error
+
+# ____________________________________________________________
+# Helper function
+
+def getcurrent():
+    "Returns the current greenlet (i.e. the one which called this function)."
+    try:
+        return _tls.current
+    except AttributeError:
+        # first call in this thread: current == main
+        _green_create_main()
+        return _tls.current
+
+# ____________________________________________________________
+# The 'greenlet' class
+
+_continulet = _continuation.continulet
+
+class greenlet(_continulet):
+    getcurrent = staticmethod(getcurrent)
+    error = error
+    GreenletExit = GreenletExit
+    __main = False
+    __started = False
+
+    def __new__(cls, *args, **kwds):
+        self = _continulet.__new__(cls)
+        self.parent = getcurrent()
+        return self
+
+    def __init__(self, run=None, parent=None):
+        if run is not None:
+            self.run = run
+        if parent is not None:
+            self.parent = parent
+
+    def switch(self, *args):
+        "Switch execution to this greenlet, optionally passing the values "
+        "given as argument(s).  Returns the value passed when switching back."
+        return self.__switch(_continulet.switch, args)
+
+    def throw(self, typ=GreenletExit, val=None, tb=None):
+        "raise exception in greenlet, return value passed when switching back"
+        return self.__switch(_continulet.throw, typ, val, tb)
+
+    def __switch(target, unbound_method, *args):
+        current = getcurrent()
+        #
+        while not target:
+            if not target.__started:
+                _continulet.__init__(target, _greenlet_start, *args)
+                args = ()
+                target.__started = True
+                break
+            # already done, go to the parent instead
+            # (NB. infinite loop possible, but unlikely, unless you mess
+            # up the 'parent' explicitly.  Good enough, because a Ctrl-C
+            # will show that the program is caught in this loop here.)
+            target = target.parent
+        #
+        try:
+            if current.__main:
+                if target.__main:
+                    # switch from main to main
+                    if unbound_method == _continulet.throw:
+                        raise args[0], args[1], args[2]
+                    (args,) = args
+                else:
+                    # enter from main to target
+                    args = unbound_method(target, *args)
+            else:
+                if target.__main:
+                    # leave to go to target=main
+                    args = unbound_method(current, *args)
+                else:
+                    # switch from non-main to non-main
+                    args = unbound_method(current, *args, to=target)
+        except GreenletExit, e:
+            args = (e,)
+        finally:
+            _tls.current = current
+        #
+        if len(args) == 1:
+            return args[0]
+        else:
+            return args
+
+    def __nonzero__(self):
+        return self.__main or _continulet.is_pending(self)
+
+    @property
+    def dead(self):
+        return self.__started and not self
+
+    @property
+    def gr_frame(self):
+        raise NotImplementedError("attribute 'gr_frame' of greenlet objects")
+
+# ____________________________________________________________
+# Internal stuff
+
+try:
+    from thread import _local
+except ImportError:
+    class _local(object):    # assume no threads
+        pass
+
+_tls = _local()
+
+def _green_create_main():
+    # create the main greenlet for this thread
+    _tls.current = None
+    gmain = greenlet.__new__(greenlet)
+    gmain._greenlet__main = True
+    gmain._greenlet__started = True
+    assert gmain.parent is None
+    _tls.main = gmain
+    _tls.current = gmain
+
+def _greenlet_start(greenlet, args):
+    _tls.current = greenlet
+    try:
+        res = greenlet.run(*args)
+    finally:
+        if greenlet.parent is not _tls.main:
+            _continuation.permute(greenlet, greenlet.parent)
+    return (res,)
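
For reference, a minimal usage sketch of the new pure-Python greenlet module;
it assumes a pypy-c built with the ``_continuation`` module, and ``child``,
``main`` and ``g`` are just example names::

    from greenlet import greenlet

    def child(x):
        print "child got", x
        y = main.switch(x + 1)      # suspend the child, resume the main greenlet
        print "child resumed with", y
        return "done"

    main = greenlet.getcurrent()
    g = greenlet(child)
    print g.switch(41)              # starts the child; prints 42 when it switches back
    print g.switch("again")         # resumes the child; prints 'done' when it returns
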
diff --git a/pypy/annotation/description.py b/pypy/annotation/description.py
--- a/pypy/annotation/description.py
+++ b/pypy/annotation/description.py
@@ -399,9 +399,7 @@
                 if b1 is object:
                     continue
                 if b1.__dict__.get('_mixin_', False):
-                    assert b1.__bases__ == () or b1.__bases__ == (object,), (
-                        "mixin class %r should have no base" % (b1,))
-                    self.add_sources_for_class(b1, mixin=True)
+                    self.add_mixin(b1)
                 else:
                     assert base is object, ("multiple inheritance only supported "
                                             "with _mixin_: %r" % (cls,))
@@ -469,6 +467,15 @@
                 return
         self.classdict[name] = Constant(value)
 
+    def add_mixin(self, base):
+        for subbase in base.__bases__:
+            if subbase is object:
+                continue
+            assert subbase.__dict__.get("_mixin_", False), ("Mixin class %r has non"
+                "mixin base class %r" % (base, subbase))
+            self.add_mixin(subbase)
+        self.add_sources_for_class(base, mixin=True)
+
     def add_sources_for_class(self, cls, mixin=False):
         for name, value in cls.__dict__.items():
             self.add_source_attribute(name, value, mixin)
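
The relaxed check means that a ``_mixin_`` class may now itself derive from
other mixin classes, as long as every base in the chain is a mixin too.  A
small sketch with made-up RPython-style classes::

    class BaseMixin(object):
        _mixin_ = True
        def double(self):
            return self.value * 2

    class ExtendedMixin(BaseMixin):    # a mixin with a (mixin) base class
        _mixin_ = True
        def quadruple(self):
            return self.double() * 2

    class Concrete(ExtendedMixin):     # regular class pulling in both mixins
        def __init__(self, value):
            self.value = value
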
diff --git a/pypy/config/makerestdoc.py b/pypy/config/makerestdoc.py
--- a/pypy/config/makerestdoc.py
+++ b/pypy/config/makerestdoc.py
@@ -134,7 +134,7 @@
         for child in self._children:
             subpath = fullpath + "." + child._name
             toctree.append(subpath)
-        content.add(Directive("toctree", *toctree, maxdepth=4))
+        content.add(Directive("toctree", *toctree, **{'maxdepth': 4}))
         content.join(
             ListItem(Strong("name:"), self._name),
             ListItem(Strong("description:"), self.doc))
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -33,7 +33,8 @@
      "struct", "_hashlib", "_md5", "_sha", "_minimal_curses", "cStringIO",
      "thread", "itertools", "pyexpat", "_ssl", "cpyext", "array",
      "_bisect", "binascii", "_multiprocessing", '_warnings',
-     "_collections", "_multibytecodec", "micronumpy", "_ffi"]
+     "_collections", "_multibytecodec", "micronumpy", "_ffi",
+     "_continuation"]
 ))
 
 translation_modules = default_modules.copy()
@@ -99,6 +100,7 @@
     "_ssl"      : ["pypy.module._ssl.interp_ssl"],
     "_hashlib"  : ["pypy.module._ssl.interp_ssl"],
     "_minimal_curses": ["pypy.module._minimal_curses.fficurses"],
+    "_continuation": ["pypy.rlib.rstacklet"],
     }
 
 def get_module_validator(modname):
diff --git a/pypy/config/test/test_config.py b/pypy/config/test/test_config.py
--- a/pypy/config/test/test_config.py
+++ b/pypy/config/test/test_config.py
@@ -1,5 +1,5 @@
 from pypy.config.config import *
-import py
+import py, sys
 
 def make_description():
     gcoption = ChoiceOption('name', 'GC name', ['ref', 'framework'], 'ref')
@@ -69,13 +69,15 @@
     attrs = dir(config)
     assert '__repr__' in attrs        # from the type
     assert '_cfgimpl_values' in attrs # from self
-    assert 'gc' in attrs              # custom attribute
-    assert 'objspace' in attrs        # custom attribute
+    if sys.version_info >= (2, 6):
+        assert 'gc' in attrs              # custom attribute
+        assert 'objspace' in attrs        # custom attribute
     #
     attrs = dir(config.gc)
-    assert 'name' in attrs
-    assert 'dummy' in attrs
-    assert 'float' in attrs
+    if sys.version_info >= (2, 6):
+        assert 'name' in attrs
+        assert 'dummy' in attrs
+        assert 'float' in attrs
 
 def test_arbitrary_option():
     descr = OptionDescription("top", "", [
diff --git a/pypy/config/translationoption.py b/pypy/config/translationoption.py
--- a/pypy/config/translationoption.py
+++ b/pypy/config/translationoption.py
@@ -28,10 +28,9 @@
 
 translation_optiondescription = OptionDescription(
         "translation", "Translation Options", [
-    BoolOption("stackless", "enable stackless features during compilation",
-               default=False, cmdline="--stackless",
-               requires=[("translation.type_system", "lltype"),
-                         ("translation.gcremovetypeptr", False)]),  # XXX?
+    BoolOption("continuation", "enable single-shot continuations",
+               default=False, cmdline="--continuation",
+               requires=[("translation.type_system", "lltype")]),
     ChoiceOption("type_system", "Type system to use when RTyping",
                  ["lltype", "ootype"], cmdline=None, default="lltype",
                  requires={
@@ -70,7 +69,8 @@
                      "statistics": [("translation.gctransformer", "framework")],
                      "generation": [("translation.gctransformer", "framework")],
                      "hybrid": [("translation.gctransformer", "framework")],
-                     "boehm": [("translation.gctransformer", "boehm")],
+                     "boehm": [("translation.gctransformer", "boehm"),
+                               ("translation.continuation", False)],  # breaks
                      "markcompact": [("translation.gctransformer", "framework")],
                      "minimark": [("translation.gctransformer", "framework")],
                      },
@@ -389,8 +389,6 @@
             config.translation.suggest(withsmallfuncsets=5)
         elif word == 'jit':
             config.translation.suggest(jit=True)
-            if config.translation.stackless:
-                raise NotImplementedError("JIT conflicts with stackless for now")
         elif word == 'removetypeptr':
             config.translation.suggest(gcremovetypeptr=True)
         else:
diff --git a/pypy/doc/_ref.txt b/pypy/doc/_ref.txt
--- a/pypy/doc/_ref.txt
+++ b/pypy/doc/_ref.txt
@@ -1,11 +1,10 @@
 .. _`ctypes_configure/doc/sample.py`: https://bitbucket.org/pypy/pypy/src/default/ctypes_configure/doc/sample.py
 .. _`demo/`: https://bitbucket.org/pypy/pypy/src/default/demo/
-.. _`demo/pickle_coroutine.py`: https://bitbucket.org/pypy/pypy/src/default/demo/pickle_coroutine.py
 .. _`lib-python/`: https://bitbucket.org/pypy/pypy/src/default/lib-python/
 .. _`lib-python/2.7/dis.py`: https://bitbucket.org/pypy/pypy/src/default/lib-python/2.7/dis.py
 .. _`lib_pypy/`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/
+.. _`lib_pypy/greenlet.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/greenlet.py
 .. _`lib_pypy/pypy_test/`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/pypy_test/
-.. _`lib_pypy/stackless.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/stackless.py
 .. _`lib_pypy/tputil.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/tputil.py
 .. _`pypy/annotation`:
 .. _`pypy/annotation/`: https://bitbucket.org/pypy/pypy/src/default/pypy/annotation/
@@ -55,7 +54,6 @@
 .. _`pypy/module`:
 .. _`pypy/module/`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/
 .. _`pypy/module/__builtin__/__init__.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/__builtin__/__init__.py
-.. _`pypy/module/_stackless/test/test_composable_coroutine.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/_stackless/test/test_composable_coroutine.py
 .. _`pypy/objspace`:
 .. _`pypy/objspace/`: https://bitbucket.org/pypy/pypy/src/default/pypy/objspace/
 .. _`pypy/objspace/dump.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/objspace/dump.py
@@ -117,6 +115,7 @@
 .. _`pypy/translator/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/
 .. _`pypy/translator/backendopt/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/backendopt/
 .. _`pypy/translator/c/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/c/
+.. _`pypy/translator/c/src/stacklet/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/c/src/stacklet/
 .. _`pypy/translator/cli/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/cli/
 .. _`pypy/translator/goal/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/goal/
 .. _`pypy/translator/jvm/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/jvm/
diff --git a/pypy/doc/architecture.rst b/pypy/doc/architecture.rst
--- a/pypy/doc/architecture.rst
+++ b/pypy/doc/architecture.rst
@@ -153,7 +153,7 @@
 
 * Optionally, `various transformations`_ can then be applied which, for
   example, perform optimizations such as inlining, add capabilities
-  such as stackless_-style concurrency, or insert code for the
+  such as stackless-style concurrency (deprecated), or insert code for the
   `garbage collector`_.
 
 * Then, the graphs are converted to source code for the target platform
@@ -255,7 +255,6 @@
 
 .. _Python: http://docs.python.org/reference/
 .. _Psyco: http://psyco.sourceforge.net
-.. _stackless: stackless.html
 .. _`generate Just-In-Time Compilers`: jit/index.html
 .. _`JIT Generation in PyPy`: jit/index.html
 .. _`implement your own interpreter`: http://morepypy.blogspot.com/2011/04/tutorial-writing-interpreter-with-pypy.html
diff --git a/pypy/doc/config/objspace.usemodules._stackless.txt b/pypy/doc/config/objspace.usemodules._continuation.txt
copy from pypy/doc/config/objspace.usemodules._stackless.txt
copy to pypy/doc/config/objspace.usemodules._continuation.txt
--- a/pypy/doc/config/objspace.usemodules._stackless.txt
+++ b/pypy/doc/config/objspace.usemodules._continuation.txt
@@ -1,6 +1,4 @@
-Use the '_stackless' module. 
+Use the '_continuation' module. 
 
-Exposes the `stackless` primitives, and also implies a stackless build. 
-See also :config:`translation.stackless`.
-
-.. _`stackless`: ../stackless.html
+Exposes the `continulet` app-level primitives.
+See also :config:`translation.continuation`.
diff --git a/pypy/doc/config/objspace.usemodules._stackless.txt b/pypy/doc/config/objspace.usemodules._stackless.txt
--- a/pypy/doc/config/objspace.usemodules._stackless.txt
+++ b/pypy/doc/config/objspace.usemodules._stackless.txt
@@ -1,6 +1,1 @@
-Use the '_stackless' module. 
-
-Exposes the `stackless` primitives, and also implies a stackless build. 
-See also :config:`translation.stackless`.
-
-.. _`stackless`: ../stackless.html
+Deprecated.
diff --git a/pypy/doc/config/translation.stackless.txt b/pypy/doc/config/translation.continuation.txt
rename from pypy/doc/config/translation.stackless.txt
rename to pypy/doc/config/translation.continuation.txt
--- a/pypy/doc/config/translation.stackless.txt
+++ b/pypy/doc/config/translation.continuation.txt
@@ -1,5 +1,2 @@
-Run the `stackless transform`_ on each generated graph, which enables the use
-of coroutines at RPython level and the "stackless" module when translating
-PyPy.
-
-.. _`stackless transform`: ../stackless.html
+Enable the use of a stackless-like primitive called "stacklet".
+In PyPy, this is exposed at app-level by the "_continuation" module.
diff --git a/pypy/doc/cpython_differences.rst b/pypy/doc/cpython_differences.rst
--- a/pypy/doc/cpython_differences.rst
+++ b/pypy/doc/cpython_differences.rst
@@ -24,6 +24,7 @@
     _bisect
     _codecs
     _collections
+    `_continuation`_
     `_ffi`_
     _hashlib
     _io
@@ -84,10 +85,6 @@
 
     _winreg
 
-  Extra module with Stackless_ only:
-
-    _stackless
-
   Note that only some of these modules are built-in in a typical
   CPython installation, and the rest is from non built-in extension
   modules.  This means that e.g. ``import parser`` will, on CPython,
@@ -108,11 +105,11 @@
 
 .. the nonstandard modules are listed below...
 .. _`__pypy__`: __pypy__-module.html
+.. _`_continuation`: stackless.html
 .. _`_ffi`: ctypes-implementation.html
 .. _`_rawffi`: ctypes-implementation.html
 .. _`_minimal_curses`: config/objspace.usemodules._minimal_curses.html
 .. _`cpyext`: http://morepypy.blogspot.com/2010/04/using-cpython-extension-modules-with.html
-.. _Stackless: stackless.html
 
 
 Differences related to garbage collection strategies
diff --git a/pypy/doc/getting-started-python.rst b/pypy/doc/getting-started-python.rst
--- a/pypy/doc/getting-started-python.rst
+++ b/pypy/doc/getting-started-python.rst
@@ -67,7 +67,6 @@
    * ``libssl-dev`` (for the optional ``_ssl`` module)
    * ``libgc-dev`` (for the Boehm garbage collector: only needed when translating with `--opt=0, 1` or `size`)
    * ``python-sphinx`` (for the optional documentation build.  You need version 1.0.7 or later)
-   * ``python-greenlet`` (for the optional stackless support in interpreted mode/testing)
 
 
 3. Translation is time-consuming -- 45 minutes on a very fast machine --
@@ -120,19 +119,8 @@
 Installation_ below.
 
 The ``translate.py`` script takes a very large number of options controlling
-what to translate and how.  See ``translate.py -h``. Some of the more
-interesting options (but for now incompatible with the JIT) are:
-
-   * ``--stackless``: this produces a pypy-c that includes features
-     inspired by `Stackless Python <http://www.stackless.com>`__.
-
-   * ``--gc=boehm|ref|marknsweep|semispace|generation|hybrid|minimark``:
-     choose between using
-     the `Boehm-Demers-Weiser garbage collector`_, our reference
-     counting implementation or one of own collector implementations
-     (the default depends on the optimization level but is usually
-     ``minimark``).
-
+what to translate and how.  See ``translate.py -h``. The default options
+should be suitable for almost everybody by now.
 Find a more detailed description of the various options in our `configuration
 sections`_.
 
diff --git a/pypy/doc/how-to-release.rst b/pypy/doc/how-to-release.rst
--- a/pypy/doc/how-to-release.rst
+++ b/pypy/doc/how-to-release.rst
@@ -42,7 +42,6 @@
     JIT: windows, linux, os/x
     no JIT: windows, linux, os/x
     sandbox: linux, os/x
-    stackless: windows, linux, os/x
 
 * write release announcement pypy/doc/release-x.y(.z).txt
   the release announcement should contain a direct link to the download page
diff --git a/pypy/doc/index.rst b/pypy/doc/index.rst
--- a/pypy/doc/index.rst
+++ b/pypy/doc/index.rst
@@ -35,7 +35,7 @@
 
   * `Differences between PyPy and CPython`_
   * `What PyPy can do for your objects`_
-  * `Stackless and coroutines`_
+  * `Continulets and greenlets`_
   * `JIT Generation in PyPy`_ 
   * `Sandboxing Python code`_
 
@@ -292,8 +292,6 @@
 
 `pypy/translator/jvm/`_            the Java backend
 
-`pypy/translator/stackless/`_      the `Stackless Transform`_
-
 `pypy/translator/tool/`_           helper tools for translation, including the Pygame
                                    `graph viewer`_
 
@@ -318,7 +316,7 @@
 .. _`transparent proxies`: objspace-proxies.html#tproxy
 .. _`Differences between PyPy and CPython`: cpython_differences.html
 .. _`What PyPy can do for your objects`: objspace-proxies.html
-.. _`Stackless and coroutines`: stackless.html
+.. _`Continulets and greenlets`: stackless.html
 .. _StdObjSpace: objspace.html#the-standard-object-space 
 .. _`abstract interpretation`: http://en.wikipedia.org/wiki/Abstract_interpretation
 .. _`rpython`: coding-guide.html#rpython 
@@ -337,7 +335,6 @@
 .. _`low-level type system`: rtyper.html#low-level-type
 .. _`object-oriented type system`: rtyper.html#oo-type
 .. _`garbage collector`: garbage_collection.html
-.. _`Stackless Transform`: translation.html#the-stackless-transform
 .. _`main PyPy-translation scripts`: getting-started-python.html#translating-the-pypy-python-interpreter
 .. _`.NET`: http://www.microsoft.com/net/
 .. _Mono: http://www.mono-project.com/
diff --git a/pypy/doc/rlib.rst b/pypy/doc/rlib.rst
--- a/pypy/doc/rlib.rst
+++ b/pypy/doc/rlib.rst
@@ -134,69 +134,6 @@
 a hierarchy of Address classes, in a typical static-OO-programming style.
 
 
-``rstack``
-==========
-
-The `pypy/rlib/rstack.py`_ module allows an RPython program to control its own execution stack.
-This is only useful if the program is translated using stackless. An old
-description of the exposed functions is below.
-
-We introduce an RPython type ``frame_stack_top`` and a built-in function
-``yield_current_frame_to_caller()`` that work as follows (see example below):
-
-* The built-in function ``yield_current_frame_to_caller()`` causes the current
-  function's state to be captured in a new ``frame_stack_top`` object that is
-  returned to the parent.  Only one frame, the current one, is captured this
-  way.  The current frame is suspended and the caller continues to run.  Note
-  that the caller is only resumed once: when
-  ``yield_current_frame_to_caller()`` is called.  See below.
-
-* A ``frame_stack_top`` object can be jumped to by calling its ``switch()``
-  method with no argument.
-
-* ``yield_current_frame_to_caller()`` and ``switch()`` themselves return a new
-  ``frame_stack_top`` object: the freshly captured state of the caller of the
-  source ``switch()`` that was just executed, or None in the case described
-  below.
-
-* the function that called ``yield_current_frame_to_caller()`` also has a
-  normal return statement, like all functions.  This statement must return
-  another ``frame_stack_top`` object.  The latter is *not* returned to the
-  original caller; there is no way to return several times to the caller.
-  Instead, it designates the place to which the execution must jump, as if by
-  a ``switch()``.  The place to which we jump this way will see a None as the
-  source frame stack top.
-
-* every frame stack top must be resumed once and only once.  Not resuming
-  it at all causes a leak.  Resuming it several times causes a crash.
-
-* a function that called ``yield_current_frame_to_caller()`` should not raise.
-  It would have no implicit parent frame to propagate the exception to.  That
-  would be a crashingly bad idea.
-
-The following example would print the numbers from 1 to 7 in order::
-
-    def g():
-        print 2
-        frametop_before_5 = yield_current_frame_to_caller()
-        print 4
-        frametop_before_7 = frametop_before_5.switch()
-        print 6
-        return frametop_before_7
-
-    def f():
-        print 1
-        frametop_before_4 = g()
-        print 3
-        frametop_before_6 = frametop_before_4.switch()
-        print 5
-        frametop_after_return = frametop_before_6.switch()
-        print 7
-        assert frametop_after_return is None
-
-    f()
-
-
 ``streamio``
 ============
 
diff --git a/pypy/doc/stackless.rst b/pypy/doc/stackless.rst
--- a/pypy/doc/stackless.rst
+++ b/pypy/doc/stackless.rst
@@ -8,446 +8,289 @@
 ================
 
 PyPy can expose to its user language features similar to the ones
-present in `Stackless Python`_: **no recursion depth limit**, and the
-ability to write code in a **massively concurrent style**.  It actually
-exposes three different paradigms to choose from:
+present in `Stackless Python`_: the ability to write code in a
+**massively concurrent style**.  (It does not (any more) offer the
+ability to run with no `recursion depth limit`_, but the same effect
+can be achieved indirectly.)
 
-* `Tasklets and channels`_;
+This feature is based on a custom primitive called a continulet_.
+Continulets can be directly used by application code, or it is possible
+to write (entirely at app-level) more user-friendly interfaces.
 
-* Greenlets_;
+Currently PyPy implements greenlets_ on top of continulets.  It would be
+easy to implement tasklets and channels as well, emulating the model
+of `Stackless Python`_.
 
-* Plain coroutines_.
+Continulets are extremely light-weight, which means that PyPy should be
+able to handle programs containing large amounts of them.  However, due
+to an implementation restriction, a PyPy compiled with
+``--gcrootfinder=shadowstack`` consumes at least one page of physical
+memory (4KB) per live continulet, and half a megabyte of virtual memory
+on 32-bit or a complete megabyte on 64-bit.  Moreover, the feature is
+only available (so far) on x86 and x86-64 CPUs; for other CPUs you need
+to add a short page of custom assembler to
+`pypy/translator/c/src/stacklet/`_.
 
-All of them are extremely light-weight, which means that PyPy should be
-able to handle programs containing large amounts of coroutines, tasklets
-and greenlets.
 
+Theory
+======
 
-Requirements
-++++++++++++++++
+The fundamental idea is that, at any point in time, the program happens
+to run one stack of frames (or one per thread, in case of
+multi-threading).  To see the stack, start at the top frame and follow
+the chain of ``f_back`` until you reach the bottom frame.  From the
+point of view of one of these frames, it has a ``f_back`` pointing to
+another frame (unless it is the bottom frame), and it is itself being
+pointed to by another frame (unless it is the top frame).
 
-If you are running py.py on top of CPython, then you need to enable
-the _stackless module by running it as follows::
+The theory behind continulets is to literally take the previous sentence
+as definition of "an O.K. situation".  The trick is that there are
+O.K. situations that are more complex than just one stack: you will
+always have one stack, but you can also have in addition one or more
+detached *cycles* of frames, such that by following the ``f_back`` chain
+you run in a circle.  But note that these cycles are indeed completely
+detached: the top frame (the currently running one) is always the one
+which is not the ``f_back`` of anybody else, and it is always the top of
+a stack that ends with the bottom frame, never a part of these extra
+cycles.
 
-    py.py --withmod-_stackless
+How do you create such cycles?  The fundamental operation to do so is to
+take two frames and *permute* their ``f_back`` --- i.e. exchange them.
+You can permute any two ``f_back`` without breaking the rule of "an O.K.
+situation".  Say for example that ``f`` is some frame halfway down the
+stack, and you permute its ``f_back`` with the ``f_back`` of the top
+frame.  Then you have removed from the normal stack all intermediate
+frames, and turned them into one stand-alone cycle.  By doing the same
+permutation again you restore the original situation.
 
-This is implemented internally using greenlets, so it only works on a
-platform where `greenlets`_ are supported.  A few features do
-not work this way, though, and really require a translated
-``pypy-c``.
+In practice, in PyPy, you cannot change the ``f_back`` of an arbitrary
+frame, but only of frames stored in ``continulets``.
 
-To obtain a translated version of ``pypy-c`` that includes Stackless
-support, run translate.py as follows::
-
-    cd pypy/translator/goal
-    python translate.py --stackless
+Continulets are internally implemented using stacklets.  Stacklets are a
+bit more primitive (they are really one-shot continuations), but that
+idea only works in C, not in Python.  The basic idea of continulets is
+to have at any point in time a complete valid stack; this is important
+e.g. to correctly propagate exceptions (and it seems to give meaningful
+tracebacks too).
 
 
 Application level interface
 =============================
 
-A stackless PyPy contains a module called ``stackless``.  The interface
-exposed by this module have not been refined much, so it should be
-considered in-flux (as of 2007).
 
-So far, PyPy does not provide support for ``stackless`` in a threaded
-environment.  This limitation is not fundamental, as previous experience
-has shown, so supporting this would probably be reasonably easy.
+.. _continulet:
 
-An interesting point is that the same ``stackless`` module can provide
-a number of different concurrency paradigms at the same time.  From a
-theoretical point of view, none of above-mentioned existing three
-paradigms considered on its own is new: two of them are from previous
-Python work, and the third one is a variant of the classical coroutine.
-The new part is that the PyPy implementation manages to provide all of
-them and let the user implement more.  Moreover - and this might be an
-important theoretical contribution of this work - we manage to provide
-these concurrency concepts in a "composable" way.  In other words, it
-is possible to naturally mix in a single application multiple
-concurrency paradigms, and multiple unrelated usages of the same
-paradigm.  This is discussed in the Composability_ section below.
+Continulets
++++++++++++
 
+A translated PyPy contains by default a module called ``_continuation``
+exporting the type ``continulet``.  A ``continulet`` object from this
+module is a container that stores a "one-shot continuation".  It plays
+the role of an extra frame you can insert in the stack, and whose
+``f_back`` can be changed.
 
-Infinite recursion
-++++++++++++++++++
+To make a continulet object, call ``continulet()`` with a callable and
+optional extra arguments.
 
-Any stackless PyPy executable natively supports recursion that is only
-limited by the available memory.  As in normal Python, though, there is
-an initial recursion limit (which is 5000 in all pypy-c's, and 1000 in
-CPython).  It can be changed with ``sys.setrecursionlimit()``.  With a
-stackless PyPy, any value is acceptable - use ``sys.maxint`` for
-unlimited.
+Later, the first time you ``switch()`` to the continulet, the callable
+is invoked with the same continulet object as the extra first argument.
+At that point, the one-shot continuation stored in the continulet points
+to the caller of ``switch()``.  In other words you have a perfectly
+normal-looking stack of frames.  But when ``switch()`` is called again,
+this stored one-shot continuation is exchanged with the current one; it
+means that the caller of ``switch()`` is suspended with its continuation
+stored in the container, and the old continuation from the continulet
+object is resumed.
 
-In some cases, you can write Python code that causes interpreter-level
-infinite recursion -- i.e. infinite recursion without going via
-application-level function calls.  It is possible to limit that too,
-with ``_stackless.set_stack_depth_limit()``, or to unlimit it completely
-by setting it to ``sys.maxint``.
+The most primitive API is actually 'permute()', which just permutes the
+one-shot continuations stored in two (or more) continulets.
 
+In more detail:
 
-Coroutines
-++++++++++
+* ``continulet(callable, *args, **kwds)``: make a new continulet.
+  Like a generator, this only creates it; the ``callable`` is only
+  actually called the first time it is switched to.  It will be
+  called as follows::
 
-A Coroutine is similar to a very small thread, with no preemptive scheduling.
-Within a family of coroutines, the flow of execution is explicitly
-transferred from one to another by the programmer.  When execution is
-transferred to a coroutine, it begins to execute some Python code.  When
-it transfers execution away from itself it is temporarily suspended, and
-when execution returns to it it resumes its execution from the
-point where it was suspended.  Conceptually, only one coroutine is
-actively running at any given time (but see Composability_ below).
+      callable(cont, *args, **kwds)
 
-The ``stackless.coroutine`` class is instantiated with no argument.
-It provides the following methods and attributes:
+  where ``cont`` is the same continulet object.
 
-* ``stackless.coroutine.getcurrent()``
+  Note that it is actually ``cont.__init__()`` that binds
+  the continulet.  It is also possible to create a not-bound-yet
+  continulet by calling explicitly ``continulet.__new__()``, and
+  only bind it later by calling explicitly ``cont.__init__()``.
 
-    Static method returning the currently running coroutine.  There is a
-    so-called "main" coroutine object that represents the "outer"
-    execution context, where your main program started and where it runs
-    as long as it does not switch to another coroutine.
+* ``cont.switch(value=None, to=None)``: start the continulet if
+  it was not started yet.  Otherwise, store the current continuation
+  in ``cont``, and activate the target continuation, which is the
+  one that was previously stored in ``cont``.  Note that the target
+  continuation was itself previously suspended by another call to
+  ``switch()``; this older ``switch()`` will now appear to return.
+  The ``value`` argument is any object that is carried to the target
+  and returned by the target's ``switch()``.
 
-* ``coro.bind(callable, *args, **kwds)``
+  If ``to`` is given, it must be another continulet object.  In
+  that case, performs a "double switch": it switches as described
+  above to ``cont``, and then immediately switches again to ``to``.
+  This is different from switching directly to ``to``: the current
+  continuation gets stored in ``cont``, the old continuation from
+  ``cont`` gets stored in ``to``, and only then we resume the
+  execution from the old continuation out of ``to``.
 
-    Bind the coroutine so that it will execute ``callable(*args,
-    **kwds)``.  The call is not performed immediately, but only the
-    first time we call the ``coro.switch()`` method.  A coroutine must
-    be bound before it is switched to.  When the coroutine finishes
-    (because the call to the callable returns), the coroutine exits and
-    implicitly switches back to another coroutine (its "parent"); after
-    this point, it is possible to bind it again and switch to it again.
-    (Which coroutine is the parent of which is not documented, as it is
-    likely to change when the interface is refined.)
+* ``cont.throw(type, value=None, tb=None, to=None)``: similar to
+  ``switch()``, except that immediately after the switch is done, raise
+  the given exception in the target.
 
-* ``coro.switch()``
+* ``cont.is_pending()``: return True if the continulet is pending.
+  This is False when it is not initialized (because we called
+  ``__new__`` and not ``__init__``) or when it is finished (because
+  the ``callable()`` returned).  When it is False, the continulet
+  object is empty and cannot be ``switch()``-ed to.
 
-    Suspend the current (caller) coroutine, and resume execution in the
-    target coroutine ``coro``.
+* ``permute(*continulets)``: a global function that permutes the
+  continuations stored in the given continulet arguments.  Mostly
+  theoretical.  In practice, using ``cont.switch()`` is easier and
+  more efficient than using ``permute()``; the latter does not on
+  its own change the currently running frame.
 
-* ``coro.kill()``
 
-    Kill ``coro`` by sending a CoroutineExit exception and switching
-    execution immediately to it. This exception can be caught in the 
-    coroutine itself and can be raised from any call to ``coro.switch()``. 
-    This exception isn't propagated to the parent coroutine.
+Genlets
++++++++
 
-* ``coro.throw(type, value)``
+The ``_continuation`` module also exposes the ``generator`` decorator::
 
-    Insert an exception in ``coro`` an resume switches execution
-    immediately to it. In the coroutine itself, this exception
-    will come from any call to ``coro.switch()`` and can be caught. If the
-    exception isn't caught, it will be propagated to the parent coroutine.
+    @generator
+    def f(cont, a, b):
+        cont.switch(a + b)
+        cont.switch(a + b + 1)
 
-When a coroutine is garbage-collected, it gets the ``.kill()`` method sent to
-it. This happens at the point the next ``.switch`` method is called, so the
-target coroutine of this call will be executed only after the ``.kill`` has
-finished.
+    for i in f(10, 20):
+        print i
 
-Example
-~~~~~~~
+This example prints 30 and 31.  The only advantage over using regular
+generators is that the generator itself is not limited to ``yield``
+statements that must all occur syntactically in the same function.
+Instead, we can pass around ``cont``, e.g. to nested sub-functions, and
+call ``cont.switch(x)`` from there.
 
-Here is a classical producer/consumer example: an algorithm computes a
-sequence of values, while another consumes them.  For our purposes we
-assume that the producer can generate several values at once, and the
-consumer can process up to 3 values in a batch - it can also process
-batches with fewer than 3 values without waiting for the producer (which
-would be messy to express with a classical Python generator). ::
+The ``generator`` decorator can also be applied to methods::
 
-    def producer(lst):
-        while True:
-            ...compute some more values...
-            lst.extend(new_values)
-            coro_consumer.switch()
-
-    def consumer(lst):
-        while True:
-            # First ask the producer for more values if needed
-            while len(lst) == 0:
-                coro_producer.switch()
-            # Process the available values in a batch, but at most 3
-            batch = lst[:3]
-            del lst[:3]
-            ...process batch...
-
-    # Initialize two coroutines with a shared list as argument
-    exchangelst = []
-    coro_producer = coroutine()
-    coro_producer.bind(producer, exchangelst)
-    coro_consumer = coroutine()
-    coro_consumer.bind(consumer, exchangelst)
-
-    # Start running the consumer coroutine
-    coro_consumer.switch()
-
-
-Tasklets and channels
-+++++++++++++++++++++
-
-The ``stackless`` module also provides an interface that is roughly
-compatible with the interface of the ``stackless`` module in `Stackless
-Python`_: it contains ``stackless.tasklet`` and ``stackless.channel``
-classes.  Tasklets are also similar to microthreads, but (like coroutines)
-they don't actually run in parallel with other microthreads; instead,
-they synchronize and exchange data with each other over Channels, and
-these exchanges determine which Tasklet runs next.
-
-For usage reference, see the documentation on the `Stackless Python`_
-website.
-
-Note that Tasklets and Channels are implemented at application-level in
-`lib_pypy/stackless.py`_ on top of coroutines_.  You can refer to this
-module for more details and API documentation.
-
-The stackless.py code tries to resemble the stackless C code as much
-as possible. This makes the code somewhat unpythonic.
-
-Bird's eye view of tasklets and channels
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Tasklets are a bit like threads: they encapsulate a function in such a way that
-they can be suspended/restarted any time. Unlike threads, they won't
-run concurrently, but must be cooperative. When using stackless
-features, it is vitally important that no action is performed that blocks
-everything else.  In particular, blocking input/output should be centralized
-to a single tasklet.
-
-Communication between tasklets is done via channels. 
-There are three ways for a tasklet to give up control:
-
-1. call ``stackless.schedule()``
-2. send something over a channel
-3. receive something from a channel
-
-A (live) tasklet can either be running, waiting to get scheduled, or be
-blocked by a channel.
-
-Scheduling is done in strictly round-robin manner. A blocked tasklet
-is removed from the scheduling queue and will be reinserted when it
-becomes unblocked.
-
-Example
-~~~~~~~
-
-Here is a many-producers many-consumers example, where any consumer can
-process the result of any producer.  For this situation we set up a
-single channel where all producer send, and on which all consumers
-wait::
-
-    def producer(chan):
-        while True:
-            chan.send(...next value...)
-
-    def consumer(chan):
-        while True:
-            x = chan.receive()
-            ...do something with x...
-
-    # Set up the N producer and M consumer tasklets
-    common_channel = stackless.channel()
-    for i in range(N):
-        stackless.tasklet(producer, common_channel)()
-    for i in range(M):
-        stackless.tasklet(consumer, common_channel)()
-
-    # Run it all
-    stackless.run()
-
-Each item sent over the channel is received by one of the waiting
-consumers; which one is not specified.  The producers block until their
-item is consumed: the channel is not a queue, but rather a meeting point
-which causes tasklets to block until both a consumer and a producer are
-ready.  In practice, the reason for having several consumers receiving
-on a single channel is that some of the consumers can be busy in other
-ways part of the time.  For example, each consumer might receive a
-database request, process it, and send the result to a further channel
-before it asks for the next request.  In this situation, further
-requests can still be received by other consumers.
+    class X:
+        @generator
+        def f(self, cont, a, b):
+            ...
 
 
 Greenlets
 +++++++++
 
-A Greenlet is a kind of primitive Tasklet with a lower-level interface
-and with exact control over the execution order.  Greenlets are similar
-to Coroutines, with a slightly different interface: greenlets put more
-emphasis on a tree structure.  The various greenlets of a program form a
-precise tree, which fully determines their order of execution.
+Greenlets are implemented on top of continulets in `lib_pypy/greenlet.py`_.
+See the official `documentation of the greenlets`_.
 
-For usage reference, see the `documentation of the greenlets`_.
-The PyPy interface is identical.  You should use ``greenlet.greenlet``
-instead of ``stackless.greenlet`` directly, because the greenlet library
-can give you the latter when you ask for the former on top of PyPy.
+Note that unlike the CPython greenlets, this version does not suffer
+from GC issues: if the program "forgets" an unfinished greenlet, it will
+always be collected at the next garbage collection.
 
-PyPy's greenlets do not suffer from the cyclic GC limitation that the
-CPython greenlets have: greenlets referencing each other via local
-variables tend to leak on top of CPython (where it is mostly impossible
-to do the right thing).  It works correctly on top of PyPy.
 
+Unimplemented features
+++++++++++++++++++++++
 
-Coroutine Pickling
-++++++++++++++++++
+The following features (present in some past Stackless version of PyPy)
+are for the time being not supported any more:
 
-Coroutines and tasklets can be pickled and unpickled, i.e. serialized to
-a string of bytes for the purpose of storage or transmission.  This
-allows "live" coroutines or tasklets to be made persistent, moved to
-other machines, or cloned in any way.  The standard ``pickle`` module
-works with coroutines and tasklets (at least in a translated ``pypy-c``;
-unpickling live coroutines or tasklets cannot be easily implemented on
-top of CPython).
+* Tasklets and channels (needs to be rewritten at app-level)
 
-To be able to achieve this result, we have to consider many objects that
-are not normally pickleable in CPython.  Here again, the `Stackless
-Python`_ implementation has paved the way, and we follow the same
-general design decisions: simple internal objects like bound method
-objects and various kinds of iterators are supported; frame objects can
-be fully pickled and unpickled
-(by serializing a reference to the bytecode they are
-running in addition to all the local variables).  References to globals
-and modules are pickled by name, similarly to references to functions
-and classes in the traditional CPython ``pickle``.
+* Coroutines (could be rewritten at app-level)
 
-The "magic" part of this process is the implementation of the unpickling
-of a chain of frames.  The Python interpreter of PyPy uses
-interpreter-level recursion to represent application-level calls.  The
-reason for this is that it tremendously simplifies the implementation of
-the interpreter itself.  Indeed, in Python, almost any operation can
-potentially result in a non-tail-recursive call to another Python
-function.  This makes writing a non-recursive interpreter extremely
-tedious; instead, we rely on lower-level transformations during the
-translation process to control this recursion.  This is the `Stackless
-Transform`_, which is at the heart of PyPy's support for stackless-style
-concurrency.
+* Pickling and unpickling continulets (*)
 
-At any point in time, a chain of Python-level frames corresponds to a
-chain of interpreter-level frames (e.g. C frames in pypy-c), where each
-single Python-level frame corresponds to one or a few interpreter-level
-frames - depending on the length of the interpreter-level call chain
-from one bytecode evaluation loop to the next (recursively invoked) one.
+* Continuing execution of a continulet in a different thread (*)
 
-This means that it is not sufficient to simply create a chain of Python
-frame objects in the heap of a process before we can resume execution of
-these newly built frames.  We must recreate a corresponding chain of
-interpreter-level frames.  To this end, we have inserted a few *named
-resume points* (see 3.2.4, in `D07.1 Massive Parallelism and Translation Aspects`_) in the Python interpreter of PyPy.  This is the
-motivation for implementing the interpreter-level primitives
-``resume_state_create()`` and ``resume_state_invoke()``, the powerful
-interface that allows an RPython program to artificially rebuild a chain
-of calls in a reflective way, completely from scratch, and jump to it.
+* Automatic unlimited stack (must be emulated__ so far)
 
-.. _`D07.1 Massive Parallelism and Translation Aspects`: http://codespeak.net/pypy/extradoc/eu-report/D07.1_Massive_Parallelism_and_Translation_Aspects-2007-02-28.pdf
+.. __: `recursion depth limit`_
 
-Example
-~~~~~~~
+(*) Pickling, as well as changing threads, could be implemented by using
+a "soft" stack switching mode again.  We would get either "hard" or
+"soft" switches, similarly to Stackless Python 3rd version: you get a
+"hard" switch (like now) when the C stack contains non-trivial C frames
+to save, and a "soft" switch (like previously) when it contains only
+simple calls from Python to Python.  Soft-switched continulets would
+also consume a bit less RAM, at the possible expense of making the
+switch a bit slower (unsure about that; what is the Stackless Python
+experience?).
 
-(See `demo/pickle_coroutine.py`_ for the complete source of this demo.)
 
-Consider a program which contains a part performing a long-running
-computation::
+Recursion depth limit
++++++++++++++++++++++
 
-    def ackermann(x, y):
-        if x == 0:
-            return y + 1
-        if y == 0:
-            return ackermann(x - 1, 1)
-        return ackermann(x - 1, ackermann(x, y - 1))
+You can use continulets to emulate the infinite recursion depth present
+in Stackless Python and in stackless-enabled older versions of PyPy.
 
-By using pickling, we can save the state of the computation while it is
-running, for the purpose of restoring it later and continuing the
-computation at another time or on a different machine.  However,
-pickling does not produce a whole-program dump: it can only pickle
-individual coroutines.  This means that the computation should be
-started in its own coroutine::
+The trick is to start a continulet "early", i.e. when the recursion
+depth is very low, and switch to it "later", i.e. when the recursion
+depth is high.  Example::
 
-    # Make a coroutine that will run 'ackermann(3, 8)'
-    coro = coroutine()
-    coro.bind(ackermann, 3, 8)
+    from _continuation import continulet
 
-    # Now start running the coroutine
-    result = coro.switch()
+    def invoke(_, callable, arg):
+        return callable(arg)
 
-The coroutine itself must switch back to the main program when it needs
-to be interrupted (we can only pickle suspended coroutines).  Due to
-current limitations this requires an explicit check in the
-``ackermann()`` function::
+    def bootstrap(c):
+        # this loop runs forever, at a very low recursion depth
+        callable, arg = c.switch()
+        while True:
+            # start a new continulet from here, and switch to
+            # it using an "exchange", i.e. a switch with to=.
+            to = continulet(invoke, callable, arg)
+            callable, arg = c.switch(to=to)
 
-    def ackermann(x, y):
-        if interrupt_flag:      # test a global flag
-            main.switch()       # and switch back to 'main' if it is set
-        if x == 0:
-            return y + 1
-        if y == 0:
-            return ackermann(x - 1, 1)
-        return ackermann(x - 1, ackermann(x, y - 1))
+    c = continulet(bootstrap)
+    c.switch()
 
-The global ``interrupt_flag`` would be set for example by a timeout, or
-by a signal handler reacting to Ctrl-C, etc.  It causes the coroutine to
-transfer control back to the main program.  The execution comes back
-just after the line ``coro.switch()``, where we can pickle the coroutine
-if necessary::
 
-    if not coro.is_alive:
-        print "finished; the result is:", result
-    else:
-        # save the state of the suspended coroutine
-        f = open('demo.pickle', 'w')
-        pickle.dump(coro, f)
-        f.close()
+    def recursive(n):
+        if n == 0:
+            return ("ok", n)
+        if n % 200 == 0:
+            prev = c.switch((recursive, n - 1))
+        else:
+            prev = recursive(n - 1)
+        return (prev[0], prev[1] + 1)
 
-The process can then stop.  At any later time, or on another machine,
-we can reload the file and restart the coroutine with::
+    print recursive(999999)     # prints ('ok', 999999)
 
-    f = open('demo.pickle', 'r')
-    coro = pickle.load(f)
-    f.close()
-    result = coro.switch()
+Note that if you press Ctrl-C while running this example, the traceback
+will be built with *all* recursive() calls so far, even if this is more
+than the number that can possibly fit in the C stack.  These frames are
+"overlapping" each other in the sense of the C stack; more precisely,
+they are copied out of and into the C stack as needed.
 
-Limitations
-~~~~~~~~~~~
+(The example above also makes use of the following general "guideline"
+to help newcomers write continulets: in ``bootstrap(c)``, only call
+methods on ``c``, not on another continulet object.  That's why we wrote
+``c.switch(to=to)`` and not ``to.switch()``, which would mess up the
+state.  This is however just a guideline; in general we would recommend
+using other interfaces like genlets and greenlets.)
 
-Coroutine pickling is subject to some limitations.  First of all, it is
-not a whole-program "memory dump".  It means that only the "local" state
-of a coroutine is saved.  The local state is defined to include the
-chain of calls and the local variables, but not for example the value of
-any global variable.
 
-As in normal Python, the pickle will not include any function object's
-code, any class definition, etc., but only references to functions and
-classes.  Unlike normal Python, the pickle contains frames.  A pickled
-frame stores a bytecode index, representing the current execution
-position.  This means that the user program cannot be modified *at all*
-between pickling and unpickling!
-
-On the other hand, the pickled data is fairly independent from the
-platform and from the PyPy version.
-
-Pickling/unpickling fails if the coroutine is suspended in a state that
-involves Python frames which were *indirectly* called.  To define this
-more precisely, a Python function can issue a regular function or method
-call to invoke another Python function - this is a *direct* call and can
-be pickled and unpickled.  But there are many ways to invoke a Python
-function indirectly.  For example, most operators can invoke a special
-method ``__xyz__()`` on a class, various built-in functions can call
-back Python functions, signals can invoke signal handlers, and so on.
-These cases are not supported yet.
-
-
-Composability
-+++++++++++++
+Theory of composability
++++++++++++++++++++++++
 
 Although the concept of coroutines is far from new, they have not been
 generally integrated into mainstream languages, or only in limited form
 (like generators in Python and iterators in C#).  We can argue that a
 possible reason for that is that they do not scale well when a program's
 complexity increases: they look attractive in small examples, but the
-models that require explicit switching, by naming the target coroutine,
-do not compose naturally.  This means that a program that uses
-coroutines for two unrelated purposes may run into conflicts caused by
-unexpected interactions.
+models that require explicit switching, for example by naming the target
+coroutine, do not compose naturally.  This means that a program that
+uses coroutines for two unrelated purposes may run into conflicts caused
+by unexpected interactions.
 
 To illustrate the problem, consider the following example (simplified
-code; see the full source in
-`pypy/module/_stackless/test/test_composable_coroutine.py`_).  First, a
-simple usage of coroutine::
+code using a theoretical ``coroutine`` class).  First, a simple usage of
+coroutine::
 
     main_coro = coroutine.getcurrent()    # the main (outer) coroutine
     data = []
@@ -530,74 +373,35 @@
 main coroutine, which confuses the ``generator_iterator.next()`` method
 (it gets resumed, but not as a result of a call to ``Yield()``).
 
-As part of trying to combine multiple different paradigms into a single
-application-level module, we have built a way to solve this problem.
-The idea is to avoid the notion of a single, global "main" coroutine (or
-a single main greenlet, or a single main tasklet).  Instead, each
-conceptually separated user of one of these concurrency interfaces can
-create its own "view" on what the main coroutine/greenlet/tasklet is,
-which other coroutine/greenlet/tasklets there are, and which of these is
-the currently running one.  Each "view" is orthogonal to the others.  In
-particular, each view has one (and exactly one) "current"
-coroutine/greenlet/tasklet at any point in time.  When the user switches
-to a coroutine/greenlet/tasklet, it implicitly means that he wants to
-switch away from the current coroutine/greenlet/tasklet *that belongs to
-the same view as the target*.
+Thus the notion of coroutine is *not composable*.  By contrast, the
+primitive notion of continulets is composable: if you build two
+different interfaces on top of it, or have a program that uses twice the
+same interface in two parts, then assuming that both parts independently
+work, the composition of the two parts still works.
 
-The precise application-level interface has not been fixed yet; so far,
-"views" in the above sense are objects of the type
-``stackless.usercostate``.  The above two examples can be rewritten in
-the following way::
+A full proof of that claim would require careful definitions, but let us
+just argue that it follows from this observation: the API of continulets
+is such that doing a ``switch()`` requires the program to hold a specific
+continulet object to operate on.  The switch exchanges the current
+continuation with the continuation stored in that continulet, but has no
+effect outside of it.  So if a part of a program has a continulet object
+and does not expose it as a global, then the rest of the program cannot
+accidentally influence the continuation stored in that continulet object.
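
For instance, here is a minimal sketch of that property (assuming only a
``continulet(callable)`` constructor that runs ``callable(cont)`` on the
first switch, and a ``switch()`` that exchanges continuations as just
described): two unrelated parts of a program each keep their continulet
private, and neither can disturb the continuation stored in the other's
object::

    from _continuation import continulet

    def counter(c):
        # Runs inside its own continulet; every switch() goes back to
        # whichever frame switched us in.
        for i in range(3):
            c.switch(i)
        return -1

    c1 = continulet(counter)    # kept private by one part of the program
    c2 = continulet(counter)    # kept private by another, unrelated part

    assert c1.switch() == 0     # starts counter() inside c1
    assert c2.switch() == 0     # starts an independent counter() inside c2
    assert c1.switch() == 1     # resumes c1 only; c2 is left untouched
    assert c2.switch() == 1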
 
-    producer_view = stackless.usercostate()   # a local view
-    main_coro = producer_view.getcurrent()    # the main (outer) coroutine
-    ...
-    producer_coro = producer_view.newcoroutine()
-    ...
-
-and::
-
-    generators_view = stackless.usercostate()
-
-    def generator(f):
-        def wrappedfunc(*args, **kwds):
-            g = generators_view.newcoroutine(generator_iterator)
-            ...
-
-            ...generators_view.getcurrent()...
-
-Then the composition ``grab_values()`` works as expected, because the
-two views are independent.  The coroutine captured as ``self.caller`` in
-the ``generator_iterator.next()`` method is the main coroutine of the
-``generators_view``.  It is no longer the same object as the main
-coroutine of the ``producer_view``, so when ``data_producer()`` issues
-the following command::
-
-    main_coro.switch()
-
-the control flow cannot accidentally jump back to
-``generator_iterator.next()``.  In other words, from the point of view
-of ``producer_view``, the function ``grab_next_value()`` always runs in
-its main coroutine ``main_coro`` and the function ``data_producer`` in
-its coroutine ``producer_coro``.  This is the case independently of
-which ``generators_view``-based coroutine is the current one when
-``grab_next_value()`` is called.
-
-Only code that has explicit access to the ``producer_view`` or its
-coroutine objects can perform switches that are relevant for the
-generator code.  If the view object and the coroutine objects that share
-this view are all properly encapsulated inside the generator logic, no
-external code can accidentally temper with the expected control flow any
-longer.
-
-In conclusion: we will probably change the app-level interface of PyPy's
-stackless module in the future to not expose coroutines and greenlets at
-all, but only views.  They are not much more difficult to use, and they
-scale automatically to larger programs.
+In other words, if we regard the continulet object as being essentially
+a modifiable ``f_back``, then it is just a link between the frame of
+``callable()`` and the parent frame --- and it cannot be arbitrarily
+changed by unrelated code, as long as that code does not explicitly
+manipulate the continulet object.  Typically, both the frame of
+``callable()`` (commonly a local function) and its parent frame (the frame
+that switched to it) belong to the same class or module; so from that
+point of view the continulet is a purely local link between two local
+frames.  It doesn't make sense to have a concept that allows this link
+to be manipulated from outside.
 
 
 .. _`Stackless Python`: http://www.stackless.com
 .. _`documentation of the greenlets`: http://packages.python.org/greenlet/
-.. _`Stackless Transform`: translation.html#the-stackless-transform
 
 .. include:: _ref.txt
diff --git a/pypy/doc/translation.rst b/pypy/doc/translation.rst
--- a/pypy/doc/translation.rst
+++ b/pypy/doc/translation.rst
@@ -552,14 +552,15 @@
 
 The stackless transform converts functions into a form that knows how
 to save the execution point and active variables into a heap structure
-and resume execution at that point.  This is used to implement
+and resume execution at that point.  This was used to implement
 coroutines as an RPython-level feature, which in turn were used to
-implement `coroutines, greenlets and tasklets`_ as an application
+implement coroutines, greenlets and tasklets as an application
 level feature for the Standard Interpreter.
 
-Enable the stackless transformation with :config:`translation.stackless`.
+The stackless transformation has been deprecated and is no longer
+available in trunk.  It has been replaced with continulets_.
 
-.. _`coroutines, greenlets and tasklets`: stackless.html
+.. _continulets: stackless.html
 
 .. _`preparing the graphs for source generation`:
 
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -704,7 +704,7 @@
 class TestPassThroughArguments_CALL_METHOD(TestPassThroughArguments):
 
     def setup_class(cls):
-        space = gettestobjspace(usemodules=('_stackless',), **{
+        space = gettestobjspace(usemodules=('itertools',), **{
             "objspace.opcodes.CALL_METHOD": True
             })
         cls.space = space
diff --git a/pypy/jit/backend/llgraph/llimpl.py b/pypy/jit/backend/llgraph/llimpl.py
--- a/pypy/jit/backend/llgraph/llimpl.py
+++ b/pypy/jit/backend/llgraph/llimpl.py
@@ -57,6 +57,12 @@
     else:
         return LLSupport.from_rstr(s)
 
+FLOAT_ARRAY_TP = lltype.Ptr(lltype.Array(lltype.Float, hints={"nolength": True}))
+def maybe_uncast(TP, array):
+    if array._TYPE.TO._hints.get("uncast_on_llgraph"):
+        array = rffi.cast(TP, array)
+    return array
+
 # a list of argtypes of all operations - couldn't find any and it's
 # very useful.  Note however that the table is half-broken here and
 # there, in ways that are sometimes a bit hard to fix; that's why
@@ -1079,7 +1085,7 @@
     if isinstance(TYPE, lltype.Ptr):
         if isinstance(x, (int, long, llmemory.AddressAsInt)):
             x = llmemory.cast_int_to_adr(x)
-        if TYPE is rffi.VOIDP:
+        if TYPE is rffi.VOIDP or TYPE.TO._hints.get("uncast_on_llgraph"):
             # assume that we want a "C-style" cast, without typechecking the value
             return rffi.cast(TYPE, x)
         return llmemory.cast_adr_to_ptr(x, TYPE)
@@ -1329,8 +1335,8 @@
     return cast_to_floatstorage(array.getitem(index))
 
 def do_getarrayitem_raw_float(array, index):
-    array = array.adr.ptr._obj
-    return cast_to_floatstorage(array.getitem(index))
+    array = maybe_uncast(FLOAT_ARRAY_TP, array.adr.ptr)
+    return cast_to_floatstorage(array._obj.getitem(index))
 
 def do_getarrayitem_gc_ptr(array, index):
     array = array._obj.container
@@ -1392,8 +1398,9 @@
     newvalue = cast_from_floatstorage(ITEMTYPE, newvalue)
     array.setitem(index, newvalue)
 
+
 def do_setarrayitem_raw_float(array, index, newvalue):
-    array = array.adr.ptr
+    array = maybe_uncast(FLOAT_ARRAY_TP, array.adr.ptr)
     ITEMTYPE = lltype.typeOf(array).TO.OF
     newvalue = cast_from_floatstorage(ITEMTYPE, newvalue)
     array._obj.setitem(index, newvalue)
diff --git a/pypy/jit/backend/llgraph/runner.py b/pypy/jit/backend/llgraph/runner.py
--- a/pypy/jit/backend/llgraph/runner.py
+++ b/pypy/jit/backend/llgraph/runner.py
@@ -312,7 +312,7 @@
         token = history.getkind(getattr(S, fieldname))
         return self.getdescr(ofs, token[0], name=fieldname)
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         arg_types = []
         for ARG in ARGS:
             token = history.getkind(ARG)
@@ -326,7 +326,7 @@
         return self.getdescr(0, token[0], extrainfo=extrainfo,
                              arg_types=''.join(arg_types))
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo):
         from pypy.jit.backend.llsupport.ffisupport import get_ffi_type_kind
         from pypy.jit.backend.llsupport.ffisupport import UnsupportedKind
         arg_types = []
@@ -522,7 +522,7 @@
         return FieldDescr.new(T1, fieldname)
 
     @staticmethod
-    def calldescrof(FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(FUNC, ARGS, RESULT, extrainfo):
         return StaticMethDescr.new(FUNC, ARGS, RESULT, extrainfo)
 
     @staticmethod
diff --git a/pypy/jit/backend/llsupport/gc.py b/pypy/jit/backend/llsupport/gc.py
--- a/pypy/jit/backend/llsupport/gc.py
+++ b/pypy/jit/backend/llsupport/gc.py
@@ -366,36 +366,92 @@
 
     def add_jit2gc_hooks(self, jit2gc):
         #
-        def collect_jit_stack_root(callback, gc, addr):
-            if addr.signed[0] != GcRootMap_shadowstack.MARKER:
-                # common case
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                return WORD
-            else:
-                # case of a MARKER followed by an assembler stack frame
-                follow_stack_frame_of_assembler(callback, gc, addr)
-                return 2 * WORD
+        # ---------------
+        # This is used to enumerate the shadowstack in the presence
+        # of the JIT.  It is also used by the stacklet support in
+        # rlib/_stacklet_shadowstack.  That's why it is written as
+        # an iterator that can also be used with a custom_trace.
         #
-        def follow_stack_frame_of_assembler(callback, gc, addr):
-            frame_addr = addr.signed[1]
-            addr = llmemory.cast_int_to_adr(frame_addr + self.force_index_ofs)
-            force_index = addr.signed[0]
-            if force_index < 0:
-                force_index = ~force_index
-            callshape = self._callshapes[force_index]
-            n = 0
-            while True:
-                offset = rffi.cast(lltype.Signed, callshape[n])
-                if offset == 0:
-                    break
-                addr = llmemory.cast_int_to_adr(frame_addr + offset)
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                n += 1
+        class RootIterator:
+            _alloc_flavor_ = "raw"
+
+            def next(iself, gc, next, range_highest):
+                # Return the "next" valid GC object's address.  This usually
+                # means just returning "next", until we reach "range_highest",
+                # except that we are skipping NULLs.  If "next" contains a
+                # MARKER instead, then we go into JIT-frame-lookup mode.
+                #
+                while True:
+                    #
+                    # If we are not iterating right now in a JIT frame
+                    if iself.frame_addr == 0:
+                        #
+                        # Look for the next shadowstack address that
+                        # contains a valid pointer
+                        while next != range_highest:
+                            if next.signed[0] == self.MARKER:
+                                break
+                            if gc.points_to_valid_gc_object(next):
+                                return next
+                            next += llmemory.sizeof(llmemory.Address)
+                        else:
+                            return llmemory.NULL     # done
+                        #
+                        # It's a JIT frame.  Save away 'next' for later, and
+                        # go into JIT-frame-exploring mode.
+                        next += llmemory.sizeof(llmemory.Address)
+                        frame_addr = next.signed[0]
+                        iself.saved_next = next
+                        iself.frame_addr = frame_addr
+                        addr = llmemory.cast_int_to_adr(frame_addr +
+                                                        self.force_index_ofs)
+                        addr = iself.translateptr(iself.context, addr)
+                        force_index = addr.signed[0]
+                        if force_index < 0:
+                            force_index = ~force_index
+                        # NB: the next line reads a still-alive _callshapes,
+                        # because we ensure that just before we called this
+                        # piece of assembler, we put on the (same) stack a
+                        # pointer to a loop_token that keeps the force_index
+                        # alive.
+                        callshape = self._callshapes[force_index]
+                    else:
+                        # Continuing to explore this JIT frame
+                        callshape = iself.callshape
+                    #
+                    # 'callshape' points to the next INT of the callshape.
+                    # If it's zero we are done with the JIT frame.
+                    while rffi.cast(lltype.Signed, callshape[0]) != 0:
+                        #
+                        # Non-zero: it's an offset inside the JIT frame.
+                        # Read it and increment 'callshape'.
+                        offset = rffi.cast(lltype.Signed, callshape[0])
+                        callshape = lltype.direct_ptradd(callshape, 1)
+                        addr = llmemory.cast_int_to_adr(iself.frame_addr +
+                                                        offset)
+                        addr = iself.translateptr(iself.context, addr)
+                        if gc.points_to_valid_gc_object(addr):
+                            #
+                            # The JIT frame contains a valid GC pointer at
+                            # this address (as opposed to NULL).  Save
+                            # 'callshape' for the next call, and return the
+                            # address.
+                            iself.callshape = callshape
+                            return addr
+                    #
+                    # Restore 'next' and loop back to the start.
+                    iself.frame_addr = 0
+                    next = iself.saved_next
+                    next += llmemory.sizeof(llmemory.Address)
+
+        # ---------------
         #
+        root_iterator = RootIterator()
+        root_iterator.frame_addr = 0
+        root_iterator.context = llmemory.NULL
+        root_iterator.translateptr = lambda context, addr: addr
         jit2gc.update({
-            'rootstackhook': collect_jit_stack_root,
+            'root_iterator': root_iterator,
             })
 
     def initialize(self):
@@ -550,7 +606,7 @@
             has_finalizer = bool(tid & (1<<llgroup.HALFSHIFT))
             check_typeid(type_id)
             res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                                  type_id, size, True,
+                                                  type_id, size,
                                                   has_finalizer, False)
             # In case the operation above failed, we are returning NULL
             # from this function to assembler.  There is also an RPython
@@ -575,7 +631,7 @@
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 type_id, num_elem, self.array_basesize, itemsize,
-                self.array_length_ofs, True)
+                self.array_length_ofs)
         self.malloc_array = malloc_array
         self.GC_MALLOC_ARRAY = lltype.Ptr(lltype.FuncType(
             [lltype.Signed] * 3, llmemory.GCREF))
@@ -591,12 +647,12 @@
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 str_type_id, length, str_basesize, str_itemsize,
-                str_ofs_length, True)
+                str_ofs_length)
         def malloc_unicode(length):
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 unicode_type_id, length, unicode_basesize,unicode_itemsize,
-                unicode_ofs_length, True)
+                unicode_ofs_length)
         self.malloc_str = malloc_str
         self.malloc_unicode = malloc_unicode
         self.GC_MALLOC_STR_UNICODE = lltype.Ptr(lltype.FuncType(
@@ -622,7 +678,7 @@
             # also use it to allocate varsized objects.  The tid
             # and possibly the length are both set afterward.
             gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                        0, size, True, False, False)
+                                        0, size, False, False)
             return rffi.cast(lltype.Signed, gcref)
         self.malloc_slowpath = malloc_slowpath
         self.MALLOC_SLOWPATH = lltype.FuncType([lltype.Signed], lltype.Signed)
diff --git a/pypy/jit/backend/llsupport/llmodel.py b/pypy/jit/backend/llsupport/llmodel.py
--- a/pypy/jit/backend/llsupport/llmodel.py
+++ b/pypy/jit/backend/llsupport/llmodel.py
@@ -254,10 +254,10 @@
         return ofs, size, sign
     unpack_arraydescr_size._always_inline_ = True
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         return get_call_descr(self.gc_ll_descr, ARGS, RESULT, extrainfo)
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo):
         from pypy.jit.backend.llsupport import ffisupport
         return ffisupport.get_call_descr_dynamic(self, ffi_args, ffi_result,
                                                  extrainfo)
diff --git a/pypy/jit/backend/llsupport/test/test_gc.py b/pypy/jit/backend/llsupport/test/test_gc.py
--- a/pypy/jit/backend/llsupport/test/test_gc.py
+++ b/pypy/jit/backend/llsupport/test/test_gc.py
@@ -246,9 +246,8 @@
     def __init__(self):
         self.record = []
 
-    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size, can_collect,
+    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size,
                                   has_finalizer, contains_weakptr):
-        assert can_collect
         assert not contains_weakptr
         p = llmemory.raw_malloc(size)
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
@@ -258,8 +257,7 @@
         return p
 
     def do_malloc_varsize_clear(self, RESTYPE, type_id, length, size,
-                                itemsize, offset_to_length, can_collect):
-        assert can_collect
+                                itemsize, offset_to_length):
         p = llmemory.raw_malloc(size + itemsize * length)
         (p + offset_to_length).signed[0] = length
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
diff --git a/pypy/jit/backend/test/calling_convention_test.py b/pypy/jit/backend/test/calling_convention_test.py
--- a/pypy/jit/backend/test/calling_convention_test.py
+++ b/pypy/jit/backend/test/calling_convention_test.py
@@ -8,6 +8,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -96,7 +97,8 @@
             FUNC = self.FuncType(funcargs, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             ops = '[%s]\n' % arguments
@@ -148,7 +150,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -190,7 +193,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -268,7 +272,8 @@
                 else:
                     ARGS.append(lltype.Signed)
             FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+                EffectInfo.MOST_GENERAL)
             ops = '''
             [%s]
             f99 = call_assembler(%s, descr=called_looptoken)
@@ -337,7 +342,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -9,6 +9,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -445,7 +446,8 @@
             return chr(ord(c) + 1)
         FPTR = self.Ptr(self.FuncType([lltype.Char], lltype.Char))
         func_ptr = llhelper(FPTR, func)
-        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char)
+        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char,
+                                    EffectInfo.MOST_GENERAL)
         x = cpu.bh_call_i(self.get_funcbox(cpu, func_ptr).value,
                           calldescr, [ord('A')], None, None)
         assert x == ord('B')
@@ -458,7 +460,8 @@
                                           lltype.Float))
             func_ptr = llhelper(FPTR, func)
             FTP = deref(FPTR)
-            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT)
+            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             x = cpu.bh_call_f(self.get_funcbox(cpu, func_ptr).value,
                               calldescr,
                               [42], None, [longlong.getfloatstorage(3.5)])
@@ -486,13 +489,15 @@
             FUNC = deref(FPTR)
             funcbox = self.get_funcbox(cpu, func_ptr)
             # first, try it with the "normal" calldescr
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=calldescr)
             assert res.value == 2 * num
             # then, try it with the dynamic calldescr
-            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type)
+            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type,
+                                                    EffectInfo.MOST_GENERAL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=dyn_calldescr)
@@ -507,7 +512,8 @@
             FUNC = self.FuncType([F] * 7 + [I] * 2 + [F] * 3, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = ([boxfloat(.1) for i in range(7)] +
                     [BoxInt(1), BoxInt(2), boxfloat(.2), boxfloat(.3),
@@ -529,7 +535,8 @@
 
         FUNC = self.FuncType([lltype.Signed]*16, lltype.Signed)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         args = range(16)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
@@ -552,7 +559,8 @@
             FPTR = self.Ptr(self.FuncType([TP] * nb_args, TP))
             func_ptr = llhelper(FPTR, func_ints)
             FUNC = deref(FPTR)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = [280-24*i for i in range(nb_args)]
             res = self.execute_operation(rop.CALL,
@@ -566,7 +574,8 @@
 
         FUNC = self.FuncType([lltype.Float, lltype.Float], lltype.Float)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
         res = self.execute_operation(rop.CALL, [funcbox, constfloat(1.5),
@@ -1589,7 +1598,8 @@
         '''
         FPTR = lltype.Ptr(lltype.FuncType([lltype.Signed], lltype.Void))
         fptr = llhelper(FPTR, func)
-        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                         EffectInfo.MOST_GENERAL)
 
         xtp = lltype.malloc(rclass.OBJECT_VTABLE, immortal=True)
         xtp.subclassrange_min = 1
@@ -1807,7 +1817,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Void)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1850,7 +1861,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Signed)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1895,7 +1907,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Float)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1941,7 +1954,8 @@
         cpu = self.cpu
         func_adr = llmemory.cast_ptr_to_adr(c_tolower.funcsym)
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
-        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint)
+        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint,
+                                            EffectInfo.MOST_GENERAL)
         i1 = BoxInt()
         i2 = BoxInt()
         tok = BoxInt()
@@ -1997,7 +2011,8 @@
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
         calldescr = cpu.calldescrof_dynamic([types.pointer, types_size_t,
                                              types_size_t, types.pointer],
-                                            types.void)
+                                            types.void,
+                                            EffectInfo.MOST_GENERAL)
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
@@ -2292,7 +2307,8 @@
         ARGS = [lltype.Signed] * 10
         RES = lltype.Signed
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         for i in range(10):
             self.cpu.set_future_value_int(i, i+1)
         res = self.cpu.execute_token(looptoken)
@@ -2332,7 +2348,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2422,7 +2439,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2634,7 +2652,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
                                    calldescr, [value], None, None)
             assert x == expected, (
@@ -2667,7 +2686,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(self.cpu, f)
             res = self.execute_operation(rop.CALL, [funcbox, BoxInt(value)],
                                          'int', descr=calldescr)
@@ -2701,7 +2721,8 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         x = self.cpu.bh_call_f(self.get_funcbox(self.cpu, f).value,
                                calldescr, None, None, [value])
         assert x == expected
@@ -2728,7 +2749,8 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         funcbox = self.get_funcbox(self.cpu, f)
         res = self.execute_operation(rop.CALL, [funcbox, BoxFloat(value)],
                                      'float', descr=calldescr)
@@ -2756,7 +2778,8 @@
         #
         FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         ivalue = longlong.singlefloat2int(value)
         iexpected = longlong.singlefloat2int(expected)
         x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
@@ -2785,7 +2808,8 @@
         #
         FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         funcbox = self.get_funcbox(self.cpu, f)
         ivalue = longlong.singlefloat2int(value)
         iexpected = longlong.singlefloat2int(expected)
diff --git a/pypy/jit/backend/test/test_ll_random.py b/pypy/jit/backend/test/test_ll_random.py
--- a/pypy/jit/backend/test/test_ll_random.py
+++ b/pypy/jit/backend/test/test_ll_random.py
@@ -6,6 +6,7 @@
 from pypy.jit.metainterp.history import BoxPtr, BoxInt
 from pypy.jit.metainterp.history import BasicFailDescr
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rlib.rarithmetic import intmask
 from pypy.rpython.llinterp import LLException
@@ -468,6 +469,10 @@
         exec code in d
         return subset, d['f'], vtableptr
 
+    def getcalldescr(self, builder, TP):
+        ef = EffectInfo.MOST_GENERAL
+        return builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT, ef)
+
 # 1. non raising call and guard_no_exception
 class CallOperation(BaseCallOperation):
     def produce_into(self, builder, r):
@@ -481,7 +486,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], None,
                           descr=BasicFailDescr())
@@ -501,7 +506,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         _, vtableptr = builder.get_random_structure_type_and_vtable(r)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(vtableptr), builder.cpu)
@@ -523,7 +528,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(exc), builder.cpu)
         op = ResOperation(rop.GUARD_EXCEPTION, [exc_box], BoxPtr(),
@@ -540,7 +545,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], BoxPtr(),
                           descr=BasicFailDescr())
@@ -559,7 +564,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         while True:
             _, vtableptr = builder.get_random_structure_type_and_vtable(r)
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -843,8 +843,8 @@
 
     def consider_call(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex != EffectInfo.OS_NONE:
             if IS_X86_32:
                 # support for some of the llong operations,
                 # which only exist on x86-32
diff --git a/pypy/jit/backend/x86/test/test_gc_integration.py b/pypy/jit/backend/x86/test/test_gc_integration.py
--- a/pypy/jit/backend/x86/test/test_gc_integration.py
+++ b/pypy/jit/backend/x86/test/test_gc_integration.py
@@ -7,6 +7,7 @@
      BoxPtr, ConstPtr, TreeLoop
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.backend.llsupport.descr import GcCache
 from pypy.jit.backend.llsupport.gc import GcLLDescription
 from pypy.jit.backend.detect_cpu import getcpuclass
@@ -76,7 +77,8 @@
         for box in boxes:
             regalloc.rm.try_allocate_reg(box)
         TP = lltype.FuncType([], lltype.Signed)
-        calldescr = cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        calldescr = cpu.calldescrof(TP, TP.ARGS, TP.RESULT,
+                                    EffectInfo.MOST_GENERAL)
         regalloc.rm._check_invariants()
         box = boxes[0]
         regalloc.position = 0
diff --git a/pypy/jit/backend/x86/test/test_regalloc.py b/pypy/jit/backend/x86/test/test_regalloc.py
--- a/pypy/jit/backend/x86/test/test_regalloc.py
+++ b/pypy/jit/backend/x86/test/test_regalloc.py
@@ -16,6 +16,7 @@
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rpython.lltypesystem import rclass, rstr
 from pypy.jit.codewriter import longlong
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.backend.x86.rx86 import *
 
 def test_is_comparison_or_ovf_op():
@@ -92,7 +93,8 @@
     zd_addr = cpu.cast_int_to_adr(zero_division_tp)
     zero_division_error = llmemory.cast_adr_to_ptr(zd_addr,
                                             lltype.Ptr(rclass.OBJECT_VTABLE))
-    raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+    raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                        EffectInfo.MOST_GENERAL)
 
     fdescr1 = BasicFailDescr(1)
     fdescr2 = BasicFailDescr(2)
@@ -115,9 +117,12 @@
     f2ptr = llhelper(F2PTR, f2)
     f10ptr = llhelper(F10PTR, f10)
 
-    f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT)
-    f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT)
-    f10_calldescr = cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT)
+    f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
+    f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
+    f10_calldescr= cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
 
     namespace = locals().copy()
     type_system = 'lltype'
diff --git a/pypy/jit/codewriter/call.py b/pypy/jit/codewriter/call.py
--- a/pypy/jit/codewriter/call.py
+++ b/pypy/jit/codewriter/call.py
@@ -6,7 +6,7 @@
 from pypy.jit.codewriter import support
 from pypy.jit.codewriter.jitcode import JitCode
 from pypy.jit.codewriter.effectinfo import (VirtualizableAnalyzer,
-    QuasiImmutAnalyzer, CanReleaseGILAnalyzer, effectinfo_from_writeanalyze,
+    QuasiImmutAnalyzer, RandomEffectsAnalyzer, effectinfo_from_writeanalyze,
     EffectInfo, CallInfoCollection)
 from pypy.translator.simplify import get_funcobj, get_functype
 from pypy.rpython.lltypesystem import lltype, llmemory
@@ -31,7 +31,7 @@
             self.readwrite_analyzer = ReadWriteAnalyzer(translator)
             self.virtualizable_analyzer = VirtualizableAnalyzer(translator)
             self.quasiimmut_analyzer = QuasiImmutAnalyzer(translator)
-            self.canreleasegil_analyzer = CanReleaseGILAnalyzer(translator)
+            self.randomeffects_analyzer = RandomEffectsAnalyzer(translator)
         #
         for index, jd in enumerate(jitdrivers_sd):
             jd.index = index
@@ -187,7 +187,7 @@
             fnaddr = llmemory.cast_ptr_to_adr(fnptr)
         NON_VOID_ARGS = [ARG for ARG in FUNC.ARGS if ARG is not lltype.Void]
         calldescr = self.cpu.calldescrof(FUNC, tuple(NON_VOID_ARGS),
-                                         FUNC.RESULT)
+                                         FUNC.RESULT, EffectInfo.MOST_GENERAL)
         return (fnaddr, calldescr)
 
     def getcalldescr(self, op, oopspecindex=EffectInfo.OS_NONE,
@@ -219,9 +219,11 @@
                 assert not NON_VOID_ARGS, ("arguments not supported for "
                                            "loop-invariant function!")
         # build the extraeffect
-        can_release_gil = self.canreleasegil_analyzer.analyze(op)
-        # can_release_gil implies can_invalidate
-        can_invalidate = can_release_gil or self.quasiimmut_analyzer.analyze(op)
+        random_effects = self.randomeffects_analyzer.analyze(op)
+        if random_effects:
+            extraeffect = EffectInfo.EF_RANDOM_EFFECTS
+        # random_effects implies can_invalidate
+        can_invalidate = random_effects or self.quasiimmut_analyzer.analyze(op)
         if extraeffect is None:
             if self.virtualizable_analyzer.analyze(op):
                 extraeffect = EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
@@ -239,12 +241,10 @@
         #
         effectinfo = effectinfo_from_writeanalyze(
             self.readwrite_analyzer.analyze(op), self.cpu, extraeffect,
-            oopspecindex, can_invalidate, can_release_gil)
+            oopspecindex, can_invalidate)
         #
-        if oopspecindex != EffectInfo.OS_NONE:
-            assert effectinfo is not None
+        assert effectinfo is not None
         if elidable or loopinvariant:
-            assert effectinfo is not None
             assert extraeffect != EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
             # XXX this should also say assert not can_invalidate, but
             #     it can't because our analyzer is not good enough for now
@@ -264,8 +264,7 @@
 
     def calldescr_canraise(self, calldescr):
         effectinfo = calldescr.get_extra_info()
-        return (effectinfo is None or
-                effectinfo.extraeffect > EffectInfo.EF_CANNOT_RAISE)
+        return effectinfo.check_can_raise()
 
     def jitdriver_sd_from_portal_graph(self, graph):
         for jd in self.jitdrivers_sd:
diff --git a/pypy/jit/codewriter/effectinfo.py b/pypy/jit/codewriter/effectinfo.py
--- a/pypy/jit/codewriter/effectinfo.py
+++ b/pypy/jit/codewriter/effectinfo.py
@@ -15,6 +15,7 @@
     EF_ELIDABLE_CAN_RAISE              = 3 #elidable function (but can raise)
     EF_CAN_RAISE                       = 4 #normal function (can raise)
     EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE = 5 #can raise and force virtualizables
+    EF_RANDOM_EFFECTS                  = 6 #can do whatever
 
     # the 'oopspecindex' field is one of the following values:
     OS_NONE                     = 0    # normal case, no oopspec
@@ -80,17 +81,26 @@
                 write_descrs_fields, write_descrs_arrays,
                 extraeffect=EF_CAN_RAISE,
                 oopspecindex=OS_NONE,
-                can_invalidate=False, can_release_gil=False):
-        key = (frozenset(readonly_descrs_fields),
-               frozenset(readonly_descrs_arrays),
-               frozenset(write_descrs_fields),
-               frozenset(write_descrs_arrays),
+                can_invalidate=False):
+        key = (frozenset_or_none(readonly_descrs_fields),
+               frozenset_or_none(readonly_descrs_arrays),
+               frozenset_or_none(write_descrs_fields),
+               frozenset_or_none(write_descrs_arrays),
                extraeffect,
                oopspecindex,
-               can_invalidate,
-               can_release_gil)
+               can_invalidate)
         if key in cls._cache:
             return cls._cache[key]
+        if extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+            assert readonly_descrs_fields is None
+            assert readonly_descrs_arrays is None
+            assert write_descrs_fields is None
+            assert write_descrs_arrays is None
+        else:
+            assert readonly_descrs_fields is not None
+            assert readonly_descrs_arrays is not None
+            assert write_descrs_fields is not None
+            assert write_descrs_arrays is not None
         result = object.__new__(cls)
         result.readonly_descrs_fields = readonly_descrs_fields
         result.readonly_descrs_arrays = readonly_descrs_arrays
@@ -104,11 +114,13 @@
             result.write_descrs_arrays = write_descrs_arrays
         result.extraeffect = extraeffect
         result.can_invalidate = can_invalidate
-        result.can_release_gil = can_release_gil
         result.oopspecindex = oopspecindex
         cls._cache[key] = result
         return result
 
+    def check_can_raise(self):
+        return self.extraeffect > self.EF_CANNOT_RAISE
+
     def check_can_invalidate(self):
         return self.can_invalidate
 
@@ -116,56 +128,71 @@
         return self.extraeffect >= self.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
 
     def has_random_effects(self):
-        return self.oopspecindex == self.OS_LIBFFI_CALL or self.can_release_gil
+        return self.extraeffect >= self.EF_RANDOM_EFFECTS
+
+
+def frozenset_or_none(x):
+    if x is None:
+        return None
+    return frozenset(x)
+
+EffectInfo.MOST_GENERAL = EffectInfo(None, None, None, None,
+                                     EffectInfo.EF_RANDOM_EFFECTS,
+                                     can_invalidate=True)
+
 
 def effectinfo_from_writeanalyze(effects, cpu,
                                  extraeffect=EffectInfo.EF_CAN_RAISE,
                                  oopspecindex=EffectInfo.OS_NONE,
-                                 can_invalidate=False,
-                                 can_release_gil=False):
+                                 can_invalidate=False):
     from pypy.translator.backendopt.writeanalyze import top_set
-    if effects is top_set:
-        return None
-    readonly_descrs_fields = []
-    readonly_descrs_arrays = []
-    write_descrs_fields = []
-    write_descrs_arrays = []
+    if effects is top_set or extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+        readonly_descrs_fields = None
+        readonly_descrs_arrays = None
+        write_descrs_fields = None
+        write_descrs_arrays = None
+        extraeffect = EffectInfo.EF_RANDOM_EFFECTS
+    else:
+        readonly_descrs_fields = []
+        readonly_descrs_arrays = []
+        write_descrs_fields = []
+        write_descrs_arrays = []
 
-    def add_struct(descrs_fields, (_, T, fieldname)):
-        T = deref(T)
-        if consider_struct(T, fieldname):
-            descr = cpu.fielddescrof(T, fieldname)
-            descrs_fields.append(descr)
+        def add_struct(descrs_fields, (_, T, fieldname)):
+            T = deref(T)
+            if consider_struct(T, fieldname):
+                descr = cpu.fielddescrof(T, fieldname)
+                descrs_fields.append(descr)
 
-    def add_array(descrs_arrays, (_, T)):
-        ARRAY = deref(T)
-        if consider_array(ARRAY):
-            descr = cpu.arraydescrof(ARRAY)
-            descrs_arrays.append(descr)
+        def add_array(descrs_arrays, (_, T)):
+            ARRAY = deref(T)
+            if consider_array(ARRAY):
+                descr = cpu.arraydescrof(ARRAY)
+                descrs_arrays.append(descr)
 
-    for tup in effects:
-        if tup[0] == "struct":
-            add_struct(write_descrs_fields, tup)
-        elif tup[0] == "readstruct":
-            tupw = ("struct",) + tup[1:]
-            if tupw not in effects:
-                add_struct(readonly_descrs_fields, tup)
-        elif tup[0] == "array":
-            add_array(write_descrs_arrays, tup)
-        elif tup[0] == "readarray":
-            tupw = ("array",) + tup[1:]
-            if tupw not in effects:
-                add_array(readonly_descrs_arrays, tup)
-        else:
-            assert 0
+        for tup in effects:
+            if tup[0] == "struct":
+                add_struct(write_descrs_fields, tup)
+            elif tup[0] == "readstruct":
+                tupw = ("struct",) + tup[1:]
+                if tupw not in effects:
+                    add_struct(readonly_descrs_fields, tup)
+            elif tup[0] == "array":
+                add_array(write_descrs_arrays, tup)
+            elif tup[0] == "readarray":
+                tupw = ("array",) + tup[1:]
+                if tupw not in effects:
+                    add_array(readonly_descrs_arrays, tup)
+            else:
+                assert 0
+    #
     return EffectInfo(readonly_descrs_fields,
                       readonly_descrs_arrays,
                       write_descrs_fields,
                       write_descrs_arrays,
                       extraeffect,
                       oopspecindex,
-                      can_invalidate,
-                      can_release_gil)
+                      can_invalidate)
 
 def consider_struct(TYPE, fieldname):
     if fieldType(TYPE, fieldname) is lltype.Void:
@@ -201,12 +228,13 @@
     def analyze_simple_operation(self, op, graphinfo):
         return op.opname == 'jit_force_quasi_immutable'
 
-class CanReleaseGILAnalyzer(BoolGraphAnalyzer):
+class RandomEffectsAnalyzer(BoolGraphAnalyzer):
     def analyze_direct_call(self, graph, seen=None):
-        releases_gil = False
         if hasattr(graph, "func") and hasattr(graph.func, "_ptr"):
-            releases_gil = graph.func._ptr._obj.releases_gil
-        return releases_gil or super(CanReleaseGILAnalyzer, self).analyze_direct_call(graph, seen)
+            if graph.func._ptr._obj.random_effects_on_gcobjs:
+                return True
+        return super(RandomEffectsAnalyzer, self).analyze_direct_call(graph,
+                                                                      seen)
 
     def analyze_simple_operation(self, op, graphinfo):
         return False
diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -1,4 +1,5 @@
 import py
+
 from pypy.jit.codewriter import support, heaptracker, longlong
 from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.codewriter.flatten import ListOfKind, IndirectCallTargets
@@ -22,6 +23,11 @@
     t = Transformer(cpu, callcontrol, portal_jd)
     t.transform(graph)
 
+def integer_bounds(size, unsigned):
+    if unsigned:
+        return 0, 1 << (8 * size)
+    else:
+        return -(1 << (8 * size - 1)), 1 << (8 * size - 1)
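
# Editorial note, not part of this changeset: the pair returned is an
# (inclusive lower bound, exclusive upper bound) for a 'size'-byte integer,
# for example:
#
#     integer_bounds(1, unsigned=False)  ->  (-128, 128)
#     integer_bounds(2, unsigned=True)   ->  (0, 65536)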
 
 class Transformer(object):
     vable_array_vars = None
@@ -780,81 +786,127 @@
             raise NotImplementedError("cast_ptr_to_int")
 
     def rewrite_op_force_cast(self, op):
-        assert not self._is_gc(op.args[0])
-        fromll = longlong.is_longlong(op.args[0].concretetype)
-        toll   = longlong.is_longlong(op.result.concretetype)
-        if fromll and toll:
+        v_arg = op.args[0]
+        v_result = op.result
+        assert not self._is_gc(v_arg)
+
+        if v_arg.concretetype == v_result.concretetype:
             return
-        if fromll:
-            args = op.args
-            opname = 'truncate_longlong_to_int'
-            RESULT = lltype.Signed
-            v = varoftype(RESULT)
-            op1 = SpaceOperation(opname, args, v)
-            op2 = self.rewrite_operation(op1)
-            oplist = self.force_cast_without_longlong(op2.result, op.result)
+
+        float_arg = v_arg.concretetype in [lltype.Float, lltype.SingleFloat]
+        float_res = v_result.concretetype in [lltype.Float, lltype.SingleFloat]
+        if not float_arg and not float_res:
+            # some int -> some int cast
+            return self._int_to_int_cast(v_arg, v_result)
+        elif float_arg and float_res:
+            # some float -> some float cast
+            return self._float_to_float_cast(v_arg, v_result)
+        elif not float_arg and float_res:
+            # some int -> some float
+            ops = []
+            v1 = varoftype(lltype.Signed)
+            oplist = self.rewrite_operation(
+                SpaceOperation('force_cast', [v_arg], v1)
+            )
             if oplist:
-                return [op2] + oplist
-            #
-            # force a renaming to put the correct result in place, even though
-            # it might be slightly mistyped (e.g. Signed versus Unsigned)
-            assert op2.result is v
-            op2.result = op.result
-            return op2
-        elif toll:
-            size, unsigned = rffi.size_and_sign(op.args[0].concretetype)
-            if unsigned:
+                ops.extend(oplist)
+            else:
+                v1 = v_arg
+            v2 = varoftype(lltype.Float)
+            op = self.rewrite_operation(
+                SpaceOperation('cast_int_to_float', [v1], v2)
+            )
+            ops.append(op)
+            op2 = self.rewrite_operation(
+                SpaceOperation('force_cast', [v2], v_result)
+            )
+            if op2:
+                ops.append(op2)
+            else:
+                op.result = v_result
+            return ops
+        elif float_arg and not float_res:
+            # some float -> some int
+            ops = []
+            v1 = varoftype(lltype.Float)
+            op1 = self.rewrite_operation(
+                SpaceOperation('force_cast', [v_arg], v1)
+            )
+            if op1:
+                ops.append(op1)
+            else:
+                v1 = v_arg
+            v2 = varoftype(lltype.Signed)
+            op = self.rewrite_operation(
+                SpaceOperation('cast_float_to_int', [v1], v2)
+            )
+            ops.append(op)
+            oplist = self.rewrite_operation(
+                SpaceOperation('force_cast', [v2], v_result)
+            )
+            if oplist:
+                ops.extend(oplist)
+            else:
+                op.result = v_result
+            return ops
+        else:
+            assert False
+
+    def _int_to_int_cast(self, v_arg, v_result):
+        longlong_arg = longlong.is_longlong(v_arg.concretetype)
+        longlong_res = longlong.is_longlong(v_result.concretetype)
+        size1, unsigned1 = rffi.size_and_sign(v_arg.concretetype)
+        size2, unsigned2 = rffi.size_and_sign(v_result.concretetype)
+
+        if longlong_arg and longlong_res:
+            return
+        elif longlong_arg:
+            v = varoftype(lltype.Signed)
+            op1 = self.rewrite_operation(
+                SpaceOperation('truncate_longlong_to_int', [v_arg], v)
+            )
+            op2 = SpaceOperation('force_cast', [v], v_result)
+            oplist = self.rewrite_operation(op2)
+            if not oplist:
+                op1.result = v_result
+                oplist = []
+            return [op1] + oplist
+        elif longlong_res:
+            if unsigned1:
                 INTERMEDIATE = lltype.Unsigned
             else:
                 INTERMEDIATE = lltype.Signed
             v = varoftype(INTERMEDIATE)
-            oplist = self.force_cast_without_longlong(op.args[0], v)
+            op1 = SpaceOperation('force_cast', [v_arg], v)
+            oplist = self.rewrite_operation(op1)
             if not oplist:
-                v = op.args[0]
+                v = v_arg
                 oplist = []
-            if unsigned:
+            if unsigned1:
                 opname = 'cast_uint_to_longlong'
             else:
                 opname = 'cast_int_to_longlong'
-            op1 = SpaceOperation(opname, [v], op.result)
-            op2 = self.rewrite_operation(op1)
+            op2 = self.rewrite_operation(
+                SpaceOperation(opname, [v], v_result)
+            )
             return oplist + [op2]
-        else:
-            return self.force_cast_without_longlong(op.args[0], op.result)
 
-    def force_cast_without_longlong(self, v_arg, v_result):
-        if v_result.concretetype == v_arg.concretetype:
+        # We've now, ostensibly, dealt with the longlongs; everything should
+        # be a Signed or smaller.
+        assert size1 <= rffi.sizeof(lltype.Signed)
+        assert size2 <= rffi.sizeof(lltype.Signed)
+
+        # the target type is LONG or ULONG
+        if size2 == rffi.sizeof(lltype.Signed):
             return
-        if v_arg.concretetype == rffi.FLOAT:
-            assert v_result.concretetype == lltype.Float, "cast %s -> %s" % (
-                v_arg.concretetype, v_result.concretetype)
-            return SpaceOperation('cast_singlefloat_to_float', [v_arg],
-                                  v_result)
-        if v_result.concretetype == rffi.FLOAT:
-            assert v_arg.concretetype == lltype.Float, "cast %s -> %s" % (
-                v_arg.concretetype, v_result.concretetype)
-            return SpaceOperation('cast_float_to_singlefloat', [v_arg],
-                                  v_result)
-        return self.force_cast_without_singlefloat(v_arg, v_result)
 
-    def force_cast_without_singlefloat(self, v_arg, v_result):
-        size2, unsigned2 = rffi.size_and_sign(v_result.concretetype)
-        assert size2 <= rffi.sizeof(lltype.Signed)
-        if size2 == rffi.sizeof(lltype.Signed):
-            return     # the target type is LONG or ULONG
-        size1, unsigned1 = rffi.size_and_sign(v_arg.concretetype)
-        assert size1 <= rffi.sizeof(lltype.Signed)
-        #
-        def bounds(size, unsigned):
-            if unsigned:
-                return 0, 1<<(8*size)
-            else:
-                return -(1<<(8*size-1)), 1<<(8*size-1)
-        min1, max1 = bounds(size1, unsigned1)
-        min2, max2 = bounds(size2, unsigned2)
+        min1, max1 = integer_bounds(size1, unsigned1)
+        min2, max2 = integer_bounds(size2, unsigned2)
+
+        # the target type includes the source range
         if min2 <= min1 <= max1 <= max2:
-            return     # the target type includes the source range
-        #
+            return
+
         result = []
         if min2:
             c_min2 = Constant(min2, lltype.Signed)
@@ -862,15 +914,28 @@
             result.append(SpaceOperation('int_sub', [v_arg, c_min2], v2))
         else:
             v2 = v_arg
-        c_mask = Constant(int((1<<(8*size2))-1), lltype.Signed)
-        v3 = varoftype(lltype.Signed)
+        c_mask = Constant(int((1 << (8 * size2)) - 1), lltype.Signed)
+        if min2:
+            v3 = varoftype(lltype.Signed)
+        else:
+            v3 = v_result
         result.append(SpaceOperation('int_and', [v2, c_mask], v3))
         if min2:
             result.append(SpaceOperation('int_add', [v3, c_min2], v_result))
-        else:
-            result[-1].result = v_result
         return result
 
+    def _float_to_float_cast(self, v_arg, v_result):
+        if v_arg.concretetype == lltype.SingleFloat:
+            assert v_result.concretetype == lltype.Float, "cast %s -> %s" % (
+                v_arg.concretetype, v_result.concretetype)
+            return SpaceOperation('cast_singlefloat_to_float', [v_arg],
+                                  v_result)
+        if v_result.concretetype == lltype.SingleFloat:
+            assert v_arg.concretetype == lltype.Float, "cast %s -> %s" % (
+                v_arg.concretetype, v_result.concretetype)
+            return SpaceOperation('cast_float_to_singlefloat', [v_arg],
+                                  v_result)
+
     def rewrite_op_direct_ptradd(self, op):
         # xxx otherwise, not implemented:
         assert op.args[0].concretetype == rffi.CCHARP
@@ -1417,7 +1482,7 @@
             extraeffect = EffectInfo.EF_CANNOT_RAISE
         elif oopspec_name.startswith('libffi_call_'):
             oopspecindex = EffectInfo.OS_LIBFFI_CALL
-            extraeffect = EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
+            extraeffect = EffectInfo.EF_RANDOM_EFFECTS
         else:
             assert False, 'unsupported oopspec: %s' % oopspec_name
         return self._handle_oopspec_call(op, args, oopspecindex, extraeffect)
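
Side note on the int-to-int narrowing above: the emitted int_sub / int_and / int_add
sequence reproduces a cast to a smaller integer type using only Signed arithmetic.
A minimal pure-Python sketch of the same trick, where the helper name is made up for
illustration and plain ints stand in for the JIT's Signed values:

    def narrow_cast(value, size, unsigned):
        # target-type bounds, as computed by integer_bounds(size, unsigned)
        if unsigned:
            lo = 0
        else:
            lo = -(1 << (8 * size - 1))
        mask = (1 << (8 * size)) - 1
        # shift into the unsigned range, mask off the extra bits, shift back
        return ((value - lo) & mask) + lo

    assert narrow_cast(300, 1, unsigned=True) == 44    # like a cast to an unsigned char
    assert narrow_cast(200, 1, unsigned=False) == -56  # like a cast to a signed char
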
diff --git a/pypy/jit/codewriter/test/test_call.py b/pypy/jit/codewriter/test/test_call.py
--- a/pypy/jit/codewriter/test/test_call.py
+++ b/pypy/jit/codewriter/test/test_call.py
@@ -191,4 +191,4 @@
     [block, _] = list(f_graph.iterblocks())
     [op] = block.operations
     call_descr = cc.getcalldescr(op)
-    assert call_descr.extrainfo.can_release_gil
\ No newline at end of file
+    assert call_descr.extrainfo.has_random_effects()
diff --git a/pypy/jit/codewriter/test/test_codewriter.py b/pypy/jit/codewriter/test/test_codewriter.py
--- a/pypy/jit/codewriter/test/test_codewriter.py
+++ b/pypy/jit/codewriter/test/test_codewriter.py
@@ -5,7 +5,7 @@
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
 
 class FakeCallDescr(AbstractDescr):
-    def __init__(self, FUNC, ARGS, RESULT, effectinfo=None):
+    def __init__(self, FUNC, ARGS, RESULT, effectinfo):
         self.FUNC = FUNC
         self.ARGS = ARGS
         self.RESULT = RESULT
diff --git a/pypy/jit/codewriter/test/test_flatten.py b/pypy/jit/codewriter/test/test_flatten.py
--- a/pypy/jit/codewriter/test/test_flatten.py
+++ b/pypy/jit/codewriter/test/test_flatten.py
@@ -50,7 +50,7 @@
     def __init__(self, rtyper):
         rtyper._builtin_func_for_spec_cache = FakeDict()
         self.rtyper = rtyper
-    def calldescrof(self, FUNC, ARGS, RESULT):
+    def calldescrof(self, FUNC, ARGS, RESULT, effectinfo):
         return FakeDescr()
     def fielddescrof(self, STRUCT, name):
         return FakeDescr()
@@ -324,7 +324,7 @@
     def test_exc_exitswitch(self):
         def g(i):
             pass
-        
+
         def f(i):
             try:
                 g(i)
@@ -854,13 +854,51 @@
             int_return %i0
         """, transform=True)
 
-    def test_force_cast_float(self):
+    def test_force_cast_floats(self):
         from pypy.rpython.lltypesystem import rffi
+        # Casts to lltype.Float
         def f(n):
             return rffi.cast(lltype.Float, n)
         self.encoding_test(f, [12.456], """
             float_return %f0
         """, transform=True)
+        self.encoding_test(f, [rffi.cast(rffi.SIGNEDCHAR, 42)], """
+            cast_int_to_float %i0 -> %f0
+            float_return %f0
+        """, transform=True)
+
+        # Casts to lltype.SingleFloat
+        def g(n):
+            return rffi.cast(lltype.SingleFloat, n)
+        self.encoding_test(g, [12.456], """
+            cast_float_to_singlefloat %f0 -> %i0
+            int_return %i0
+        """, transform=True)
+        self.encoding_test(g, [rffi.cast(rffi.SIGNEDCHAR, 42)], """
+            cast_int_to_float %i0 -> %f0
+            cast_float_to_singlefloat %f0 -> %i1
+            int_return %i1
+        """, transform=True)
+
+        # Casts from floats
+        def f(n):
+            return rffi.cast(rffi.SIGNEDCHAR, n)
+        self.encoding_test(f, [12.456], """
+            cast_float_to_int %f0 -> %i0
+            int_sub %i0, $-128 -> %i1
+            int_and %i1, $255 -> %i2
+            int_add %i2, $-128 -> %i3
+            int_return %i3
+        """, transform=True)
+        self.encoding_test(f, [rffi.cast(lltype.SingleFloat, 12.456)], """
+            cast_singlefloat_to_float %i0 -> %f0
+            cast_float_to_int %f0 -> %i1
+            int_sub %i1, $-128 -> %i2
+            int_and %i2, $255 -> %i3
+            int_add %i3, $-128 -> %i4
+            int_return %i4
+        """, transform=True)
+
 
     def test_direct_ptradd(self):
         from pypy.rpython.lltypesystem import rffi
diff --git a/pypy/jit/metainterp/optimizeopt/fficall.py b/pypy/jit/metainterp/optimizeopt/fficall.py
--- a/pypy/jit/metainterp/optimizeopt/fficall.py
+++ b/pypy/jit/metainterp/optimizeopt/fficall.py
@@ -19,7 +19,8 @@
         self.funcval = funcval
         self.opargs = []
         argtypes, restype = self._get_signature(funcval)
-        self.descr = cpu.calldescrof_dynamic(argtypes, restype)
+        self.descr = cpu.calldescrof_dynamic(argtypes, restype,
+                                             EffectInfo.MOST_GENERAL)
         # ^^^ may be None if unsupported
         self.prepare_op = prepare_op
         self.delayed_ops = []
@@ -195,9 +196,7 @@
 
     def _get_oopspec(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            return effectinfo.oopspecindex
-        return EffectInfo.OS_NONE
+        return effectinfo.oopspecindex
 
     def _get_funcval(self, op):
         return self.getvalue(op.getarg(1))
diff --git a/pypy/jit/metainterp/optimizeopt/heap.py b/pypy/jit/metainterp/optimizeopt/heap.py
--- a/pypy/jit/metainterp/optimizeopt/heap.py
+++ b/pypy/jit/metainterp/optimizeopt/heap.py
@@ -241,31 +241,33 @@
             opnum == rop.CALL_RELEASE_GIL or
             opnum == rop.CALL_ASSEMBLER):
             if opnum == rop.CALL_ASSEMBLER:
-                effectinfo = None
+                self._seen_guard_not_invalidated = False
             else:
                 effectinfo = op.getdescr().get_extra_info()
-            if effectinfo is None or effectinfo.check_can_invalidate():
-                self._seen_guard_not_invalidated = False
-            if effectinfo is not None and not effectinfo.has_random_effects():
-                # XXX we can get the wrong complexity here, if the lists
-                # XXX stored on effectinfo are large
-                for fielddescr in effectinfo.readonly_descrs_fields:
-                    self.force_lazy_setfield(fielddescr)
-                for arraydescr in effectinfo.readonly_descrs_arrays:
-                    self.force_lazy_setarrayitem(arraydescr)
-                for fielddescr in effectinfo.write_descrs_fields:
-                    self.force_lazy_setfield(fielddescr, can_cache=False)
-                for arraydescr in effectinfo.write_descrs_arrays:
-                    self.force_lazy_setarrayitem(arraydescr, can_cache=False)
-                if effectinfo.check_forces_virtual_or_virtualizable():
-                    vrefinfo = self.optimizer.metainterp_sd.virtualref_info
-                    self.force_lazy_setfield(vrefinfo.descr_forced)
-                    # ^^^ we only need to force this field; the other fields
-                    # of virtualref_info and virtualizable_info are not gcptrs.
-                return
+                if effectinfo.check_can_invalidate():
+                    self._seen_guard_not_invalidated = False
+                if not effectinfo.has_random_effects():
+                    self.force_from_effectinfo(effectinfo)
+                    return
         self.force_all_lazy_setfields_and_arrayitems()
         self.clean_caches()
 
+    def force_from_effectinfo(self, effectinfo):
+        # XXX we can get the wrong complexity here, if the lists
+        # XXX stored on effectinfo are large
+        for fielddescr in effectinfo.readonly_descrs_fields:
+            self.force_lazy_setfield(fielddescr)
+        for arraydescr in effectinfo.readonly_descrs_arrays:
+            self.force_lazy_setarrayitem(arraydescr)
+        for fielddescr in effectinfo.write_descrs_fields:
+            self.force_lazy_setfield(fielddescr, can_cache=False)
+        for arraydescr in effectinfo.write_descrs_arrays:
+            self.force_lazy_setarrayitem(arraydescr, can_cache=False)
+        if effectinfo.check_forces_virtual_or_virtualizable():
+            vrefinfo = self.optimizer.metainterp_sd.virtualref_info
+            self.force_lazy_setfield(vrefinfo.descr_forced)
+            # ^^^ we only need to force this field; the other fields
+            # of virtualref_info and virtualizable_info are not gcptrs.
 
     def turned_constant(self, value):
         assert value.is_constant()
diff --git a/pypy/jit/metainterp/optimizeopt/rewrite.py b/pypy/jit/metainterp/optimizeopt/rewrite.py
--- a/pypy/jit/metainterp/optimizeopt/rewrite.py
+++ b/pypy/jit/metainterp/optimizeopt/rewrite.py
@@ -433,11 +433,10 @@
         # specifically the given oopspec call.  For non-oopspec calls,
         # oopspecindex is just zero.
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
-            if oopspecindex == EffectInfo.OS_ARRAYCOPY:
-                if self._optimize_CALL_ARRAYCOPY(op):
-                    return
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex == EffectInfo.OS_ARRAYCOPY:
+            if self._optimize_CALL_ARRAYCOPY(op):
+                return
         self.emit_operation(op)
 
     def _optimize_CALL_ARRAYCOPY(self, op):
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
@@ -51,14 +51,18 @@
                              restype=types.sint)
         #
         def calldescr(cpu, FUNC, oopspecindex, extraeffect=None):
-            einfo = EffectInfo([], [], [], [], oopspecindex=oopspecindex,
+            if extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+                f = None   # means "can force all" really
+            else:
+                f = []
+            einfo = EffectInfo(f, f, f, f, oopspecindex=oopspecindex,
                                extraeffect=extraeffect)
             return cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT, einfo)
         #
         libffi_prepare =  calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_PREPARE)
         libffi_push_arg = calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_PUSH_ARG)
         libffi_call =     calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_CALL,
-                                 EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE)
+                                    EffectInfo.EF_RANDOM_EFFECTS)
     
     namespace = namespace.__dict__
 
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_util.py b/pypy/jit/metainterp/optimizeopt/test/test_util.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_util.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_util.py
@@ -167,7 +167,8 @@
     onedescr = cpu.fielddescrof(U, 'one')
 
     FUNC = lltype.FuncType([lltype.Signed], lltype.Signed)
-    plaincalldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+    plaincalldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                     EffectInfo.MOST_GENERAL)
     nonwritedescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
                                     EffectInfo([], [], [], []))
     writeadescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
diff --git a/pypy/jit/metainterp/optimizeopt/vstring.py b/pypy/jit/metainterp/optimizeopt/vstring.py
--- a/pypy/jit/metainterp/optimizeopt/vstring.py
+++ b/pypy/jit/metainterp/optimizeopt/vstring.py
@@ -455,8 +455,8 @@
         # specifically the given oopspec call.  For non-oopspec calls,
         # oopspecindex is just zero.
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex != EffectInfo.OS_NONE:
             for value, meth in opt_call_oopspec_ops:
                 if oopspecindex == value:      # a match with the OS_STR_xxx
                     if meth(self, op, mode_string):
diff --git a/pypy/jit/metainterp/pyjitpl.py b/pypy/jit/metainterp/pyjitpl.py
--- a/pypy/jit/metainterp/pyjitpl.py
+++ b/pypy/jit/metainterp/pyjitpl.py
@@ -1257,10 +1257,8 @@
         assert i == len(allboxes)
         #
         effectinfo = descr.get_extra_info()
-        if (effectinfo is None or
-                effectinfo.extraeffect ==
-                             effectinfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE or
-                assembler_call):
+        if (assembler_call or
+                effectinfo.check_forces_virtual_or_virtualizable()):
             # residual calls require attention to keep virtualizables in-sync
             self.metainterp.clear_exception()
             self.metainterp.vable_and_vrefs_before_residual_call()
@@ -1693,12 +1691,11 @@
             return
         if opnum == rop.CALL:
             effectinfo = descr.get_extra_info()
-            if effectinfo is not None:
-                ef = effectinfo.extraeffect
-                if ef == effectinfo.EF_LOOPINVARIANT or \
-                   ef == effectinfo.EF_ELIDABLE_CANNOT_RAISE or \
-                   ef == effectinfo.EF_ELIDABLE_CAN_RAISE:
-                    return
+            ef = effectinfo.extraeffect
+            if ef == effectinfo.EF_LOOPINVARIANT or \
+               ef == effectinfo.EF_ELIDABLE_CANNOT_RAISE or \
+               ef == effectinfo.EF_ELIDABLE_CAN_RAISE:
+                return
         if self.heap_cache:
             self.heap_cache.clear()
         if self.heap_array_cache:
diff --git a/pypy/jit/metainterp/test/test_compile.py b/pypy/jit/metainterp/test/test_compile.py
--- a/pypy/jit/metainterp/test/test_compile.py
+++ b/pypy/jit/metainterp/test/test_compile.py
@@ -190,7 +190,7 @@
     class FakeJitDriverSD:
         portal_runner_ptr = llhelper(lltype.Ptr(FUNC), ll_portal_runner)
         portal_runner_adr = llmemory.cast_ptr_to_adr(portal_runner_ptr)
-        portal_calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        portal_calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT, None)
         portal_finishtoken = compile.DoneWithThisFrameDescrInt()
         num_red_args = 2
         result_type = INT
diff --git a/pypy/jit/metainterp/test/test_rawmem.py b/pypy/jit/metainterp/test/test_rawmem.py
new file mode 100644
--- /dev/null
+++ b/pypy/jit/metainterp/test/test_rawmem.py
@@ -0,0 +1,22 @@
+from pypy.jit.metainterp.test.support import LLJitMixin
+from pypy.rpython.lltypesystem import lltype, rffi
+
+
+class TestJITRawMem(LLJitMixin):
+    def test_cast_void_ptr(self):
+        TP = lltype.Array(lltype.Float, hints={"nolength": True})
+        VOID_TP = lltype.Array(lltype.Void, hints={"nolength": True, "uncast_on_llgraph": True})
+        class A(object):
+            def __init__(self, x):
+                self.storage = rffi.cast(lltype.Ptr(VOID_TP), x)
+
+        def f(n):
+            x = lltype.malloc(TP, n, flavor="raw", zero=True)
+            a = A(x)
+            s = 0.0
+            rffi.cast(lltype.Ptr(TP), a.storage)[0] = 1.0
+            s += rffi.cast(lltype.Ptr(TP), a.storage)[0]
+            lltype.free(x, flavor="raw")
+            return s
+        res = self.interp_operations(f, [10])
+        assert res == 1.0
\ No newline at end of file
diff --git a/pypy/jit/metainterp/test/test_string.py b/pypy/jit/metainterp/test/test_string.py
--- a/pypy/jit/metainterp/test/test_string.py
+++ b/pypy/jit/metainterp/test/test_string.py
@@ -1,5 +1,6 @@
 import py
 from pypy.rlib.jit import JitDriver, dont_look_inside, we_are_jitted
+from pypy.rlib.debug import debug_print
 from pypy.jit.codewriter.policy import StopAtXPolicy
 from pypy.rpython.ootypesystem import ootype
 from pypy.jit.metainterp.test.support import LLJitMixin, OOJitMixin
@@ -521,7 +522,8 @@
         jitdriver = JitDriver(greens = ['g'], reds = ['m'])
         @dont_look_inside
         def escape(x):
-            print str(x)
+            # a plain "print" would call os.write() and release the gil
+            debug_print(str(x))
         def f(g, m):
             g = str(g)
             while m >= 0:
diff --git a/pypy/jit/metainterp/test/test_virtualstate.py b/pypy/jit/metainterp/test/test_virtualstate.py
--- a/pypy/jit/metainterp/test/test_virtualstate.py
+++ b/pypy/jit/metainterp/test/test_virtualstate.py
@@ -1,3 +1,4 @@
+from __future__ import with_statement
 import py
 from pypy.jit.metainterp.optimize import InvalidLoop
 from pypy.jit.metainterp.optimizeopt.virtualstate import VirtualStateInfo, VStructStateInfo, \
diff --git a/pypy/jit/metainterp/warmspot.py b/pypy/jit/metainterp/warmspot.py
--- a/pypy/jit/metainterp/warmspot.py
+++ b/pypy/jit/metainterp/warmspot.py
@@ -21,6 +21,7 @@
 from pypy.jit.metainterp.jitdriver import JitDriverStaticData
 from pypy.jit.codewriter import support, codewriter, longlong
 from pypy.jit.codewriter.policy import JitPolicy
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.metainterp.optimizeopt import ALL_OPTS_NAMES
 
 # ____________________________________________________________
@@ -746,7 +747,8 @@
         jd.portal_calldescr = self.cpu.calldescrof(
             jd._PTR_PORTAL_FUNCTYPE.TO,
             jd._PTR_PORTAL_FUNCTYPE.TO.ARGS,
-            jd._PTR_PORTAL_FUNCTYPE.TO.RESULT)
+            jd._PTR_PORTAL_FUNCTYPE.TO.RESULT,
+            EffectInfo.MOST_GENERAL)
 
         vinfo = jd.virtualizable_info
 
diff --git a/pypy/module/__builtin__/__init__.py b/pypy/module/__builtin__/__init__.py
--- a/pypy/module/__builtin__/__init__.py
+++ b/pypy/module/__builtin__/__init__.py
@@ -19,6 +19,7 @@
         'sorted'        : 'app_functional.sorted',
         'any'           : 'app_functional.any',
         'all'           : 'app_functional.all',
+        'sum'           : 'app_functional.sum',
         'vars'          : 'app_inspect.vars',
         'dir'           : 'app_inspect.dir',
 
@@ -85,7 +86,6 @@
         'enumerate'     : 'functional.W_Enumerate',
         'min'           : 'functional.min',
         'max'           : 'functional.max',
-        'sum'           : 'functional.sum',
         'map'           : 'functional.map',
         'zip'           : 'functional.zip',
         'reduce'        : 'functional.reduce',
@@ -118,7 +118,7 @@
                 return module.Module(space, None, w_builtin)
            builtin = space.interpclass_w(w_builtin)
            if isinstance(builtin, module.Module):
-               return builtin   
+               return builtin
        # no builtin! make a default one.  Given them None, at least.
        builtin = module.Module(space, None)
        space.setitem(builtin.w_dict, space.wrap('None'), space.w_None)
diff --git a/pypy/module/__builtin__/app_functional.py b/pypy/module/__builtin__/app_functional.py
--- a/pypy/module/__builtin__/app_functional.py
+++ b/pypy/module/__builtin__/app_functional.py
@@ -34,3 +34,18 @@
         if not x:
             return False
     return True
+
+def sum(sequence, start=0):
+    """sum(sequence[, start]) -> value
+
+Returns the sum of a sequence of numbers (NOT strings) plus the value
+of parameter 'start' (which defaults to 0).  When the sequence is
+empty, returns start."""
+    if isinstance(start, basestring):
+        raise TypeError("sum() can't sum strings")
+    last = start
+    for x in sequence:
+        # Very intentionally *not* +=, because that would have different
+        # semantics if start were a mutable type, such as a list
+        last = last + x
+    return last
\ No newline at end of file
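
A quick illustration of the comment above about avoiding '+=': with a mutable start
value, in-place addition would mutate the caller's object, whereas 'last = last + x'
builds fresh objects.  Values here are chosen only for illustration:

    start = []
    assert sum([[1], [2, 3]], start) == [1, 2, 3]
    assert start == []    # the caller's start list is left untouched
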
diff --git a/pypy/module/__builtin__/functional.py b/pypy/module/__builtin__/functional.py
--- a/pypy/module/__builtin__/functional.py
+++ b/pypy/module/__builtin__/functional.py
@@ -325,27 +325,6 @@
         result_w.append(w_res)
     return result_w
 
-def sum(space, w_sequence, w_start=0):
-    """sum(sequence[, start]) -> value
-
-Returns the sum of a sequence of numbers (NOT strings) plus the value
-of parameter 'start' (which defaults to 0).  When the sequence is
-empty, returns start."""
-    if space.is_true(space.isinstance(w_start, space.w_basestring)):
-        msg = "sum() can't sum strings"
-        raise OperationError(space.w_TypeError, space.wrap(msg))
-    w_iter = space.iter(w_sequence)
-    w_last = w_start
-    while True:
-        try:
-            w_next = space.next(w_iter)
-        except OperationError, e:
-            if not e.match(space, space.w_StopIteration):
-                raise
-            break
-        w_last = space.add(w_last, w_next)
-    return w_last
-
 @unwrap_spec(sequences_w="args_w")
 def zip(space, sequences_w):
     """Return a list of tuples, where the nth tuple contains every nth item of
diff --git a/pypy/module/__builtin__/test/test_classobj.py b/pypy/module/__builtin__/test/test_classobj.py
--- a/pypy/module/__builtin__/test/test_classobj.py
+++ b/pypy/module/__builtin__/test/test_classobj.py
@@ -981,6 +981,86 @@
         assert a.x == 2
         raises(TypeError, descr.__delete__, a)
 
+    def test_partial_ordering(self):
+        class A:
+            def __lt__(self, other):
+                return self
+        a1 = A()
+        a2 = A()
+        assert (a1 < a2) is a1
+        assert (a1 > a2) is a2
+
+    def test_eq_order(self):
+        # this gives the ordering of equality-related functions on top of
+        # CPython **for old-style classes**.
+        class A:
+            def __eq__(self, other): return self.__class__.__name__+':A.eq'
+            def __ne__(self, other): return self.__class__.__name__+':A.ne'
+            def __lt__(self, other): return self.__class__.__name__+':A.lt'
+            def __le__(self, other): return self.__class__.__name__+':A.le'
+            def __gt__(self, other): return self.__class__.__name__+':A.gt'
+            def __ge__(self, other): return self.__class__.__name__+':A.ge'
+        class B:
+            def __eq__(self, other): return self.__class__.__name__+':B.eq'
+            def __ne__(self, other): return self.__class__.__name__+':B.ne'
+            def __lt__(self, other): return self.__class__.__name__+':B.lt'
+            def __le__(self, other): return self.__class__.__name__+':B.le'
+            def __gt__(self, other): return self.__class__.__name__+':B.gt'
+            def __ge__(self, other): return self.__class__.__name__+':B.ge'
+        #
+        assert (A() == B()) == 'A:A.eq'
+        assert (A() != B()) == 'A:A.ne'
+        assert (A() <  B()) == 'A:A.lt'
+        assert (A() <= B()) == 'A:A.le'
+        assert (A() >  B()) == 'A:A.gt'
+        assert (A() >= B()) == 'A:A.ge'
+        #
+        assert (B() == A()) == 'B:B.eq'
+        assert (B() != A()) == 'B:B.ne'
+        assert (B() <  A()) == 'B:B.lt'
+        assert (B() <= A()) == 'B:B.le'
+        assert (B() >  A()) == 'B:B.gt'
+        assert (B() >= A()) == 'B:B.ge'
+        #
+        class C(A):
+            def __eq__(self, other): return self.__class__.__name__+':C.eq'
+            def __ne__(self, other): return self.__class__.__name__+':C.ne'
+            def __lt__(self, other): return self.__class__.__name__+':C.lt'
+            def __le__(self, other): return self.__class__.__name__+':C.le'
+            def __gt__(self, other): return self.__class__.__name__+':C.gt'
+            def __ge__(self, other): return self.__class__.__name__+':C.ge'
+        #
+        assert (A() == C()) == 'A:A.eq'
+        assert (A() != C()) == 'A:A.ne'
+        assert (A() <  C()) == 'A:A.lt'
+        assert (A() <= C()) == 'A:A.le'
+        assert (A() >  C()) == 'A:A.gt'
+        assert (A() >= C()) == 'A:A.ge'
+        #
+        assert (C() == A()) == 'C:C.eq'
+        assert (C() != A()) == 'C:C.ne'
+        assert (C() <  A()) == 'C:C.lt'
+        assert (C() <= A()) == 'C:C.le'
+        assert (C() >  A()) == 'C:C.gt'
+        assert (C() >= A()) == 'C:C.ge'
+        #
+        class D(A):
+            pass
+        #
+        assert (A() == D()) == 'A:A.eq'
+        assert (A() != D()) == 'A:A.ne'
+        assert (A() <  D()) == 'A:A.lt'
+        assert (A() <= D()) == 'A:A.le'
+        assert (A() >  D()) == 'A:A.gt'
+        assert (A() >= D()) == 'A:A.ge'
+        #
+        assert (D() == A()) == 'D:A.eq'
+        assert (D() != A()) == 'D:A.ne'
+        assert (D() <  A()) == 'D:A.lt'
+        assert (D() <= A()) == 'D:A.le'
+        assert (D() >  A()) == 'D:A.gt'
+        assert (D() >= A()) == 'D:A.ge'
+
 
 class AppTestOldStyleClassStrDict(object):
     def setup_class(cls):
diff --git a/pypy/module/__pypy__/interp_builders.py b/pypy/module/__pypy__/interp_builders.py
--- a/pypy/module/__pypy__/interp_builders.py
+++ b/pypy/module/__pypy__/interp_builders.py
@@ -7,7 +7,7 @@
 
 class W_UnicodeBuilder(Wrappable):
     def __init__(self, space, size):
-        if size == -1:
+        if size < 0:
             self.builder = UnicodeBuilder()
         else:
             self.builder = UnicodeBuilder(size)
@@ -47,4 +47,4 @@
     append_slice = interp2app(W_UnicodeBuilder.descr_append_slice),
     build = interp2app(W_UnicodeBuilder.descr_build),
 )
-W_UnicodeBuilder.typedef.acceptable_as_base_class = False
\ No newline at end of file
+W_UnicodeBuilder.typedef.acceptable_as_base_class = False
diff --git a/pypy/module/_continuation/__init__.py b/pypy/module/_continuation/__init__.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/__init__.py
@@ -0,0 +1,40 @@
+from pypy.interpreter.mixedmodule import MixedModule
+
+
+class Module(MixedModule):
+    """This module exposes 'one-shot continuation containers'.
+
+A 'continulet' object from this module is a container that stores a
+one-shot continuation.  It is similar in purpose to the 'f_back'
+attribute of frames, which points to where execution should continue
+after this frame finishes.  The difference is that it will be changed
+(often repeatedly) before the frame actually returns.
+
+To make a continulet object, call 'continulet' with a callable and
+optional extra arguments.  Later, the first time you switch() to the
+continulet, the callable is invoked with the same continulet object as
+the extra first argument.
+
+At this point, the one-shot continuation stored in the continulet points
+to the caller of switch().  When switch() is called again, this one-shot
+continuation is exchanged with the current one; it means that the caller
+of switch() is suspended, its continuation stored in the container, and
+the old continuation from the continulet object is resumed.
+
+Continulets are internally implemented using stacklets.  Stacklets
+are a bit more primitive (they are really one-shot continuations), but
+that idea only works in C, not in Python, notably because of exceptions.
+
+The most primitive API is actually 'permute()', which just permutes the
+one-shot continuation stored in two (or more) continulets.
+"""
+
+    appleveldefs = {
+        'error': 'app_continuation.error',
+        'generator': 'app_continuation.generator',
+    }
+
+    interpleveldefs = {
+        'continulet': 'interp_continuation.W_Continulet',
+        'permute': 'interp_continuation.permute',
+    }
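
A rough usage sketch of the semantics described in the docstring above (the same
pattern exercised by test_switch in test_stacklet.py further down): switch() runs the
callable until it switches back, and each switch() carries a value across.

    from _continuation import continulet

    def callback(c):
        res = c.switch('a')      # suspend; 'a' is returned to the caller
        assert res == 'b'        # value passed in by the next switch()
        return 'c'               # returning finishes the continulet

    c = continulet(callback)
    assert c.switch() == 'a'     # starts callback, runs until its first switch()
    assert c.switch('b') == 'c'  # resumes callback; its return value comes back
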
diff --git a/pypy/module/_continuation/app_continuation.py b/pypy/module/_continuation/app_continuation.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/app_continuation.py
@@ -0,0 +1,35 @@
+
+class error(Exception):
+    "Usage error of the _continuation module."
+
+
+import _continuation
+
+
+class generator(object):
+
+    def __init__(self, callable):
+        self.__func__ = callable
+
+    def __get__(self, obj, type=None):
+        return generator(self.__func__.__get__(obj, type))
+
+    def __call__(self, *args, **kwds):
+        return genlet(self.__func__, *args, **kwds)
+
+
+class genlet(_continuation.continulet):
+
+    def __iter__(self):
+        return self
+
+    def next(self, value=None):
+        res = self.switch(value)
+        if self.is_pending():
+            return res
+        else:
+            if res is not None:
+                raise TypeError("_continuation.generator must return None")
+            raise StopIteration
+
+    send = next
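
For context, the decorator above wraps a continulet-based function into an iterator:
each gen.switch(value) hands one value to the consumer, and returning None ends the
iteration (see test_generator.py below).  A small hypothetical example:

    from _continuation import generator

    @generator
    def squares(gen, n):
        for i in range(n):
            gen.switch(i * i)    # each switch() yields one value

    assert list(squares(4)) == [0, 1, 4, 9]
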
diff --git a/pypy/module/_continuation/interp_continuation.py b/pypy/module/_continuation/interp_continuation.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/interp_continuation.py
@@ -0,0 +1,245 @@
+from pypy.rlib.rstacklet import StackletThread
+from pypy.rlib import jit
+from pypy.interpreter.error import OperationError
+from pypy.interpreter.executioncontext import ExecutionContext
+from pypy.interpreter.baseobjspace import Wrappable
+from pypy.interpreter.typedef import TypeDef
+from pypy.interpreter.gateway import interp2app
+
+
+class W_Continulet(Wrappable):
+    sthread = None
+
+    def __init__(self, space):
+        self.space = space
+        # states:
+        #  - not init'ed: self.sthread == None
+        #  - normal:      self.sthread != None, not is_empty_handle(self.h)
+        #  - finished:    self.sthread != None, is_empty_handle(self.h)
+
+    def check_sthread(self):
+        ec = self.space.getexecutioncontext()
+        if ec.stacklet_thread is not self.sthread:
+            start_state.clear()
+            raise geterror(self.space, "inter-thread support is missing")
+        return ec
+
+    def descr_init(self, w_callable, __args__):
+        if self.sthread is not None:
+            raise geterror(self.space, "continulet already __init__ialized")
+        start_state.origin = self
+        start_state.w_callable = w_callable
+        start_state.args = __args__
+        self.sthread = build_sthread(self.space)
+        try:
+            self.h = self.sthread.new(new_stacklet_callback)
+            if self.sthread.is_empty_handle(self.h):    # early return
+                raise MemoryError
+        except MemoryError:
+            self.sthread = None
+            start_state.clear()
+            raise getmemoryerror(self.space)
+
+    def switch(self, w_to):
+        to = self.space.interp_w(W_Continulet, w_to, can_be_None=True)
+        if to is not None:
+            if self is to:    # double-switch to myself: no-op
+                return get_result()
+            if to.sthread is None:
+                start_state.clear()
+                raise geterror(self.space, "continulet not initialized yet")
+        if self.sthread is None:
+            start_state.clear()
+            raise geterror(self.space, "continulet not initialized yet")
+        ec = self.check_sthread()
+        saved_topframeref = ec.topframeref
+        #
+        start_state.origin = self
+        if to is None:
+            # simple switch: going to self.h
+            start_state.destination = self
+        else:
+            # double switch: the final destination is to.h
+            start_state.destination = to
+        #
+        h = start_state.destination.h
+        sthread = self.sthread
+        if sthread.is_empty_handle(h):
+            start_state.clear()
+            raise geterror(self.space, "continulet already finished")
+        #
+        try:
+            do_switch(sthread, h)
+        except MemoryError:
+            start_state.clear()
+            raise getmemoryerror(self.space)
+        #
+        ec = sthread.ec
+        ec.topframeref = saved_topframeref
+        return get_result()
+
+    def descr_switch(self, w_value=None, w_to=None):
+        start_state.w_value = w_value
+        return self.switch(w_to)
+
+    def descr_throw(self, w_type, w_val=None, w_tb=None, w_to=None):
+        from pypy.interpreter.pytraceback import check_traceback
+        space = self.space
+        #
+        msg = "throw() third argument must be a traceback object"
+        if space.is_w(w_tb, space.w_None):
+            tb = None
+        else:
+            tb = check_traceback(space, w_tb, msg)
+        #
+        operr = OperationError(w_type, w_val, tb)
+        operr.normalize_exception(space)
+        start_state.w_value = None
+        start_state.propagate_exception = operr
+        return self.switch(w_to)
+
+    def descr_is_pending(self):
+        valid = (self.sthread is not None
+                 and not self.sthread.is_empty_handle(self.h))
+        return self.space.newbool(valid)
+
+
+def W_Continulet___new__(space, w_subtype, __args__):
+    r = space.allocate_instance(W_Continulet, w_subtype)
+    r.__init__(space)
+    return space.wrap(r)
+
+
+W_Continulet.typedef = TypeDef(
+    'continulet',
+    __module__ = '_continuation',
+    __new__     = interp2app(W_Continulet___new__),
+    __init__    = interp2app(W_Continulet.descr_init),
+    switch      = interp2app(W_Continulet.descr_switch),
+    throw       = interp2app(W_Continulet.descr_throw),
+    is_pending  = interp2app(W_Continulet.descr_is_pending),
+    )
+
+
+# ____________________________________________________________
+
+
+class State:
+    def __init__(self, space):
+        self.space = space 
+        w_module = space.getbuiltinmodule('_continuation')
+        self.w_error = space.getattr(w_module, space.wrap('error'))
+        self.w_memoryerror = OperationError(space.w_MemoryError, space.w_None)
+
+def geterror(space, message):
+    cs = space.fromcache(State)
+    return OperationError(cs.w_error, space.wrap(message))
+
+def getmemoryerror(space):
+    cs = space.fromcache(State)
+    return cs.w_memoryerror
+
+# ____________________________________________________________
+
+
+class SThread(StackletThread):
+
+    def __init__(self, space, ec):
+        StackletThread.__init__(self, space.config)
+        self.space = space
+        self.ec = ec
+
+ExecutionContext.stacklet_thread = None
+
+# ____________________________________________________________
+
+
+class StartState:   # xxx a single global to pass around the function to start
+    def clear(self):
+        self.origin = None
+        self.destination = None
+        self.w_callable = None
+        self.args = None
+        self.w_value = None
+        self.propagate_exception = None
+start_state = StartState()
+start_state.clear()
+
+
+def new_stacklet_callback(h, arg):
+    self       = start_state.origin
+    w_callable = start_state.w_callable
+    args       = start_state.args
+    start_state.clear()
+    try:
+        do_switch(self.sthread, h)
+    except MemoryError:
+        return h       # oups!  do an early return in this case
+    #
+    space = self.space
+    try:
+        ec = self.sthread.ec
+        ec.topframeref = jit.vref_None
+
+        if start_state.propagate_exception is not None:
+            raise start_state.propagate_exception   # just propagate it further
+        if start_state.w_value is not space.w_None:
+            raise OperationError(space.w_TypeError, space.wrap(
+                "can't send non-None value to a just-started continulet"))
+
+        args = args.prepend(self.space.wrap(self))
+        w_result = space.call_args(w_callable, args)
+    except Exception, e:
+        start_state.propagate_exception = e
+    else:
+        start_state.w_value = w_result
+    start_state.origin = self
+    start_state.destination = self
+    return self.h
+
+
+def do_switch(sthread, h):
+    h = sthread.switch(h)
+    origin = start_state.origin
+    self = start_state.destination
+    start_state.origin = None
+    start_state.destination = None
+    self.h, origin.h = origin.h, h
+
+def get_result():
+    if start_state.propagate_exception:
+        e = start_state.propagate_exception
+        start_state.propagate_exception = None
+        raise e
+    w_value = start_state.w_value
+    start_state.w_value = None
+    return w_value
+
+def build_sthread(space):
+    ec = space.getexecutioncontext()
+    sthread = ec.stacklet_thread
+    if not sthread:
+        sthread = ec.stacklet_thread = SThread(space, ec)
+    return sthread
+
+# ____________________________________________________________
+
+def permute(space, args_w):
+    sthread = build_sthread(space)
+    #
+    contlist = []
+    for w_cont in args_w:
+        cont = space.interp_w(W_Continulet, w_cont)
+        if cont.sthread is not sthread:
+            if cont.sthread is None:
+                raise geterror(space, "got a non-initialized continulet")
+            else:
+                raise geterror(space, "inter-thread support is missing")
+        elif sthread.is_empty_handle(cont.h):
+            raise geterror(space, "got an already-finished continulet")
+        contlist.append(cont)
+    #
+    if len(contlist) > 1:
+        other = contlist[-1].h
+        for cont in contlist:
+            other, cont.h = cont.h, other
diff --git a/pypy/module/_continuation/test/__init__.py b/pypy/module/_continuation/test/__init__.py
new file mode 100644
diff --git a/pypy/module/_continuation/test/support.py b/pypy/module/_continuation/test/support.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/support.py
@@ -0,0 +1,12 @@
+import py
+from pypy.conftest import gettestobjspace
+from pypy.rpython.tool.rffi_platform import CompilationError
+
+
+class BaseAppTest:
+    def setup_class(cls):
+        try:
+            import pypy.rlib.rstacklet
+        except CompilationError, e:
+            py.test.skip("cannot import rstacklet: %s" % e)
+        cls.space = gettestobjspace(usemodules=['_continuation'])
diff --git a/pypy/module/_continuation/test/test_generator.py b/pypy/module/_continuation/test/test_generator.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_generator.py
@@ -0,0 +1,70 @@
+from pypy.module._continuation.test.support import BaseAppTest
+
+
+class AppTestGenerator(BaseAppTest):
+
+    def test_simple(self):
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            f2(gen, n+2)
+            gen.switch(n+3)
+        #
+        def f2(gen, m):
+            gen.switch(m*2)
+        #
+        g = f(10)
+        res = g.next()
+        assert res == 11
+        res = g.next()
+        assert res == 24
+        res = g.next()
+        assert res == 13
+        raises(StopIteration, g.next)
+
+    def test_iterator(self):
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            f2(gen, n+2)
+            gen.switch(n+3)
+        #
+        def f2(gen, m):
+            gen.switch(m*2)
+        #
+        res = list(f(10))
+        assert res == [11, 24, 13]
+        g = f(20)
+        assert iter(g) is g
+
+    def test_bound_method(self):
+        from _continuation import generator
+        #
+        class A(object):
+            def __init__(self, m):
+                self.m = m
+            #
+            @generator
+            def f(self, gen, n):
+                gen.switch(n - self.m)
+        #
+        a = A(10)
+        res = list(a.f(25))
+        assert res == [15]
+
+    def test_must_return_None(self):
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            return "foo"
+        #
+        g = f(10)
+        res = g.next()
+        assert res == 11
+        raises(TypeError, g.next)
diff --git a/pypy/module/_continuation/test/test_stacklet.py b/pypy/module/_continuation/test/test_stacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_stacklet.py
@@ -0,0 +1,635 @@
+import os
+from pypy.module._continuation.test.support import BaseAppTest
+
+
+class AppTestStacklet(BaseAppTest):
+    def setup_class(cls):
+        BaseAppTest.setup_class.im_func(cls)
+        cls.w_translated = cls.space.wrap(
+            os.path.join(os.path.dirname(__file__),
+                         'test_translated.py'))
+
+    def test_new_empty(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c):
+            pass
+        #
+        c = continulet(empty_callback)
+        assert type(c) is continulet
+
+    def test_call_empty(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1):
+            assert c1 is c
+            seen.append(1)
+            return 42
+        #
+        seen = []
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+        assert seen == [1]
+
+    def test_no_double_init(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            pass
+        #
+        c = continulet(empty_callback)
+        raises(error, c.__init__, empty_callback)
+
+    def test_no_init_after_started(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            raises(error, c1.__init__, empty_callback)
+            return 42
+        #
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+
+    def test_no_init_after_finished(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            return 42
+        #
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+        raises(error, c.__init__, empty_callback)
+
+    def test_propagate_exception(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1):
+            assert c1 is c
+            seen.append(42)
+            raise ValueError
+        #
+        seen = []
+        c = continulet(empty_callback)
+        raises(ValueError, c.switch)
+        assert seen == [42]
+
+    def test_callback_with_arguments(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1, *args, **kwds):
+            seen.append(c1)
+            seen.append(args)
+            seen.append(kwds)
+            return 42
+        #
+        seen = []
+        c = continulet(empty_callback, 42, 43, foo=44, bar=45)
+        res = c.switch()
+        assert res == 42
+        assert seen == [c, (42, 43), {'foo': 44, 'bar': 45}]
+
+    def test_switch(self):
+        from _continuation import continulet
+        #
+        def switchbackonce_callback(c):
+            seen.append(1)
+            res = c.switch('a')
+            assert res == 'b'
+            seen.append(3)
+            return 'c'
+        #
+        seen = []
+        c = continulet(switchbackonce_callback)
+        seen.append(0)
+        res = c.switch()
+        assert res == 'a'
+        seen.append(2)
+        res = c.switch('b')
+        assert res == 'c'
+        assert seen == [0, 1, 2, 3]
+
+    def test_initial_switch_must_give_None(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c):
+            return 'ok'
+        #
+        c = continulet(empty_callback)
+        res = c.switch(None)
+        assert res == 'ok'
+        #
+        c = continulet(empty_callback)
+        raises(TypeError, c.switch, 'foo')  # "can't send non-None value"
+
+    def test_continuation_error(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c):
+            return 42
+        #
+        c = continulet(empty_callback)
+        c.switch()
+        e = raises(error, c.switch)
+        assert str(e.value) == "continulet already finished"
+
+    def test_not_initialized_yet(self):
+        from _continuation import continulet, error
+        c = continulet.__new__(continulet)
+        e = raises(error, c.switch)
+        assert str(e.value) == "continulet not initialized yet"
+
+    def test_go_depth2(self):
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(3)
+            return 4
+        #
+        def depth1(c):
+            seen.append(1)
+            c2 = continulet(depth2)
+            seen.append(2)
+            res = c2.switch()
+            seen.append(res)
+            return 5
+        #
+        seen = []
+        c = continulet(depth1)
+        seen.append(0)
+        res = c.switch()
+        seen.append(res)
+        assert seen == [0, 1, 2, 3, 4, 5]
+
+    def test_exception_depth2(self):
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(2)
+            raise ValueError
+        #
+        def depth1(c):
+            seen.append(1)
+            try:
+                continulet(depth2).switch()
+            except ValueError:
+                seen.append(3)
+            return 4
+        #
+        seen = []
+        c = continulet(depth1)
+        res = c.switch()
+        seen.append(res)
+        assert seen == [1, 2, 3, 4]
+
+    def test_exception_with_switch(self):
+        from _continuation import continulet
+        #
+        def depth1(c):
+            seen.append(1)
+            c.switch()
+            seen.append(3)
+            raise ValueError
+        #
+        seen = []
+        c = continulet(depth1)
+        seen.append(0)
+        c.switch()
+        seen.append(2)
+        raises(ValueError, c.switch)
+        assert seen == [0, 1, 2, 3]
+
+    def test_is_pending(self):
+        from _continuation import continulet
+        #
+        def switchbackonce_callback(c):
+            assert c.is_pending()
+            res = c.switch('a')
+            assert res == 'b'
+            assert c.is_pending()
+            return 'c'
+        #
+        c = continulet.__new__(continulet)
+        assert not c.is_pending()
+        c.__init__(switchbackonce_callback)
+        assert c.is_pending()
+        res = c.switch()
+        assert res == 'a'
+        assert c.is_pending()
+        res = c.switch('b')
+        assert res == 'c'
+        assert not c.is_pending()
+
+    def test_switch_alternate(self):
+        from _continuation import continulet
+        #
+        def func_lower(c):
+            res = c.switch('a')
+            assert res == 'b'
+            res = c.switch('c')
+            assert res == 'd'
+            return 'e'
+        #
+        def func_upper(c):
+            res = c.switch('A')
+            assert res == 'B'
+            res = c.switch('C')
+            assert res == 'D'
+            return 'E'
+        #
+        c_lower = continulet(func_lower)
+        c_upper = continulet(func_upper)
+        res = c_lower.switch()
+        assert res == 'a'
+        res = c_upper.switch()
+        assert res == 'A'
+        res = c_lower.switch('b')
+        assert res == 'c'
+        res = c_upper.switch('B')
+        assert res == 'C'
+        res = c_lower.switch('d')
+        assert res == 'e'
+        res = c_upper.switch('D')
+        assert res == 'E'
+
+    def test_exception_with_switch_depth2(self):
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(4)
+            c.switch()
+            seen.append(6)
+            raise ValueError
+        #
+        def depth1(c):
+            seen.append(1)
+            c.switch()
+            seen.append(3)
+            c2 = continulet(depth2)
+            c2.switch()
+            seen.append(5)
+            raises(ValueError, c2.switch)
+            assert not c2.is_pending()
+            seen.append(7)
+            assert c.is_pending()
+            raise KeyError
+        #
+        seen = []
+        c = continulet(depth1)
+        c.switch()
+        seen.append(2)
+        raises(KeyError, c.switch)
+        assert not c.is_pending()
+        assert seen == [1, 2, 3, 4, 5, 6, 7]
+
+    def test_random_switching(self):
+        from _continuation import continulet
+        #
+        def t1(c1):
+            return c1.switch()
+        def s1(c1, n):
+            assert n == 123
+            c2 = t1(c1)
+            return c1.switch('a') + 1
+        #
+        def s2(c2, c1):
+            res = c1.switch(c2)
+            assert res == 'a'
+            return c2.switch('b') + 2
+        #
+        def f():
+            c1 = continulet(s1, 123)
+            c2 = continulet(s2, c1)
+            c1.switch()
+            res = c2.switch()
+            assert res == 'b'
+            res = c1.switch(1000)
+            assert res == 1001
+            return c2.switch(2000)
+        #
+        res = f()
+        assert res == 2002
+
+    def test_f_back_is_None_for_now(self):
+        import sys
+        from _continuation import continulet
+        #
+        def g(c):
+            c.switch(sys._getframe(0))
+            c.switch(sys._getframe(0).f_back)
+            c.switch(sys._getframe(1))
+            c.switch(sys._getframe(1).f_back)
+            c.switch(sys._getframe(2))
+        def f(c):
+            g(c)
+        #
+        c = continulet(f)
+        f1 = c.switch()
+        assert f1.f_code.co_name == 'g'
+        f2 = c.switch()
+        assert f2.f_code.co_name == 'f'
+        f3 = c.switch()
+        assert f3.f_code.co_name == 'f'
+        f4 = c.switch()
+        assert f4 is None
+        raises(ValueError, c.switch)    # "call stack is not deep enough"
+
+    def test_traceback_is_complete(self):
+        import sys
+        from _continuation import continulet
+        #
+        def g():
+            raise KeyError
+        def f(c):
+            g()
+        #
+        def do(c):
+            c.switch()
+        #
+        c = continulet(f)
+        try:
+            do(c)
+        except KeyError:
+            tb = sys.exc_info()[2]
+        else:
+            raise AssertionError("should have raised!")
+        #
+        assert tb.tb_next.tb_frame.f_code.co_name == 'do'
+        assert tb.tb_next.tb_next.tb_frame.f_code.co_name == 'f'
+        assert tb.tb_next.tb_next.tb_next.tb_frame.f_code.co_name == 'g'
+        assert tb.tb_next.tb_next.tb_next.tb_next is None
+
+    def test_switch2_simple(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('started 1')
+            assert res == 'a'
+            res = c1.switch('b', to=c2)
+            assert res == 'c'
+            return 42
+        def f2(c2):
+            res = c2.switch('started 2')
+            assert res == 'b'
+            res = c2.switch('c', to=c1)
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 'started 1'
+        res = c2.switch()
+        assert res == 'started 2'
+        res = c1.switch('a')
+        assert res == 42
+
+    def test_switch2_pingpong(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('started 1')
+            assert res == 'go'
+            for i in range(10):
+                res = c1.switch(i, to=c2)
+                assert res == 100 + i
+            return 42
+        def f2(c2):
+            res = c2.switch('started 2')
+            for i in range(10):
+                assert res == i
+                res = c2.switch(100 + i, to=c1)
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 'started 1'
+        res = c2.switch()
+        assert res == 'started 2'
+        res = c1.switch('go')
+        assert res == 42
+
+    def test_switch2_more_complex(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch(to=c2)
+            assert res == 'a'
+            res = c1.switch('b', to=c2)
+            assert res == 'c'
+            return 41
+        def f2(c2):
+            res = c2.switch('a', to=c1)
+            assert res == 'b'
+            return 42
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 42
+        assert not c2.is_pending()    # finished by returning 42
+        res = c1.switch('c')
+        assert res == 41
+
+    def test_switch2_no_op(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('a', to=c1)
+            assert res == 'a'
+            return 42
+        #
+        c1 = continulet(f1)
+        res = c1.switch()
+        assert res == 42
+
+    def test_switch2_immediately_away(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            print 'in f1'
+            return 'm'
+        #
+        def f2(c2):
+            res = c2.switch('z')
+            print 'got there!'
+            assert res == 'a'
+            return None
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == 'z'
+        assert c1.is_pending()
+        assert c2.is_pending()
+        print 'calling!'
+        res = c1.switch('a', to=c2)
+        print 'back'
+        assert res == 'm'
+
+    def test_switch2_immediately_away_corner_case(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            this_is_never_seen
+        #
+        def f2(c2):
+            res = c2.switch('z')
+            assert res is None
+            return 'b'    # this goes back into the caller, which is f1,
+                          # but f1 didn't start yet, so a non-None value
+                          # has nowhere to go...
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == 'z'
+        raises(TypeError, c1.switch, to=c2)  # "can't send non-None value"
+
+    def test_switch2_not_initialized_yet(self):
+        from _continuation import continulet, error
+        #
+        def f1(c1):
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet.__new__(continulet)
+        e = raises(error, c1.switch, to=c2)
+        assert str(e.value) == "continulet not initialized yet"
+
+    def test_switch2_already_finished(self):
+        from _continuation import continulet, error
+        #
+        def f1(c1):
+            not_reachable
+        def empty_callback(c):
+            return 42
+        #
+        c1 = continulet(f1)
+        c2 = continulet(empty_callback)
+        c2.switch()
+        e = raises(error, c1.switch, to=c2)
+        assert str(e.value) == "continulet already finished"
+
+    def test_throw(self):
+        import sys
+        from _continuation import continulet
+        #
+        def f1(c1):
+            try:
+                c1.switch()
+            except KeyError:
+                res = "got keyerror"
+            try:
+                c1.switch(res)
+            except IndexError, e:
+                pass
+            try:
+                c1.switch(e)
+            except IndexError, e2:
+                pass
+            try:
+                c1.switch(e2)
+            except IndexError:
+                c1.throw(*sys.exc_info())
+            should_never_reach_here
+        #
+        c1 = continulet(f1)
+        c1.switch()
+        res = c1.throw(KeyError)
+        assert res == "got keyerror"
+        class FooError(IndexError):
+            pass
+        foo = FooError()
+        res = c1.throw(foo)
+        assert res is foo
+        res = c1.throw(IndexError, foo)
+        assert res is foo
+        #
+        def main():
+            def do_raise():
+                raise foo
+            try:
+                do_raise()
+            except IndexError:
+                tb = sys.exc_info()[2]
+            try:
+                c1.throw(IndexError, foo, tb)
+            except IndexError:
+                tb = sys.exc_info()[2]
+            return tb
+        #
+        tb = main()
+        assert tb.tb_frame.f_code.co_name == 'main'
+        assert tb.tb_next.tb_frame.f_code.co_name == 'f1'
+        assert tb.tb_next.tb_next.tb_frame.f_code.co_name == 'main'
+        assert tb.tb_next.tb_next.tb_next.tb_frame.f_code.co_name == 'do_raise'
+        assert tb.tb_next.tb_next.tb_next.tb_next is None
+
+    def test_throw_to_starting(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            not_reached
+        #
+        c1 = continulet(f1)
+        raises(IndexError, c1.throw, IndexError)
+
+    def test_throw2_simple(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            not_reached
+        def f2(c2):
+            try:
+                c2.switch("ready")
+            except IndexError:
+                raise ValueError
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == "ready"
+        assert c1.is_pending()
+        assert c2.is_pending()
+        raises(ValueError, c1.throw, IndexError, to=c2)
+        assert not c1.is_pending()
+        assert not c2.is_pending()
+
+    def test_throw2_no_op(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            raises(ValueError, c1.throw, ValueError, to=c1)
+            return "ok"
+        #
+        c1 = continulet(f1)
+        res = c1.switch()
+        assert res == "ok"
+
+    def test_permute(self):
+        from _continuation import continulet, permute
+        #
+        def f1(c1):
+            res = c1.switch()
+            assert res == "ok"
+            return "done"
+        #
+        def f2(c2):
+            permute(c1, c2)
+            return "ok"
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        c1.switch()
+        res = c2.switch()
+        assert res == "done"
+
+    def test_various_depths(self):
+        skip("may fail on top of CPython")
+        # run it from test_translated, but not while being actually translated
+        d = {}
+        execfile(self.translated, d)
+        d['set_fast_mode']()
+        d['test_various_depths']()
diff --git a/pypy/module/_continuation/test/test_translated.py b/pypy/module/_continuation/test/test_translated.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_translated.py
@@ -0,0 +1,132 @@
+import py
+try:
+    import _continuation
+except ImportError:
+    py.test.skip("to run on top of a translated pypy-c")
+
+import sys, random
+
+# ____________________________________________________________
+
+STATUS_MAX = 50000
+CONTINULETS = 50
+
+def set_fast_mode():
+    global STATUS_MAX, CONTINULETS
+    STATUS_MAX = 100
+    CONTINULETS = 5
+
+# ____________________________________________________________
+
+class Done(Exception):
+    pass
+
+
+class Runner(object):
+
+    def __init__(self):
+        self.foobar = 12345
+        self.conts = {}     # {continulet: parent-or-None}
+        self.contlist = []
+
+    def run_test(self):
+        self.start_continulets()
+        self.n = 0
+        try:
+            while True:
+                self.do_switch(src=None)
+                assert self.target is None
+        except Done:
+            self.check_traceback(sys.exc_info()[2])
+
+    def do_switch(self, src):
+        assert src not in self.conts.values()
+        c = random.choice(self.contlist)
+        self.target = self.conts[c]
+        self.conts[c] = src
+        c.switch()
+        assert self.target is src
+
+    def run_continulet(self, c, i):
+        while True:
+            assert self.target is c
+            assert self.contlist[i] is c
+            self.do_switch(c)
+            assert self.foobar == 12345
+            self.n += 1
+            if self.n >= STATUS_MAX:
+                raise Done
+
+    def start_continulets(self, i=0):
+        c = _continuation.continulet(self.run_continulet, i)
+        self.contlist.append(c)
+        if i < CONTINULETS:
+            self.start_continulets(i + 1)
+            # ^^^ start each continulet with a different base stack
+        self.conts[c] = c   # initially (i.e. not started) they are all loops
+
+    def check_traceback(self, tb):
+        found = []
+        tb = tb.tb_next
+        while tb:
+            if tb.tb_frame.f_code.co_name != 'do_switch':
+                assert tb.tb_frame.f_code.co_name == 'run_continulet', (
+                    "got %r" % (tb.tb_frame.f_code.co_name,))
+                found.append(tb.tb_frame.f_locals['c'])
+            tb = tb.tb_next
+        found.reverse()
+        #
+        expected = []
+        c = self.target
+        while c is not None:
+            expected.append(c)
+            c = self.conts[c]
+        #
+        assert found == expected, "%r == %r" % (found, expected)
+
+# ____________________________________________________________
+
+class AppTestWrapper:
+    def setup_class(cls):
+        "Run test_various_depths() when we are run with 'pypy py.test -A'."
+        from pypy.conftest import option
+        if not option.runappdirect:
+            py.test.skip("meant only for -A run")
+
+    def test_single_threaded(self):
+        for i in range(20):
+            yield Runner().run_test,
+
+    def test_multi_threaded(self):
+        for i in range(5):
+            yield multithreaded_test,
+
+class ThreadTest(object):
+    def __init__(self, lock):
+        self.lock = lock
+        self.ok = False
+        lock.acquire()
+    def run(self):
+        try:
+            Runner().run_test()
+            self.ok = True
+        finally:
+            self.lock.release()
+
+def multithreaded_test():
+    try:
+        import thread
+    except ImportError:
+        py.test.skip("no threads")
+    ts = [ThreadTest(thread.allocate_lock()) for i in range(5)]
+    for t in ts:
+        thread.start_new_thread(t.run, ())
+    for t in ts:
+        t.lock.acquire()
+    for t in ts:
+        assert t.ok
+
+# ____________________________________________________________
+
+if __name__ == '__main__':
+    Runner().run_test()
diff --git a/pypy/module/bz2/interp_bz2.py b/pypy/module/bz2/interp_bz2.py
--- a/pypy/module/bz2/interp_bz2.py
+++ b/pypy/module/bz2/interp_bz2.py
@@ -351,6 +351,7 @@
         self.decompressor = W_BZ2Decompressor(space)
         self.readlength = r_longlong(0)
         self.buffer = ""
+        self.pos = 0
         self.finished = False
         if buffering < 1024:
             buffering = 1024   # minimum amount of compressed data read at once
@@ -385,6 +386,7 @@
             self.stream.seek(0, 0)
             self.decompressor = W_BZ2Decompressor(self.space)
             self.readlength = r_longlong(0)
+            self.pos = 0
             self.buffer = ""
             self.finished = False
         else:
@@ -410,15 +412,19 @@
                                  self.space.wrap("compressed file ended before the logical end-of-the-stream was detected"))
         result = self.space.str_w(w_result)
         self.readlength += len(result)
-        result = self.buffer + result
+        if len(self.buffer) != self.pos:
+            pos = self.pos
+            assert pos >= 0
+            result = self.buffer[pos:] + result
         self.buffer = ''
+        self.pos = 0
         return result
 
     def read(self, n):
         # XXX not nice
         if n <= 0:
             return ''
-        while not self.buffer:
+        while self.pos == len(self.buffer):
             if self.finished:
                 return ""
             moredata = self.stream.read(max(self.buffering, n))
@@ -433,17 +439,23 @@
                     return ""
                 raise
             self.buffer = self.space.str_w(w_read)
-        if len(self.buffer) >= n:
-            result = self.buffer[:n]
-            self.buffer = self.buffer[n:]
+            self.pos = 0
+        if len(self.buffer) - self.pos >= n:
+            pos = self.pos
+            assert pos >= 0
+            result = self.buffer[pos:pos + n]
+            self.pos += n
         else:
             result = self.buffer
+            self.pos = 0
             self.buffer = ""
         self.readlength += len(result)
         return result
 
     def peek(self):
-        return self.buffer
+        pos = self.pos
+        assert pos >= 0
+        return self.buffer[pos:]
 
     def try_to_find_file_descriptor(self):
         return self.stream.try_to_find_file_descriptor()
diff --git a/pypy/module/micronumpy/__init__.py b/pypy/module/micronumpy/__init__.py
--- a/pypy/module/micronumpy/__init__.py
+++ b/pypy/module/micronumpy/__init__.py
@@ -7,6 +7,8 @@
 
     interpleveldefs = {
         'array': 'interp_numarray.SingleDimArray',
+        'dtype': 'interp_dtype.W_Dtype',
+
         'zeros': 'interp_numarray.zeros',
         'empty': 'interp_numarray.zeros',
         'ones': 'interp_numarray.ones',
diff --git a/pypy/module/micronumpy/compile.py b/pypy/module/micronumpy/compile.py
--- a/pypy/module/micronumpy/compile.py
+++ b/pypy/module/micronumpy/compile.py
@@ -3,56 +3,103 @@
 It should not be imported by the module itself
 """
 
-from pypy.module.micronumpy.interp_numarray import FloatWrapper, SingleDimArray, BaseArray
+from pypy.interpreter.baseobjspace import InternalSpaceCache, W_Root
+from pypy.module.micronumpy.interp_dtype import W_Float64Dtype
+from pypy.module.micronumpy.interp_numarray import Scalar, SingleDimArray, BaseArray
+from pypy.rlib.objectmodel import specialize
+
 
 class BogusBytecode(Exception):
     pass
 
-def create_array(size):
-    a = SingleDimArray(size)
+def create_array(dtype, size):
+    a = SingleDimArray(size, dtype=dtype)
     for i in range(size):
-        a.storage[i] = float(i % 10)
+        dtype.setitem(a.storage, i, dtype.box(float(i % 10)))
     return a
 
-class TrivialSpace(object):
-    def wrap(self, x):
-        return x
+class FakeSpace(object):
+    w_ValueError = None
+
+    def __init__(self):
+        """NOT_RPYTHON"""
+        self.fromcache = InternalSpaceCache(self).getorbuild
 
     def issequence_w(self, w_obj):
-        # Completley wrong in the general case, but good enough for this.
-        return isinstance(w_obj, BaseArray)
+        return True
+
+    @specialize.argtype(1)
+    def wrap(self, obj):
+        if isinstance(obj, float):
+            return FloatObject(obj)
+        elif isinstance(obj, bool):
+            return BoolObject(obj)
+        elif isinstance(obj, int):
+            return IntObject(obj)
+        raise Exception
+
+    def float(self, w_obj):
+        assert isinstance(w_obj, FloatObject)
+        return w_obj
 
     def float_w(self, w_obj):
-        assert isinstance(w_obj, float)
-        return w_obj
+        return w_obj.floatval
+
+
+class FloatObject(W_Root):
+    def __init__(self, floatval):
+        self.floatval = floatval
+
+class BoolObject(W_Root):
+    def __init__(self, boolval):
+        self.boolval = boolval
+
+class IntObject(W_Root):
+    def __init__(self, intval):
+        self.intval = intval
+
+
+space = FakeSpace()
 
 def numpy_compile(bytecode, array_size):
-    space = TrivialSpace()
     stack = []
     i = 0
+    dtype = space.fromcache(W_Float64Dtype)
     for b in bytecode:
         if b == 'a':
-            stack.append(create_array(array_size))
+            stack.append(create_array(dtype, array_size))
             i += 1
         elif b == 'f':
-            stack.append(FloatWrapper(1.2))
+            stack.append(Scalar(dtype, dtype.box(1.2)))
         elif b == '+':
             right = stack.pop()
-            stack.append(stack.pop().descr_add(space, right))
+            res = stack.pop().descr_add(space, right)
+            assert isinstance(res, BaseArray)
+            stack.append(res)
         elif b == '-':
             right = stack.pop()
-            stack.append(stack.pop().descr_sub(space, right))
+            res = stack.pop().descr_sub(space, right)
+            assert isinstance(res, BaseArray)
+            stack.append(res)
         elif b == '*':
             right = stack.pop()
-            stack.append(stack.pop().descr_mul(space, right))
+            res = stack.pop().descr_mul(space, right)
+            assert isinstance(res, BaseArray)
+            stack.append(res)
         elif b == '/':
             right = stack.pop()
-            stack.append(stack.pop().descr_div(space, right))
+            res = stack.pop().descr_div(space, right)
+            assert isinstance(res, BaseArray)
+            stack.append(res)
         elif b == '%':
             right = stack.pop()
-            stack.append(stack.pop().descr_mod(space, right))
+            res = stack.pop().descr_mod(space, right)
+            assert isinstance(res, BaseArray)
+            stack.append(res)
         elif b == '|':
-            stack.append(stack.pop().descr_abs(space))
+            res = stack.pop().descr_abs(space)
+            assert isinstance(res, BaseArray)
+            stack.append(res)
         else:
             print "Unknown opcode: %s" % b
             raise BogusBytecode()
diff --git a/pypy/module/micronumpy/interp_dtype.py b/pypy/module/micronumpy/interp_dtype.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/micronumpy/interp_dtype.py
@@ -0,0 +1,356 @@
+import functools
+import math
+
+from pypy.interpreter.baseobjspace import Wrappable
+from pypy.interpreter.error import OperationError
+from pypy.interpreter.gateway import interp2app
+from pypy.interpreter.typedef import TypeDef, interp_attrproperty, GetSetProperty
+from pypy.module.micronumpy import signature
+from pypy.objspace.std.floatobject import float2string
+from pypy.rlib import rfloat
+from pypy.rlib.rarithmetic import widen
+from pypy.rlib.objectmodel import specialize, enforceargs
+from pypy.rlib.unroll import unrolling_iterable
+from pypy.rpython.lltypesystem import lltype, rffi
+
+
+SIGNEDLTR = "i"
+BOOLLTR = "b"
+FLOATINGLTR = "f"
+
+class W_Dtype(Wrappable):
+    def __init__(self, space):
+        pass
+
+    def descr__new__(space, w_subtype, w_dtype):
+        if space.is_w(w_dtype, space.w_None):
+            return space.fromcache(W_Float64Dtype)
+        elif space.isinstance_w(w_dtype, space.w_str):
+            dtype = space.str_w(w_dtype)
+            for alias, dtype_class in dtypes_by_alias:
+                if alias == dtype:
+                    return space.fromcache(dtype_class)
+        elif isinstance(space.interpclass_w(w_dtype), W_Dtype):
+            return w_dtype
+        elif space.isinstance_w(w_dtype, space.w_type):
+            for typename, dtype_class in dtypes_by_apptype:
+                if space.is_w(getattr(space, "w_%s" % typename), w_dtype):
+                    return space.fromcache(dtype_class)
+        raise OperationError(space.w_TypeError, space.wrap("data type not understood"))
+
+    def descr_repr(self, space):
+        return space.wrap("dtype('%s')" % self.name)
+
+    def descr_str(self, space):
+        return space.wrap(self.name)
+
+    def descr_get_shape(self, space):
+        return space.newtuple([])
+
+
+class BaseBox(object):
+    pass
+
+VOID_TP = lltype.Ptr(lltype.Array(lltype.Void, hints={'nolength': True, "uncast_on_llgraph": True}))
+
+def create_low_level_dtype(num, kind, name, aliases, applevel_types, T, valtype):
+    class Box(BaseBox):
+        def __init__(self, val):
+            self.val = val
+
+        def wrap(self, space):
+            return space.wrap(self.val)
+
+        def convert_to(self, dtype):
+            return dtype.adapt_val(self.val)
+    Box.__name__ = "%sBox" % T._name
+
+    TP = lltype.Ptr(lltype.Array(T, hints={'nolength': True}))
+    class W_LowLevelDtype(W_Dtype):
+        signature = signature.BaseSignature()
+
+        def erase(self, storage):
+            return rffi.cast(VOID_TP, storage)
+
+        def unerase(self, storage):
+            return rffi.cast(TP, storage)
+
+        @enforceargs(None, valtype)
+        def box(self, value):
+            return Box(value)
+
+        def unbox(self, box):
+            assert isinstance(box, Box)
+            return box.val
+
+        def unwrap(self, space, w_item):
+            raise NotImplementedError
+
+        def malloc(self, size):
+            # XXX find out why test_zjit explodes with tracking of allocations
+            return self.erase(lltype.malloc(TP.TO, size,
+                zero=True, flavor="raw",
+                track_allocation=False, add_memory_pressure=True
+            ))
+
+        def getitem(self, storage, i):
+            return Box(self.unerase(storage)[i])
+
+        def setitem(self, storage, i, item):
+            self.unerase(storage)[i] = self.unbox(item)
+
+        def setitem_w(self, space, storage, i, w_item):
+            self.setitem(storage, i, self.unwrap(space, w_item))
+
+        @specialize.argtype(1)
+        def adapt_val(self, val):
+            return self.box(rffi.cast(TP.TO.OF, val))
+
+    W_LowLevelDtype.__name__ = "W_%sDtype" % name.capitalize()
+    W_LowLevelDtype.num = num
+    W_LowLevelDtype.kind = kind
+    W_LowLevelDtype.name = name
+    W_LowLevelDtype.aliases = aliases
+    W_LowLevelDtype.applevel_types = applevel_types
+    W_LowLevelDtype.num_bytes = rffi.sizeof(T)
+    return W_LowLevelDtype
+
+
+def binop(func):
+    @functools.wraps(func)
+    def impl(self, v1, v2):
+        return self.adapt_val(func(self,
+            self.for_computation(self.unbox(v1)),
+            self.for_computation(self.unbox(v2)),
+        ))
+    return impl
+
+def unaryop(func):
+    @functools.wraps(func)
+    def impl(self, v):
+        return self.adapt_val(func(self, self.for_computation(self.unbox(v))))
+    return impl
+
+class ArithmaticTypeMixin(object):
+    _mixin_ = True
+
+    @binop
+    def add(self, v1, v2):
+        return v1 + v2
+    @binop
+    def sub(self, v1, v2):
+        return v1 - v2
+    @binop
+    def mul(self, v1, v2):
+        return v1 * v2
+    @binop
+    def div(self, v1, v2):
+        return v1 / v2
+
+    @unaryop
+    def pos(self, v):
+        return +v
+    @unaryop
+    def neg(self, v):
+        return -v
+    @unaryop
+    def abs(self, v):
+        return abs(v)
+
+    @binop
+    def max(self, v1, v2):
+        return max(v1, v2)
+    @binop
+    def min(self, v1, v2):
+        return min(v1, v2)
+
+    def bool(self, v):
+        return bool(self.for_computation(self.unbox(v)))
+    def ne(self, v1, v2):
+        return self.for_computation(self.unbox(v1)) != self.for_computation(self.unbox(v2))
+
+
+class FloatArithmeticDtype(ArithmaticTypeMixin):
+    _mixin_ = True
+
+    def for_computation(self, v):
+        return v
+
+    @binop
+    def mod(self, v1, v2):
+        return math.fmod(v1, v2)
+    @binop
+    def pow(self, v1, v2):
+        return math.pow(v1, v2)
+
+    @unaryop
+    def sign(self, v):
+        if v == 0.0:
+            return 0.0
+        return rfloat.copysign(1.0, v)
+    @unaryop
+    def reciprocal(self, v):
+        if v == 0.0:
+            return rfloat.copysign(rfloat.INFINITY, v)
+        return 1.0 / v
+    @unaryop
+    def fabs(self, v):
+        return math.fabs(v)
+    @unaryop
+    def floor(self, v):
+        return math.floor(v)
+
+    @binop
+    def copysign(self, v1, v2):
+        return math.copysign(v1, v2)
+    @unaryop
+    def exp(self, v):
+        try:
+            return math.exp(v)
+        except OverflowError:
+            return rfloat.INFINITY
+    @unaryop
+    def sin(self, v):
+        return math.sin(v)
+    @unaryop
+    def cos(self, v):
+        return math.cos(v)
+    @unaryop
+    def tan(self, v):
+        return math.tan(v)
+    @unaryop
+    def arcsin(self, v):
+        if v < -1.0 or  v > 1.0:
+            return rfloat.NAN
+        return math.asin(v)
+    @unaryop
+    def arccos(self, v):
+        if v < -1.0 or v > 1.0:
+            return rfloat.NAN
+        return math.acos(v)
+    @unaryop
+    def arctan(self, v):
+        return math.atan(v)
+
+class IntegerArithmeticDtype(ArithmaticTypeMixin):
+    _mixin_ = True
+
+    def unwrap(self, space, w_item):
+        return self.adapt_val(space.int_w(space.int(w_item)))
+
+    def for_computation(self, v):
+        return widen(v)
+
+    @binop
+    def mod(self, v1, v2):
+        return v1 % v2
+
+    @unaryop
+    def sign(self, v):
+        if v > 0:
+            return 1
+        elif v < 0:
+            return -1
+        else:
+            assert v == 0
+            return 0
+
+    def str_format(self, item):
+        return str(widen(self.unbox(item)))
+
+W_BoolDtype = create_low_level_dtype(
+    num = 0, kind = BOOLLTR, name = "bool",
+    aliases = ["?"],
+    applevel_types = ["bool"],
+    T = lltype.Bool,
+    valtype = bool,
+)
+class W_BoolDtype(IntegerArithmeticDtype, W_BoolDtype):
+    def unwrap(self, space, w_item):
+        return self.adapt_val(space.is_true(w_item))
+
+    def str_format(self, item):
+        v = self.unbox(item)
+        return "True" if v else "False"
+
+    def for_computation(self, v):
+        return int(v)
+
+W_Int8Dtype = create_low_level_dtype(
+    num = 1, kind = SIGNEDLTR, name = "int8",
+    aliases = ["int8"],
+    applevel_types = [],
+    T = rffi.SIGNEDCHAR,
+    valtype = rffi.SIGNEDCHAR._type,
+)
+class W_Int8Dtype(IntegerArithmeticDtype, W_Int8Dtype):
+    def unwrap(self, space, w_item):
+        return self.adapt_val(space.int_w(space.int(w_item)))
+
+W_Int32Dtype = create_low_level_dtype(
+    num = 5, kind = SIGNEDLTR, name = "int32",
+    aliases = ["i"],
+    applevel_types = [],
+    T = rffi.INT,
+    valtype = rffi.INT._type,
+)
+class W_Int32Dtype(IntegerArithmeticDtype, W_Int32Dtype):
+    pass
+
+W_Int64Dtype = create_low_level_dtype(
+    num = 9, kind = SIGNEDLTR, name = "int64",
+    aliases = [],
+    applevel_types = ["long"],
+    T = rffi.LONGLONG,
+    valtype = rffi.LONGLONG._type,
+)
+class W_Int64Dtype(IntegerArithmeticDtype, W_Int64Dtype):
+    pass
+
+W_Float64Dtype = create_low_level_dtype(
+    num = 12, kind = FLOATINGLTR, name = "float64",
+    aliases = [],
+    applevel_types = ["float"],
+    T = lltype.Float,
+    valtype = float,
+)
+class W_Float64Dtype(FloatArithmeticDtype, W_Float64Dtype):
+    def unwrap(self, space, w_item):
+        return self.adapt_val(space.float_w(space.float(w_item)))
+
+    def str_format(self, item):
+        return float2string(self.unbox(item), 'g', rfloat.DTSF_STR_PRECISION)
+
+ALL_DTYPES = [
+    W_BoolDtype,
+    W_Int8Dtype, W_Int32Dtype, W_Int64Dtype,
+    W_Float64Dtype
+]
+
+dtypes_by_alias = unrolling_iterable([
+    (alias, dtype)
+    for dtype in ALL_DTYPES
+    for alias in dtype.aliases
+])
+dtypes_by_apptype = unrolling_iterable([
+    (apptype, dtype)
+    for dtype in ALL_DTYPES
+    for apptype in dtype.applevel_types
+])
+dtypes_by_num_bytes = unrolling_iterable(sorted([
+    (dtype.num_bytes, dtype)
+    for dtype in ALL_DTYPES
+]))
+
+W_Dtype.typedef = TypeDef("dtype",
+    __module__ = "numpy",
+    __new__ = interp2app(W_Dtype.descr__new__.im_func),
+
+    __repr__ = interp2app(W_Dtype.descr_repr),
+    __str__ = interp2app(W_Dtype.descr_str),
+
+    num = interp_attrproperty("num", cls=W_Dtype),
+    kind = interp_attrproperty("kind", cls=W_Dtype),
+    shape = GetSetProperty(W_Dtype.descr_get_shape),
+)
+W_Dtype.typedef.acceptable_as_base_class = False
\ No newline at end of file
diff --git a/pypy/module/micronumpy/interp_numarray.py b/pypy/module/micronumpy/interp_numarray.py
--- a/pypy/module/micronumpy/interp_numarray.py
+++ b/pypy/module/micronumpy/interp_numarray.py
@@ -1,38 +1,22 @@
 from pypy.interpreter.baseobjspace import Wrappable
-from pypy.interpreter.error import OperationError, operationerrfmt
+from pypy.interpreter.error import OperationError
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.typedef import TypeDef, GetSetProperty
-from pypy.module.micronumpy.interp_support import Signature
-from pypy.module.micronumpy import interp_ufuncs
-from pypy.objspace.std.floatobject import float2string as float2string_orig
+from pypy.module.micronumpy import interp_ufuncs, interp_dtype, signature
 from pypy.rlib import jit
-from pypy.rlib.rfloat import DTSF_STR_PRECISION
 from pypy.rpython.lltypesystem import lltype
 from pypy.tool.sourcetools import func_with_new_name
-import math
 
-TP = lltype.Array(lltype.Float, hints={'nolength': True})
 
 numpy_driver = jit.JitDriver(greens = ['signature'],
                              reds = ['result_size', 'i', 'self', 'result'])
-all_driver = jit.JitDriver(greens=['signature'], reds=['i', 'size', 'self'])
-any_driver = jit.JitDriver(greens=['signature'], reds=['i', 'size', 'self'])
-slice_driver1 = jit.JitDriver(greens=['signature'], reds=['i', 'j', 'step', 'stop', 'source', 'dest'])
-slice_driver2 = jit.JitDriver(greens=['signature'], reds=['i', 'j', 'step', 'stop', 'source', 'dest'])
-
-def add(v1, v2):
-    return v1 + v2
-def mul(v1, v2):
-    return v1 * v2
-def maximum(v1, v2):
-    return max(v1, v2)
-def minimum(v1, v2):
-    return min(v1, v2)
-
-def float2string(x):
-    return float2string_orig(x, 'g', DTSF_STR_PRECISION)
+all_driver = jit.JitDriver(greens=['signature'], reds=['i', 'size', 'self', 'dtype'])
+any_driver = jit.JitDriver(greens=['signature'], reds=['i', 'size', 'self', 'dtype'])
+slice_driver = jit.JitDriver(greens=['signature'], reds=['i', 'j', 'step', 'stop', 'source', 'dest'])
 
 class BaseArray(Wrappable):
+    _attrs_ = ["invalidates", "signature"]
+
     def __init__(self):
         self.invalidates = []
 
@@ -45,6 +29,30 @@
             arr.force_if_needed()
         del self.invalidates[:]
 
+    def add_invalidates(self, other):
+        self.invalidates.append(other)
+
+    def descr__new__(space, w_subtype, w_size_or_iterable, w_dtype=None):
+        l = space.listview(w_size_or_iterable)
+        if space.is_w(w_dtype, space.w_None):
+            w_dtype = None
+            for w_item in l:
+                w_dtype = interp_ufuncs.find_dtype_for_scalar(space, w_item, w_dtype)
+                if w_dtype is space.fromcache(interp_dtype.W_Float64Dtype):
+                    break
+            if w_dtype is None:
+                w_dtype = space.w_None
+
+        dtype = space.interp_w(interp_dtype.W_Dtype,
+            space.call_function(space.gettypefor(interp_dtype.W_Dtype), w_dtype)
+        )
+        arr = SingleDimArray(len(l), dtype=dtype)
+        i = 0
+        for w_elem in l:
+            dtype.setitem_w(space, arr.storage, i, w_elem)
+            i += 1
+        return arr
+
     def _unaryop_impl(w_ufunc):
         def impl(self, space):
             return w_ufunc(space, self)
@@ -68,7 +76,10 @@
 
     def _binop_right_impl(w_ufunc):
         def impl(self, space, w_other):
-            w_other = FloatWrapper(space.float_w(w_other))
+            w_other = scalar_w(space,
+                interp_ufuncs.find_dtype_for_scalar(space, w_other, self.find_dtype()),
+                w_other
+            )
             return w_ufunc(space, w_other, self)
         return func_with_new_name(impl, "binop_right_%s_impl" % w_ufunc.__name__)
 
@@ -79,34 +90,42 @@
     descr_rpow = _binop_right_impl(interp_ufuncs.power)
     descr_rmod = _binop_right_impl(interp_ufuncs.mod)
 
-    def _reduce_sum_prod_impl(function, init):
+    def _reduce_sum_prod_impl(op_name, init):
         reduce_driver = jit.JitDriver(greens=['signature'],
-                         reds = ['i', 'size', 'self', 'result'])
+                         reds = ['i', 'size', 'self', 'result', 'res_dtype'])
 
-        def loop(self, result, size):
+        def loop(self, res_dtype, result, size):
             i = 0
             while i < size:
                 reduce_driver.jit_merge_point(signature=self.signature,
-                                              self=self, size=size, i=i,
-                                              result=result)
-                result = function(result, self.eval(i))
+                                              self=self, res_dtype=res_dtype,
+                                              size=size, i=i, result=result)
+                result = getattr(res_dtype, op_name)(
+                    result,
+                    self.eval(i).convert_to(res_dtype)
+                )
                 i += 1
             return result
 
         def impl(self, space):
-            return space.wrap(loop(self, init, self.find_size()))
-        return func_with_new_name(impl, "reduce_%s_impl" % function.__name__)
+            dtype = interp_ufuncs.find_unaryop_result_dtype(
+                space, self.find_dtype(), promote_to_largest=True
+            )
+            result = dtype.adapt_val(init)
+            return loop(self, dtype, result, self.find_size()).wrap(space)
+        return func_with_new_name(impl, "reduce_%s_impl" % op_name)
 
-    def _reduce_max_min_impl(function):
+    def _reduce_max_min_impl(op_name):
         reduce_driver = jit.JitDriver(greens=['signature'],
-                         reds = ['i', 'size', 'self', 'result'])
+                         reds = ['i', 'size', 'self', 'result', 'dtype'])
         def loop(self, result, size):
             i = 1
+            dtype = self.find_dtype()
             while i < size:
                 reduce_driver.jit_merge_point(signature=self.signature,
-                                              self=self, size=size, i=i,
-                                              result=result)
-                result = function(result, self.eval(i))
+                                              self=self, dtype=dtype,
+                                              size=size, i=i, result=result)
+                result = getattr(dtype, op_name)(result, self.eval(i))
                 i += 1
             return result
 
@@ -115,23 +134,25 @@
             if size == 0:
                 raise OperationError(space.w_ValueError,
                     space.wrap("Can't call %s on zero-size arrays" \
-                            % function.__name__))
-            return space.wrap(loop(self, self.eval(0), size))
-        return func_with_new_name(impl, "reduce_%s_impl" % function.__name__)
+                            % op_name))
+            return loop(self, self.eval(0), size).wrap(space)
+        return func_with_new_name(impl, "reduce_%s_impl" % op_name)
 
-    def _reduce_argmax_argmin_impl(function):
+    def _reduce_argmax_argmin_impl(op_name):
         reduce_driver = jit.JitDriver(greens=['signature'],
-                         reds = ['i', 'size', 'result', 'self', 'cur_best'])
+                         reds = ['i', 'size', 'result', 'self', 'cur_best', 'dtype'])
         def loop(self, size):
             result = 0
             cur_best = self.eval(0)
             i = 1
+            dtype = self.find_dtype()
             while i < size:
                 reduce_driver.jit_merge_point(signature=self.signature,
-                                              self=self, size=size, i=i,
-                                              result=result, cur_best=cur_best)
-                new_best = function(cur_best, self.eval(i))
-                if new_best != cur_best:
+                                              self=self, dtype=dtype,
+                                              size=size, i=i, result=result,
+                                              cur_best=cur_best)
+                new_best = getattr(dtype, op_name)(cur_best, self.eval(i))
+                if dtype.ne(new_best, cur_best):
                     result = i
                     cur_best = new_best
                 i += 1
@@ -141,16 +162,17 @@
             if size == 0:
                 raise OperationError(space.w_ValueError,
                     space.wrap("Can't call %s on zero-size arrays" \
-                            % function.__name__))
+                            % op_name))
             return space.wrap(loop(self, size))
-        return func_with_new_name(impl, "reduce_arg%s_impl" % function.__name__)
+        return func_with_new_name(impl, "reduce_arg%s_impl" % op_name)
 
     def _all(self):
         size = self.find_size()
+        dtype = self.find_dtype()
         i = 0
         while i < size:
-            all_driver.jit_merge_point(signature=self.signature, self=self, size=size, i=i)
-            if not self.eval(i):
+            all_driver.jit_merge_point(signature=self.signature, self=self, dtype=dtype, size=size, i=i)
+            if not dtype.bool(self.eval(i)):
                 return False
             i += 1
         return True
@@ -159,45 +181,48 @@
 
     def _any(self):
         size = self.find_size()
+        dtype = self.find_dtype()
         i = 0
         while i < size:
-            any_driver.jit_merge_point(signature=self.signature, self=self, size=size, i=i)
-            if self.eval(i):
+            any_driver.jit_merge_point(signature=self.signature, self=self, size=size, dtype=dtype, i=i)
+            if dtype.bool(self.eval(i)):
                 return True
             i += 1
         return False
     def descr_any(self, space):
         return space.wrap(self._any())
 
-    descr_sum = _reduce_sum_prod_impl(add, 0.0)
-    descr_prod = _reduce_sum_prod_impl(mul, 1.0)
-    descr_max = _reduce_max_min_impl(maximum)
-    descr_min = _reduce_max_min_impl(minimum)
-    descr_argmax = _reduce_argmax_argmin_impl(maximum)
-    descr_argmin = _reduce_argmax_argmin_impl(minimum)
+    descr_sum = _reduce_sum_prod_impl("add", 0)
+    descr_prod = _reduce_sum_prod_impl("mul", 1)
+    descr_max = _reduce_max_min_impl("max")
+    descr_min = _reduce_max_min_impl("min")
+    descr_argmax = _reduce_argmax_argmin_impl("max")
+    descr_argmin = _reduce_argmax_argmin_impl("min")
 
     def descr_dot(self, space, w_other):
-        if isinstance(w_other, BaseArray):
+        w_other = convert_to_array(space, w_other)
+        if isinstance(w_other, Scalar):
+            return self.descr_mul(space, w_other)
+        else:
             w_res = self.descr_mul(space, w_other)
             assert isinstance(w_res, BaseArray)
             return w_res.descr_sum(space)
-        else:
-            return self.descr_mul(space, w_other)
 
     def _getnums(self, comma):
+        dtype = self.find_dtype()
         if self.find_size() > 1000:
             nums = [
-                float2string(self.eval(index))
+                dtype.str_format(self.eval(index))
                 for index in range(3)
             ]
             nums.append("..." + "," * comma)
             nums.extend([
-                float2string(self.eval(index))
+                dtype.str_format(self.eval(index))
                 for index in range(self.find_size() - 3, self.find_size())
             ])
         else:
             nums = [
-                float2string(self.eval(index))
+                dtype.str_format(self.eval(index))
                 for index in range(self.find_size())
             ]
         return nums
@@ -205,19 +230,28 @@
     def get_concrete(self):
         raise NotImplementedError
 
-    def descr_copy(self, space):
-        return new_numarray(space, self)
+    def descr_get_dtype(self, space):
+        return space.wrap(self.find_dtype())
 
     def descr_get_shape(self, space):
         return space.newtuple([self.descr_len(space)])
 
+    def descr_copy(self, space):
+        return space.call_function(space.gettypefor(BaseArray), self, self.find_dtype())
+
     def descr_len(self, space):
         return self.get_concrete().descr_len(space)
 
     def descr_repr(self, space):
         # Simple implementation so that we can see the array. Needs work.
         concrete = self.get_concrete()
-        return space.wrap("array([" + ", ".join(concrete._getnums(False)) + "])")
+        res = "array([" + ", ".join(concrete._getnums(False)) + "]"
+        dtype = concrete.find_dtype()
+        if (dtype is not space.fromcache(interp_dtype.W_Float64Dtype) and
+            dtype is not space.fromcache(interp_dtype.W_Int64Dtype)):
+            res += ", dtype=" + dtype.name
+        res += ")"
+        return space.wrap(res)
 
     def descr_str(self, space):
         # Simple implementation so that we can see the array. Needs work.
@@ -229,10 +263,13 @@
         start, stop, step, slice_length = space.decode_index4(w_idx, self.find_size())
         if step == 0:
             # Single index
-            return space.wrap(self.get_concrete().eval(start))
+            return self.get_concrete().eval(start).wrap(space)
         else:
             # Slice
-            res = SingleDimSlice(start, stop, step, slice_length, self, self.signature.transition(SingleDimSlice.static_signature))
+            new_sig = signature.Signature.find_sig([
+                SingleDimSlice.signature, self.signature
+            ])
+            res = SingleDimSlice(start, stop, step, slice_length, self, new_sig)
             return space.wrap(res)
 
     def descr_setitem(self, space, w_idx, w_value):
@@ -242,79 +279,82 @@
                                                               self.find_size())
         if step == 0:
             # Single index
-            self.get_concrete().setitem(start, space.float_w(w_value))
+            self.get_concrete().setitem_w(space, start, w_value)
         else:
             concrete = self.get_concrete()
             if isinstance(w_value, BaseArray):
-                # for now we just copy if setting part of an array from 
+                # for now we just copy if setting part of an array from
                 # part of itself. can be improved.
                 if (concrete.get_root_storage() ==
                     w_value.get_concrete().get_root_storage()):
-                    w_value = new_numarray(space, w_value)
+                    w_value = space.call_function(space.gettypefor(BaseArray), w_value)
+                    assert isinstance(w_value, BaseArray)
             else:
                 w_value = convert_to_array(space, w_value)
-            concrete.setslice(space, start, stop, step, 
+            concrete.setslice(space, start, stop, step,
                                                slice_length, w_value)
 
     def descr_mean(self, space):
         return space.wrap(space.float_w(self.descr_sum(space))/self.find_size())
 
-    def _sliceloop1(self, start, stop, step, source, dest):
+    def _sliceloop(self, start, stop, step, source, dest):
         i = start
         j = 0
-        while i < stop:
-            slice_driver1.jit_merge_point(signature=source.signature,
-                    step=step, stop=stop, i=i, j=j, source=source,
-                    dest=dest)
-            dest.storage[i] = source.eval(j)
+        while (step > 0 and i < stop) or (step < 0 and i > stop):
+            slice_driver.jit_merge_point(signature=source.signature, step=step,
+                                         stop=stop, i=i, j=j, source=source,
+                                         dest=dest)
+            dest.setitem(i, source.eval(j).convert_to(dest.find_dtype()))
             j += 1
             i += step
 
-    def _sliceloop2(self, start, stop, step, source, dest):
-        i = start
-        j = 0
-        while i > stop:
-            slice_driver2.jit_merge_point(signature=source.signature,
-                    step=step, stop=stop, i=i, j=j, source=source,
-                    dest=dest)
-            dest.storage[i] = source.eval(j)
-            j += 1
-            i += step
-
-def convert_to_array (space, w_obj):
+def convert_to_array(space, w_obj):
     if isinstance(w_obj, BaseArray):
         return w_obj
     elif space.issequence_w(w_obj):
         # Convert to array.
-        return new_numarray(space, w_obj)
+        w_obj = space.call_function(space.gettypefor(BaseArray), w_obj)
+        assert isinstance(w_obj, BaseArray)
+        return w_obj
     else:
         # If it's a scalar
-        return FloatWrapper(space.float_w(w_obj))
+        dtype = interp_ufuncs.find_dtype_for_scalar(space, w_obj)
+        return scalar_w(space, dtype, w_obj)
 
-class FloatWrapper(BaseArray):
+def scalar_w(space, dtype, w_obj):
+    return Scalar(dtype, dtype.unwrap(space, w_obj))
+
+class Scalar(BaseArray):
     """
     Intermediate class representing a float literal.
     """
-    signature = Signature()
+    signature = signature.BaseSignature()
 
-    def __init__(self, float_value):
+    _attrs_ = ["dtype", "value"]
+
+    def __init__(self, dtype, value):
         BaseArray.__init__(self)
-        self.float_value = float_value
+        self.dtype = dtype
+        self.value = value
 
     def find_size(self):
         raise ValueError
 
+    def find_dtype(self):
+        return self.dtype
+
     def eval(self, i):
-        return self.float_value
+        return self.value
 
 class VirtualArray(BaseArray):
     """
     Class for representing virtual arrays, such as binary ops or ufuncs
     """
-    def __init__(self, signature):
+    def __init__(self, signature, res_dtype):
         BaseArray.__init__(self)
         self.forced_result = None
         self.signature = signature
+        self.res_dtype = res_dtype
 
     def _del_sources(self):
         # Function for deleting references to source arrays, to allow garbage-collecting them
@@ -324,12 +364,12 @@
         i = 0
         signature = self.signature
         result_size = self.find_size()
-        result = SingleDimArray(result_size)
+        result = SingleDimArray(result_size, self.find_dtype())
         while i < result_size:
             numpy_driver.jit_merge_point(signature=signature,
                                          result_size=result_size, i=i,
                                          self=self, result=result)
-            result.storage[i] = self.eval(i)
+            result.dtype.setitem(result.storage, i, self.eval(i))
             i += 1
         return result
 
@@ -347,17 +387,22 @@
             return self.forced_result.eval(i)
         return self._eval(i)
 
+    def setitem(self, item, value):
+        return self.get_concrete().setitem(item, value)
+
     def find_size(self):
         if self.forced_result is not None:
             # The result has been computed and sources may be unavailable
             return self.forced_result.find_size()
         return self._find_size()
 
+    def find_dtype(self):
+        return self.res_dtype
+
 
 class Call1(VirtualArray):
-    def __init__(self, function, values, signature):
-        VirtualArray.__init__(self, signature)
-        self.function = function
+    def __init__(self, signature, res_dtype, values):
+        VirtualArray.__init__(self, signature, res_dtype)
         self.values = values
 
     def _del_sources(self):
@@ -366,16 +411,24 @@
     def _find_size(self):
         return self.values.find_size()
 
+    def _find_dtype(self):
+        return self.res_dtype
+
     def _eval(self, i):
-        return self.function(self.values.eval(i))
+        val = self.values.eval(i).convert_to(self.res_dtype)
+
+        sig = jit.promote(self.signature)
+        assert isinstance(sig, signature.Signature)
+        call_sig = sig.components[0]
+        assert isinstance(call_sig, signature.Call1)
+        return call_sig.func(self.res_dtype, val)
 
 class Call2(VirtualArray):
     """
     Intermediate class for performing binary operations.
     """
-    def __init__(self, function, left, right, signature):
-        VirtualArray.__init__(self, signature)
-        self.function = function
+    def __init__(self, signature, res_dtype, left, right):
+        VirtualArray.__init__(self, signature, res_dtype)
         self.left = left
         self.right = right
 
@@ -391,8 +444,14 @@
         return self.right.find_size()
 
     def _eval(self, i):
-        lhs, rhs = self.left.eval(i), self.right.eval(i)
-        return self.function(lhs, rhs)
+        lhs = self.left.eval(i).convert_to(self.res_dtype)
+        rhs = self.right.eval(i).convert_to(self.res_dtype)
+
+        sig = jit.promote(self.signature)
+        assert isinstance(sig, signature.Signature)
+        call_sig = sig.components[0]
+        assert isinstance(call_sig, signature.Call2)
+        return call_sig.func(self.res_dtype, lhs, rhs)
 
 class ViewArray(BaseArray):
     """
@@ -415,9 +474,13 @@
     def eval(self, i):
         return self.parent.eval(self.calc_index(i))
 
-    @unwrap_spec(item=int, value=float)
+    @unwrap_spec(item=int)
+    def setitem_w(self, space, item, w_value):
+        return self.parent.setitem_w(space, self.calc_index(item), w_value)
+
     def setitem(self, item, value):
-        return self.parent.setitem(self.calc_index(item), value)
+        # This currently cannot be called from anywhere.
+        raise NotImplementedError
 
     def descr_len(self, space):
         return space.wrap(self.find_size())
@@ -426,7 +489,7 @@
         raise NotImplementedError
 
 class SingleDimSlice(ViewArray):
-    static_signature = Signature()
+    signature = signature.BaseSignature()
 
     def __init__(self, start, stop, step, slice_length, parent, signature):
         ViewArray.__init__(self, parent, signature)
@@ -443,35 +506,32 @@
         self.size = slice_length
 
     def get_root_storage(self):
-        return self.parent.storage
+        return self.parent.get_concrete().get_root_storage()
 
     def find_size(self):
         return self.size
 
+    def find_dtype(self):
+        return self.parent.find_dtype()
+
     def setslice(self, space, start, stop, step, slice_length, arr):
         start = self.calc_index(start)
         if stop != -1:
             stop = self.calc_index(stop)
         step = self.step * step
-        if step > 0:
-            self._sliceloop1(start, stop, step, arr, self.parent)
-        else:
-            self._sliceloop2(start, stop, step, arr, self.parent)
+        self._sliceloop(start, stop, step, arr, self.parent)
 
     def calc_index(self, item):
         return (self.start + item * self.step)
 
 
 class SingleDimArray(BaseArray):
-    signature = Signature()
-
-    def __init__(self, size):
+    def __init__(self, size, dtype):
         BaseArray.__init__(self)
         self.size = size
-        self.storage = lltype.malloc(TP, size, zero=True,
-                                     flavor='raw', track_allocation=False,
-                                     add_memory_pressure=True)
-        # XXX find out why test_zjit explodes with trackign of allocations
+        self.dtype = dtype
+        self.storage = dtype.malloc(size)
+        self.signature = dtype.signature
 
     def get_concrete(self):
         return self
@@ -482,54 +542,52 @@
     def find_size(self):
         return self.size
 
+    def find_dtype(self):
+        return self.dtype
+
     def eval(self, i):
-        return self.storage[i]
+        return self.dtype.getitem(self.storage, i)
 
     def descr_len(self, space):
         return space.wrap(self.size)
 
+    def setitem_w(self, space, item, w_value):
+        self.invalidated()
+        self.dtype.setitem_w(space, self.storage, item, w_value)
+
     def setitem(self, item, value):
         self.invalidated()
-        self.storage[item] = value
+        self.dtype.setitem(self.storage, item, value)
 
     def setslice(self, space, start, stop, step, slice_length, arr):
-        if step > 0:
-            self._sliceloop1(start, stop, step, arr, self)
-        else:
-            self._sliceloop2(start, stop, step, arr, self)
+        self._sliceloop(start, stop, step, arr, self)
 
     def __del__(self):
         lltype.free(self.storage, flavor='raw', track_allocation=False)
 
-def new_numarray(space, w_size_or_iterable):
-    l = space.listview(w_size_or_iterable)
-    arr = SingleDimArray(len(l))
-    i = 0
-    for w_elem in l:
-        arr.storage[i] = space.float_w(space.float(w_elem))
-        i += 1
-    return arr
-
-def descr_new_numarray(space, w_type, w_size_or_iterable):
-    return space.wrap(new_numarray(space, w_size_or_iterable))
+@unwrap_spec(size=int)
+def zeros(space, size, w_dtype=None):
+    dtype = space.interp_w(interp_dtype.W_Dtype,
+        space.call_function(space.gettypefor(interp_dtype.W_Dtype), w_dtype)
+    )
+    return space.wrap(SingleDimArray(size, dtype=dtype))
 
 @unwrap_spec(size=int)
-def zeros(space, size):
-    return space.wrap(SingleDimArray(size))
+def ones(space, size, w_dtype=None):
+    dtype = space.interp_w(interp_dtype.W_Dtype,
+        space.call_function(space.gettypefor(interp_dtype.W_Dtype), w_dtype)
+    )
 
-@unwrap_spec(size=int)
-def ones(space, size):
-    arr = SingleDimArray(size)
+    arr = SingleDimArray(size, dtype=dtype)
+    one = dtype.adapt_val(1)
     for i in xrange(size):
-        arr.storage[i] = 1.0
+        arr.dtype.setitem(arr.storage, i, one)
     return space.wrap(arr)
 
 BaseArray.typedef = TypeDef(
     'numarray',
-    __new__ = interp2app(descr_new_numarray),
+    __new__ = interp2app(BaseArray.descr__new__.im_func),
 
-    copy = interp2app(BaseArray.descr_copy),
-    shape = GetSetProperty(BaseArray.descr_get_shape),
 
     __len__ = interp2app(BaseArray.descr_len),
     __getitem__ = interp2app(BaseArray.descr_getitem),
@@ -553,6 +611,9 @@
     __repr__ = interp2app(BaseArray.descr_repr),
     __str__ = interp2app(BaseArray.descr_str),
 
+    dtype = GetSetProperty(BaseArray.descr_get_dtype),
+    shape = GetSetProperty(BaseArray.descr_get_shape),
+
     mean = interp2app(BaseArray.descr_mean),
     sum = interp2app(BaseArray.descr_sum),
     prod = interp2app(BaseArray.descr_prod),
@@ -563,4 +624,6 @@
     all = interp2app(BaseArray.descr_all),
     any = interp2app(BaseArray.descr_any),
     dot = interp2app(BaseArray.descr_dot),
+
+    copy = interp2app(BaseArray.descr_copy),
 )
diff --git a/pypy/module/micronumpy/interp_support.py b/pypy/module/micronumpy/interp_support.py
--- a/pypy/module/micronumpy/interp_support.py
+++ b/pypy/module/micronumpy/interp_support.py
@@ -1,7 +1,8 @@
+from pypy.interpreter.error import OperationError
+from pypy.interpreter.gateway import unwrap_spec
+from pypy.module.micronumpy.interp_dtype import W_Float64Dtype
 from pypy.rlib.rstruct.runpack import runpack
 from pypy.rpython.lltypesystem import lltype, rffi
-from pypy.interpreter.error import OperationError
-from pypy.interpreter.gateway import unwrap_spec
 
 
 FLOAT_SIZE = rffi.sizeof(lltype.Float)
@@ -17,26 +18,17 @@
         raise OperationError(space.w_ValueError, space.wrap(
             "string length %d not divisable by %d" % (length, FLOAT_SIZE)))
 
-    a = SingleDimArray(number)
+    dtype = space.fromcache(W_Float64Dtype)
+    a = SingleDimArray(number, dtype=dtype)
 
     start = 0
     end = FLOAT_SIZE
     i = 0
     while i < number:
         part = s[start:end]
-        a.storage[i] = runpack('d', part)
+        a.dtype.setitem(a.storage, i, dtype.box(runpack('d', part)))
         i += 1
         start += FLOAT_SIZE
         end += FLOAT_SIZE
 
-    return space.wrap(a)
-
-class Signature(object):
-    def __init__(self):
-        self.transitions = {}
-
-    def transition(self, target):
-        if target in self.transitions:
-            return self.transitions[target]
-        self.transitions[target] = new = Signature()
-        return new
\ No newline at end of file
+    return space.wrap(a)
\ No newline at end of file
diff --git a/pypy/module/micronumpy/interp_ufuncs.py b/pypy/module/micronumpy/interp_ufuncs.py
--- a/pypy/module/micronumpy/interp_ufuncs.py
+++ b/pypy/module/micronumpy/interp_ufuncs.py
@@ -1,139 +1,158 @@
-import math
-
-from pypy.module.micronumpy.interp_support import Signature
-from pypy.rlib import rfloat
+from pypy.module.micronumpy import interp_dtype, signature
 from pypy.tool.sourcetools import func_with_new_name
 
-def ufunc(func):
-    signature = Signature()
+
+def ufunc(func=None, promote_to_float=False, promote_bools=False):
+    if func is None:
+        return lambda func: ufunc(func, promote_to_float, promote_bools)
+    call_sig = signature.Call1(func)
     def impl(space, w_obj):
-        from pypy.module.micronumpy.interp_numarray import Call1, convert_to_array
-        if space.issequence_w(w_obj):
-            w_obj_arr = convert_to_array(space, w_obj)
-            w_res = Call1(func, w_obj_arr, w_obj_arr.signature.transition(signature))
-            w_obj_arr.invalidates.append(w_res)
-            return w_res
-        else:
-            return space.wrap(func(space.float_w(w_obj)))
+        from pypy.module.micronumpy.interp_numarray import (Call1,
+            convert_to_array, Scalar)
+
+        w_obj = convert_to_array(space, w_obj)
+        res_dtype = find_unaryop_result_dtype(space,
+            w_obj.find_dtype(),
+            promote_to_float=promote_to_float,
+            promote_bools=promote_bools,
+        )
+        if isinstance(w_obj, Scalar):
+            return func(res_dtype, w_obj.value.convert_to(res_dtype)).wrap(space)
+
+        new_sig = signature.Signature.find_sig([call_sig, w_obj.signature])
+        w_res = Call1(new_sig, res_dtype, w_obj)
+        w_obj.add_invalidates(w_res)
+        return w_res
     return func_with_new_name(impl, "%s_dispatcher" % func.__name__)
 
-def ufunc2(func):
-    signature = Signature()
+def ufunc2(func=None, promote_to_float=False, promote_bools=False):
+    if func is None:
+        return lambda func: ufunc2(func, promote_to_float, promote_bools)
+
+    call_sig = signature.Call2(func)
     def impl(space, w_lhs, w_rhs):
-        from pypy.module.micronumpy.interp_numarray import Call2, convert_to_array
-        if space.issequence_w(w_lhs) or space.issequence_w(w_rhs):
-            w_lhs_arr = convert_to_array(space, w_lhs)
-            w_rhs_arr = convert_to_array(space, w_rhs)
-            new_sig = w_lhs_arr.signature.transition(signature).transition(w_rhs_arr.signature)
-            w_res = Call2(func, w_lhs_arr, w_rhs_arr, new_sig)
-            w_lhs_arr.invalidates.append(w_res)
-            w_rhs_arr.invalidates.append(w_res)
-            return w_res
-        else:
-            return space.wrap(func(space.float_w(w_lhs), space.float_w(w_rhs)))
+        from pypy.module.micronumpy.interp_numarray import (Call2,
+            convert_to_array, Scalar)
+
+        w_lhs = convert_to_array(space, w_lhs)
+        w_rhs = convert_to_array(space, w_rhs)
+        res_dtype = find_binop_result_dtype(space,
+            w_lhs.find_dtype(), w_rhs.find_dtype(),
+            promote_to_float=promote_to_float,
+            promote_bools=promote_bools,
+        )
+        if isinstance(w_lhs, Scalar) and isinstance(w_rhs, Scalar):
+            return func(res_dtype, w_lhs.value, w_rhs.value).wrap(space)
+
+        new_sig = signature.Signature.find_sig([
+            call_sig, w_lhs.signature, w_rhs.signature
+        ])
+        w_res = Call2(new_sig, res_dtype, w_lhs, w_rhs)
+        w_lhs.add_invalidates(w_res)
+        w_rhs.add_invalidates(w_res)
+        return w_res
     return func_with_new_name(impl, "%s_dispatcher" % func.__name__)
 
-@ufunc
-def absolute(value):
-    return abs(value)
+def find_binop_result_dtype(space, dt1, dt2, promote_to_float=False,
+    promote_bools=False):
+    # dt1.num should be <= dt2.num
+    if dt1.num > dt2.num:
+        dt1, dt2 = dt2, dt1
+    # Some operations promote op(bool, bool) to return int8, rather than bool
+    if promote_bools and (dt1.kind == dt2.kind == interp_dtype.BOOLLTR):
+        return space.fromcache(interp_dtype.W_Int8Dtype)
+    if promote_to_float:
+        return find_unaryop_result_dtype(space, dt2, promote_to_float=True)
+    # If they're the same kind, choose the greater one.
+    if dt1.kind == dt2.kind:
+        return dt2
 
-@ufunc2
-def add(lvalue, rvalue):
-    return lvalue + rvalue
+    # Everything promotes to float, and bool promotes to everything.
+    if dt2.kind == interp_dtype.FLOATINGLTR or dt1.kind == interp_dtype.BOOLLTR:
+        return dt2
 
-@ufunc2
-def copysign(lvalue, rvalue):
-    return rfloat.copysign(lvalue, rvalue)
+    assert False
 
-@ufunc2
-def divide(lvalue, rvalue):
-    return lvalue / rvalue
+def find_unaryop_result_dtype(space, dt, promote_to_float=False,
+    promote_to_largest=False, promote_bools=False):
+    if promote_bools and (dt.kind == interp_dtype.BOOLLTR):
+        return space.fromcache(interp_dtype.W_Int8Dtype)
+    if promote_to_float:
+        for bytes, dtype in interp_dtype.dtypes_by_num_bytes:
+            if dtype.kind == interp_dtype.FLOATINGLTR and dtype.num_bytes >= dt.num_bytes:
+                return space.fromcache(dtype)
+    if promote_to_largest:
+        if dt.kind == interp_dtype.BOOLLTR or dt.kind == interp_dtype.SIGNEDLTR:
+            return space.fromcache(interp_dtype.W_Int64Dtype)
+        elif dt.kind == interp_dtype.FLOATINGLTR:
+            return space.fromcache(interp_dtype.W_Float64Dtype)
+        else:
+            assert False
+    return dt
 
-@ufunc
-def exp(value):
+def find_dtype_for_scalar(space, w_obj, current_guess=None):
+    w_type = space.type(w_obj)
+
+    bool_dtype = space.fromcache(interp_dtype.W_BoolDtype)
+    int64_dtype = space.fromcache(interp_dtype.W_Int64Dtype)
+
+    if space.is_w(w_type, space.w_bool):
+        if current_guess is None:
+            return bool_dtype
+    elif space.is_w(w_type, space.w_int):
+        if (current_guess is None or current_guess is bool_dtype or
+            current_guess is int64_dtype):
+            return int64_dtype
+    return space.fromcache(interp_dtype.W_Float64Dtype)
+
+
+def ufunc_dtype_caller(ufunc_name, op_name, argcount, **kwargs):
+    if argcount == 1:
+        @ufunc(**kwargs)
+        def impl(res_dtype, value):
+            return getattr(res_dtype, op_name)(value)
+    elif argcount == 2:
+        @ufunc2(**kwargs)
+        def impl(res_dtype, lvalue, rvalue):
+            return getattr(res_dtype, op_name)(lvalue, rvalue)
+    return func_with_new_name(impl, ufunc_name)
+
+for ufunc_def in [
+    ("add", "add", 2),
+    ("subtract", "sub", 2),
+    ("multiply", "mul", 2),
+    ("divide", "div", 2, {"promote_bools": True}),
+    ("mod", "mod", 2, {"promote_bools": True}),
+    ("power", "pow", 2, {"promote_bools": True}),
+
+    ("maximum", "max", 2),
+    ("minimum", "min", 2),
+
+    ("copysign", "copysign", 2, {"promote_to_float": True}),
+
+    ("positive", "pos", 1),
+    ("negative", "neg", 1),
+    ("absolute", "abs", 1),
+    ("sign", "sign", 1, {"promote_bools": True}),
+    ("reciprocal", "reciprocal", 1),
+
+    ("fabs", "fabs", 1, {"promote_to_float": True}),
+    ("floor", "floor", 1, {"promote_to_float": True}),
+    ("exp", "exp", 1, {"promote_to_float": True}),
+
+    ("sin", "sin", 1, {"promote_to_float": True}),
+    ("cos", "cos", 1, {"promote_to_float": True}),
+    ("tan", "tan", 1, {"promote_to_float": True}),
+    ("arcsin", "arcsin", 1, {"promote_to_float": True}),
+    ("arccos", "arccos", 1, {"promote_to_float": True}),
+    ("arctan", "arctan", 1, {"promote_to_float": True}),
+]:
+    ufunc_name = ufunc_def[0]
+    op_name = ufunc_def[1]
+    argcount = ufunc_def[2]
     try:
-        return math.exp(value)
-    except OverflowError:
-        return rfloat.INFINITY
+        extra_kwargs = ufunc_def[3]
+    except IndexError:
+        extra_kwargs = {}
 
-@ufunc
-def fabs(value):
-    return math.fabs(value)
-
-@ufunc2
-def maximum(lvalue, rvalue):
-    return max(lvalue, rvalue)
-
-@ufunc2
-def minimum(lvalue, rvalue):
-    return min(lvalue, rvalue)
-
-@ufunc2
-def multiply(lvalue, rvalue):
-    return lvalue * rvalue
-
-# Used by numarray for __pos__. Not visible from numpy application space.
-@ufunc
-def positive(value):
-    return value
-
-@ufunc
-def negative(value):
-    return -value
-
-@ufunc
-def reciprocal(value):
-    if value == 0.0:
-        return rfloat.copysign(rfloat.INFINITY, value)
-    return 1.0 / value
-
-@ufunc2
-def subtract(lvalue, rvalue):
-    return lvalue - rvalue
-
-@ufunc
-def floor(value):
-    return math.floor(value)
-
-@ufunc
-def sign(value):
-    if value == 0.0:
-        return 0.0
-    return rfloat.copysign(1.0, value)
-
-@ufunc
-def sin(value):
-    return math.sin(value)
-
-@ufunc
-def cos(value):
-    return math.cos(value)
-
-@ufunc
-def tan(value):
-    return math.tan(value)
-
-@ufunc2
-def power(lvalue, rvalue):
-    return math.pow(lvalue, rvalue)
-
-@ufunc2
-def mod(lvalue, rvalue):
-    return math.fmod(lvalue, rvalue)
-
-
-@ufunc
-def arcsin(value):
-    if value < -1.0 or  value > 1.0:
-        return rfloat.NAN
-    return math.asin(value)
-
-@ufunc
-def arccos(value):
-    if value < -1.0 or  value > 1.0:
-        return rfloat.NAN
-    return math.acos(value)
-
-@ufunc
-def arctan(value):
-    return math.atan(value)
+    globals()[ufunc_name] = ufunc_dtype_caller(ufunc_name, op_name, argcount, **extra_kwargs)
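
For orientation, the promotion logic in find_binop_result_dtype above reduces to a few rules: operands are ordered by dtype number, bool/bool is bumped to int8 when promote_bools is set, promote_to_float picks the smallest float at least as wide as the larger operand, same-kind pairs keep the larger dtype, and otherwise floats win and bools lose. A rough plain-Python sketch of those rules follows; the tuple encoding and the constant names are invented for illustration and are not the interp_dtype classes:

    # Minimal sketch of the binary-op promotion rules, assuming dtypes are
    # modelled as (num, kind, num_bytes) with the 'b'/'i'/'f' kind letters.
    BOOL, INT8, INT32, INT64, FLOAT64 = [
        (0, 'b', 1), (1, 'i', 1), (5, 'i', 4), (9, 'i', 8), (12, 'f', 8),
    ]

    def binop_result(dt1, dt2, promote_to_float=False, promote_bools=False):
        if dt1[0] > dt2[0]:            # keep dt1.num <= dt2.num
            dt1, dt2 = dt2, dt1
        if promote_bools and dt1[1] == dt2[1] == 'b':
            return INT8                # e.g. bool / bool -> int8
        if promote_to_float:
            return FLOAT64             # smallest wide-enough float; here always float64
        if dt1[1] == dt2[1]:
            return dt2                 # same kind: the larger dtype wins
        if dt2[1] == 'f' or dt1[1] == 'b':
            return dt2                 # everything promotes to float, bool to anything
        raise AssertionError

    assert binop_result(BOOL, BOOL) is BOOL
    assert binop_result(INT32, INT8) is INT32
    assert binop_result(BOOL, BOOL, promote_bools=True) is INT8
    assert binop_result(BOOL, FLOAT64, promote_to_float=True) is FLOAT64
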
diff --git a/pypy/module/micronumpy/signature.py b/pypy/module/micronumpy/signature.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/micronumpy/signature.py
@@ -0,0 +1,52 @@
+from pypy.rlib.objectmodel import r_dict, compute_identity_hash
+from pypy.rlib.rarithmetic import intmask
+
+
+def components_eq(lhs, rhs):
+    if len(lhs) != len(rhs):
+        return False
+    for i in range(len(lhs)):
+        v1, v2 = lhs[i], rhs[i]
+        if type(v1) is not type(v2) or not v1.eq(v2):
+            return False
+    return True
+
+def components_hash(components):
+    res = 0x345678
+    for component in components:
+        res = intmask((1000003 * res) ^ component.hash())
+    return res
+
+class BaseSignature(object):
+    _attrs_ = []
+
+    def eq(self, other):
+        return self is other
+
+    def hash(self):
+        return compute_identity_hash(self)
+
+class Signature(BaseSignature):
+    _known_sigs = r_dict(components_eq, components_hash)
+
+    _attrs_ = ["components"]
+    _immutable_fields_ = ["components[*]"]
+
+    def __init__(self, components):
+        self.components = components
+
+    @staticmethod
+    def find_sig(components):
+        return Signature._known_sigs.setdefault(components, Signature(components))
+
+class Call1(BaseSignature):
+    _immutable_fields_ = ["func"]
+
+    def __init__(self, func):
+        self.func = func
+
+class Call2(BaseSignature):
+    _immutable_fields_ = ["func"]
+
+    def __init__(self, func):
+        self.func = func
\ No newline at end of file
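
The point of Signature.find_sig in the new signature.py above is interning: two computations built from the same chain of components get the very same Signature object, which is why the tests below can compare signatures with "is" and why the JIT can specialize per signature identity. A simplified sketch of the same idea on top of a plain dict; the class and variable names are made up, and the real code uses r_dict with the custom eq/hash shown above:

    # Interning cache keyed on the identity of the components.
    class Sig(object):
        _known = {}

        def __init__(self, components):
            self.components = components

        @staticmethod
        def find_sig(components):
            key = tuple(id(c) for c in components)
            return Sig._known.setdefault(key, Sig(components))

    call_sig = object()      # stands in for a Call1/Call2 signature
    array_sig = object()     # stands in for an array's own signature
    s1 = Sig.find_sig([call_sig, array_sig])
    s2 = Sig.find_sig([call_sig, array_sig])
    assert s1 is s2          # same components -> one interned signature
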
diff --git a/pypy/module/micronumpy/test/test_base.py b/pypy/module/micronumpy/test/test_base.py
--- a/pypy/module/micronumpy/test/test_base.py
+++ b/pypy/module/micronumpy/test/test_base.py
@@ -1,23 +1,36 @@
 from pypy.conftest import gettestobjspace
-from pypy.module.micronumpy.interp_numarray import SingleDimArray, FloatWrapper
+from pypy.module.micronumpy import interp_dtype
+from pypy.module.micronumpy.interp_numarray import SingleDimArray, Scalar
+from pypy.module.micronumpy.interp_ufuncs import (find_binop_result_dtype,
+        find_unaryop_result_dtype)
+
 
 class BaseNumpyAppTest(object):
     def setup_class(cls):
-        cls.space = gettestobjspace(usemodules=('micronumpy',))
+        cls.space = gettestobjspace(usemodules=['micronumpy'])
 
 class TestSignature(object):
     def test_binop_signature(self, space):
-        ar = SingleDimArray(10)
+        float64_dtype = space.fromcache(interp_dtype.W_Float64Dtype)
+
+        ar = SingleDimArray(10, dtype=float64_dtype)
         v1 = ar.descr_add(space, ar)
-        v2 = ar.descr_add(space, FloatWrapper(2.0))
+        v2 = ar.descr_add(space, Scalar(float64_dtype, 2.0))
         assert v1.signature is not v2.signature
-        v3 = ar.descr_add(space, FloatWrapper(1.0))
+        v3 = ar.descr_add(space, Scalar(float64_dtype, 1.0))
         assert v2.signature is v3.signature
         v4 = ar.descr_add(space, ar)
         assert v1.signature is v4.signature
 
+        bool_ar = SingleDimArray(10, dtype=space.fromcache(interp_dtype.W_BoolDtype))
+        v5 = ar.descr_add(space, bool_ar)
+        assert v5.signature is not v1.signature
+        assert v5.signature is not v2.signature
+        v6 = ar.descr_add(space, bool_ar)
+        assert v5.signature is v6.signature
+
     def test_slice_signature(self, space):
-        ar = SingleDimArray(10)
+        ar = SingleDimArray(10, dtype=space.fromcache(interp_dtype.W_Float64Dtype))
         v1 = ar.descr_getitem(space, space.wrap(slice(1, 5, 1)))
         v2 = ar.descr_getitem(space, space.wrap(slice(4, 6, 1)))
         assert v1.signature is v2.signature
@@ -25,3 +38,44 @@
         v3 = ar.descr_add(space, v1)
         v4 = ar.descr_add(space, v2)
         assert v3.signature is v4.signature
+
+class TestUfuncCoercion(object):
+    def test_binops(self, space):
+        bool_dtype = space.fromcache(interp_dtype.W_BoolDtype)
+        int8_dtype = space.fromcache(interp_dtype.W_Int8Dtype)
+        int32_dtype = space.fromcache(interp_dtype.W_Int32Dtype)
+        float64_dtype = space.fromcache(interp_dtype.W_Float64Dtype)
+
+        # Basic pairing
+        assert find_binop_result_dtype(space, bool_dtype, bool_dtype) is bool_dtype
+        assert find_binop_result_dtype(space, bool_dtype, float64_dtype) is float64_dtype
+        assert find_binop_result_dtype(space, float64_dtype, bool_dtype) is float64_dtype
+        assert find_binop_result_dtype(space, int32_dtype, int8_dtype) is int32_dtype
+        assert find_binop_result_dtype(space, int32_dtype, bool_dtype) is int32_dtype
+
+        # With promote_bools (which happens on division), the op should
+        # promote bools to int8
+        assert find_binop_result_dtype(space, bool_dtype, bool_dtype, promote_bools=True) is int8_dtype
+        assert find_binop_result_dtype(space, bool_dtype, float64_dtype, promote_bools=True) is float64_dtype
+
+        # Coerce to floats
+        assert find_binop_result_dtype(space, bool_dtype, float64_dtype, promote_to_float=True) is float64_dtype
+
+    def test_unaryops(self, space):
+        bool_dtype = space.fromcache(interp_dtype.W_BoolDtype)
+        int8_dtype = space.fromcache(interp_dtype.W_Int8Dtype)
+        int32_dtype = space.fromcache(interp_dtype.W_Int32Dtype)
+        float64_dtype = space.fromcache(interp_dtype.W_Float64Dtype)
+
+        # Normal rules: everything returns itself
+        assert find_unaryop_result_dtype(space, bool_dtype) is bool_dtype
+        assert find_unaryop_result_dtype(space, int8_dtype) is int8_dtype
+        assert find_unaryop_result_dtype(space, int32_dtype) is int32_dtype
+        assert find_unaryop_result_dtype(space, float64_dtype) is float64_dtype
+
+        # Coerce to floats; some of these will eventually be float16, or
+        # whatever our smallest float type is.
+        assert find_unaryop_result_dtype(space, bool_dtype, promote_to_float=True) is float64_dtype
+        assert find_unaryop_result_dtype(space, int8_dtype, promote_to_float=True) is float64_dtype
+        assert find_unaryop_result_dtype(space, int32_dtype, promote_to_float=True) is float64_dtype
+        assert find_unaryop_result_dtype(space, float64_dtype, promote_to_float=True) is float64_dtype
\ No newline at end of file
diff --git a/pypy/module/micronumpy/test/test_dtypes.py b/pypy/module/micronumpy/test/test_dtypes.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/micronumpy/test/test_dtypes.py
@@ -0,0 +1,101 @@
+from pypy.module.micronumpy.test.test_base import BaseNumpyAppTest
+
+
+class AppTestDtypes(BaseNumpyAppTest):
+    def test_dtype(self):
+        from numpy import dtype
+
+        d = dtype('?')
+        assert d.num == 0
+        assert d.kind == 'b'
+        assert dtype('int8').num == 1
+        assert dtype(d) is d
+        assert dtype(None) is dtype(float)
+        raises(TypeError, dtype, 1042)
+
+    def test_dtype_with_types(self):
+        from numpy import dtype
+
+        assert dtype(bool).num == 0
+        assert dtype(long).num == 9
+        assert dtype(float).num == 12
+
+    def test_array_dtype_attr(self):
+        from numpy import array, dtype
+
+        a = array(range(5), long)
+        assert a.dtype is dtype(long)
+
+    def test_repr_str(self):
+        from numpy import dtype
+
+        assert repr(dtype) == "<type 'numpy.dtype'>"
+        d = dtype('?')
+        assert repr(d) == "dtype('bool')"
+        assert str(d) == "bool"
+
+    def test_bool_array(self):
+        from numpy import array
+
+        a = array([0, 1, 2, 2.5], dtype='?')
+        assert a[0] is False
+        for i in xrange(1, 4):
+            assert a[i] is True
+
+    def test_copy_array_with_dtype(self):
+        from numpy import array
+        a = array([0, 1, 2, 3], dtype=long)
+        # int on 64-bit, long on 32-bit
+        assert isinstance(a[0], (int, long))
+        b = a.copy()
+        assert isinstance(b[0], (int, long))
+
+        a = array([0, 1, 2, 3], dtype=bool)
+        assert isinstance(a[0], bool)
+        b = a.copy()
+        assert isinstance(b[0], bool)
+
+    def test_zeros_bool(self):
+        from numpy import zeros
+        a = zeros(10, dtype=bool)
+        for i in range(10):
+            assert a[i] is False
+
+    def test_ones_bool(self):
+        from numpy import ones
+        a = ones(10, dtype=bool)
+        for i in range(10):
+            assert a[i] is True
+
+    def test_zeros_long(self):
+        from numpy import zeros
+        a = zeros(10, dtype=long)
+        for i in range(10):
+            assert isinstance(a[i], (int, long))
+            assert a[i] == 0
+
+    def test_ones_long(self):
+        from numpy import ones
+        a = ones(10, dtype=long)
+        for i in range(10):
+            assert isinstance(a[i], (int, long))
+            assert a[i] == 1
+
+    def test_add_int8(self):
+        from numpy import array
+
+        a = array(range(5), dtype="int8")
+        b = a + a
+        for i in range(5):
+            assert b[i] == i * 2
+
+    def test_shape(self):
+        from numpy import dtype
+
+        assert dtype(long).shape == ()
+
+    def test_cant_subclass(self):
+        from numpy import dtype
+
+        # You can't subclass dtype
+        raises(TypeError, type, "Foo", (dtype,), {})
\ No newline at end of file
diff --git a/pypy/module/micronumpy/test/test_numarray.py b/pypy/module/micronumpy/test/test_numarray.py
--- a/pypy/module/micronumpy/test/test_numarray.py
+++ b/pypy/module/micronumpy/test/test_numarray.py
@@ -1,5 +1,3 @@
-import py
-
 from pypy.module.micronumpy.test.test_base import BaseNumpyAppTest
 from pypy.conftest import gettestobjspace
 
@@ -52,14 +50,18 @@
 
     def test_repr(self):
         from numpy import array, zeros
-        a = array(range(5))
+        a = array(range(5), float)
         assert repr(a) == "array([0.0, 1.0, 2.0, 3.0, 4.0])"
         a = zeros(1001)
         assert repr(a) == "array([0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0])"
+        a = array(range(5), long)
+        assert repr(a) == "array([0, 1, 2, 3, 4])"
+        a = array([True, False, True, False], "?")
+        assert repr(a) == "array([True, False, True, False], dtype=bool)"
 
     def test_repr_slice(self):
         from numpy import array, zeros
-        a = array(range(5))
+        a = array(range(5), float)
         b = a[1::2]
         assert repr(b) == "array([1.0, 3.0])"
         a = zeros(2002)
@@ -68,15 +70,23 @@
 
     def test_str(self):
         from numpy import array, zeros
-        a = array(range(5))
+        a = array(range(5), float)
         assert str(a) == "[0.0 1.0 2.0 3.0 4.0]"
         assert str((2*a)[:]) == "[0.0 2.0 4.0 6.0 8.0]"
         a = zeros(1001)
         assert str(a) == "[0.0 0.0 0.0 ..., 0.0 0.0 0.0]"
 
+        a = array(range(5), dtype=long)
+        assert str(a) == "[0 1 2 3 4]"
+        a = array([True, False, True, False], dtype="?")
+        assert str(a) == "[True False True False]"
+
+        a = array(range(5), dtype="int8")
+        assert str(a) == "[0 1 2 3 4]"
+
     def test_str_slice(self):
         from numpy import array, zeros
-        a = array(range(5))
+        a = array(range(5), float)
         b = a[1::2]
         assert str(b) == "[1.0 3.0]"
         a = zeros(2002)
@@ -132,7 +142,7 @@
 
     def test_setslice_list(self):
         from numpy import array
-        a = array(range(5))
+        a = array(range(5), float)
         b = [0., 1.]
         a[1:4:2] = b
         assert a[1] == 0.
@@ -140,7 +150,7 @@
 
     def test_setslice_constant(self):
         from numpy import array
-        a = array(range(5))
+        a = array(range(5), float)
         a[1:4:2] = 0.
         assert a[1] == 0.
         assert a[3] == 0.
@@ -167,6 +177,12 @@
         for i in range(5):
             assert b[i] == i + i
 
+        a = array([True, False, True, False], dtype="?")
+        b = array([True, True, False, False], dtype="?")
+        c = a + b
+        for i in range(4):
+            assert c[i] == bool(a[i] + b[i])
+
     def test_add_other(self):
         from numpy import array
         a = array(range(5))
@@ -220,12 +236,19 @@
             assert b[i] == i - 5
 
     def test_mul(self):
-        from numpy import array
+        from numpy import array, dtype
         a = array(range(5))
         b = a * a
         for i in range(5):
             assert b[i] == i * i
 
+        a = array(range(5), dtype=bool)
+        b = a * a
+        assert b.dtype is dtype(bool)
+        assert b[0] is False
+        for i in range(1, 5):
+            assert b[i] is True
+
     def test_mul_constant(self):
         from numpy import array
         a = array(range(5))
@@ -234,16 +257,22 @@
             assert b[i] == i * 5
 
     def test_div(self):
-        from numpy import array
+        from numpy import array, dtype
         a = array(range(1, 6))
         b = a / a
         for i in range(5):
             assert b[i] == 1
 
+        a = array(range(1, 6), dtype=bool)
+        b = a / a
+        assert b.dtype is dtype("int8")
+        for i in range(5):
+            assert b[i] == 1
+
     def test_div_other(self):
         from numpy import array
         a = array(range(5))
-        b = array([2, 2, 2, 2, 2])
+        b = array([2, 2, 2, 2, 2], float)
         c = a / b
         for i in range(5):
             assert c[i] == i / 2.0
@@ -257,7 +286,7 @@
 
     def test_pow(self):
         from numpy import array
-        a = array(range(5))
+        a = array(range(5), float)
         b = a ** a
         for i in range(5):
             print b[i], i**i
@@ -265,7 +294,7 @@
 
     def test_pow_other(self):
         from numpy import array
-        a = array(range(5))
+        a = array(range(5), float)
         b = array([2, 2, 2, 2, 2])
         c = a ** b
         for i in range(5):
@@ -273,7 +302,7 @@
 
     def test_pow_constant(self):
         from numpy import array
-        a = array(range(5))
+        a = array(range(5), float)
         b = a ** 2
         for i in range(5):
             assert b[i] == i ** 2
@@ -285,6 +314,12 @@
         for i in range(5):
             assert b[i] == 0
 
+        a = array(range(1, 6), float)
+        b = (a + 1) % a
+        assert b[0] == 0
+        for i in range(1, 5):
+            assert b[i] == 1
+
     def test_mod_other(self):
         from numpy import array
         a = array(range(5))
@@ -307,6 +342,10 @@
         for i in range(5):
             assert b[i] == a[i]
 
+        a = +array(range(5))
+        for i in range(5):
+            assert a[i] == i
+
     def test_neg(self):
         from numpy import array
         a = array([1.,-2.,3.,-4.,-5.])
@@ -314,6 +353,10 @@
         for i in range(5):
             assert b[i] == -a[i]
 
+        a = -array(range(5), dtype="int8")
+        for i in range(5):
+            assert a[i] == -i
+
     def test_abs(self):
         from numpy import array
         a = array([1.,-2.,3.,-4.,-5.])
@@ -321,6 +364,10 @@
         for i in range(5):
             assert b[i] == abs(a[i])
 
+        a = abs(array(range(-5, 5), dtype="int8"))
+        for i in range(-5, 5):
+            assert a[i + 5] == abs(i)
+
     def test_auto_force(self):
         from numpy import array
         a = array(range(5))
@@ -343,6 +390,12 @@
         for i in range(4):
             assert s[i] == a[i+1]
 
+        s = (a + a)[1:2]
+        assert len(s) == 1
+        assert s[0] == 2
+        s[:1] = array([5])
+        assert s[0] == 5
+
     def test_getslice_step(self):
         from numpy import array
         a = array(range(10))
@@ -388,6 +441,9 @@
         assert a.sum() == 10.0
         assert a[:4].sum() == 6.0
 
+        a = array([True] * 5, bool)
+        assert a.sum() == 5
+
     def test_prod(self):
         from numpy import array
         a = array(range(1,6))
@@ -420,6 +476,9 @@
         b = array([])
         raises(ValueError, "b.argmax()")
 
+        a = array(range(-5, 5))
+        assert a.argmax() == 9
+
     def test_argmin(self):
         from numpy import array
         a = array([-1.2, 3.4, 5.7, -3.0, 2.7])
@@ -450,12 +509,25 @@
         a = array(range(5))
         assert a.dot(a) == 30.0
 
+        a = array(range(5))
+        assert a.dot(range(5)) == 30
+
     def test_dot_constant(self):
         from numpy import array
         a = array(range(5))
         b = a.dot(2.5)
         for i in xrange(5):
-            assert b[i] == 2.5*a[i]
+            assert b[i] == 2.5 * a[i]
+
+    def test_dtype_guessing(self):
+        from numpy import array, dtype
+
+        assert array([True]).dtype is dtype(bool)
+        assert array([True, 1]).dtype is dtype(long)
+        assert array([1, 2, 3]).dtype is dtype(long)
+        assert array([1.2, True]).dtype is dtype(float)
+        assert array([1.2, 5]).dtype is dtype(float)
+        assert array([]).dtype is dtype(float)
 
 
 class AppTestSupport(object):
@@ -469,5 +541,4 @@
         a = fromstring(self.data)
         for i in range(4):
             assert a[i] == i + 1
-        raises(ValueError, fromstring, "abc")
-
+        raises(ValueError, fromstring, "abc")
\ No newline at end of file
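
The test_dtype_guessing cases above match find_dtype_for_scalar in interp_ufuncs.py: the guess is accumulated element by element and only widens, bools count only while no guess exists yet, ints keep or establish an int64 guess, and anything else (including an empty list) ends up float64. A small plain-Python sketch of that accumulation, with strings standing in for the dtype objects and the helper names invented for the example:

    def guess_scalar(obj, current=None):
        # mirrors find_dtype_for_scalar: bool only wins on the very first
        # element, ints keep or establish an int64 guess, everything else
        # falls through to float64
        if isinstance(obj, bool):
            if current is None:
                return 'bool'
        elif isinstance(obj, (int, long)):
            if current in (None, 'bool', 'int64'):
                return 'int64'
        return 'float64'

    def guess_list(items):
        current = None
        for item in items:
            current = guess_scalar(item, current)
        return current if current is not None else 'float64'   # empty -> float64

    assert guess_list([True]) == 'bool'
    assert guess_list([True, 1]) == 'int64'
    assert guess_list([1.2, True]) == 'float64'
    assert guess_list([]) == 'float64'
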
diff --git a/pypy/module/micronumpy/test/test_ufuncs.py b/pypy/module/micronumpy/test/test_ufuncs.py
--- a/pypy/module/micronumpy/test/test_ufuncs.py
+++ b/pypy/module/micronumpy/test/test_ufuncs.py
@@ -86,7 +86,7 @@
     def test_fabs(self):
         from numpy import array, fabs
         from math import fabs as math_fabs
-        
+
         a = array([-5.0, -0.0, 1.0])
         b = fabs(a)
         for i in range(3):
@@ -110,6 +110,10 @@
         for i in range(3):
             assert c[i] == max(a[i], b[i])
 
+        x = maximum(2, 3)
+        assert x == 3
+        assert isinstance(x, (int, long))
+
     def test_multiply(self):
         from numpy import array, multiply
 
@@ -120,7 +124,7 @@
             assert c[i] == a[i] * b[i]
 
     def test_sign(self):
-        from numpy import array, sign
+        from numpy import array, sign, dtype
 
         reference = [-1.0, 0.0, 0.0, 1.0]
         a = array([-5.0, -0.0, 0.0, 6.0])
@@ -128,6 +132,16 @@
         for i in range(4):
             assert b[i] == reference[i]
 
+        a = sign(array(range(-5, 5)))
+        ref = [-1, -1, -1, -1, -1, 0, 1, 1, 1, 1]
+        for i in range(10):
+            assert a[i] == ref[i]
+
+        a = sign(array([True, False], dtype=bool))
+        assert a.dtype == dtype("int8")
+        assert a[0] == 1
+        assert a[1] == 0
+
     def test_reciporocal(self):
         from numpy import array, reciprocal
 
@@ -165,6 +179,11 @@
         for i in range(4):
             assert c[i] == reference[i]
 
+        b = array([True, True, True, True], dtype=bool)
+        c = copysign(a, b)
+        for i in range(4):
+            assert c[i] == abs(a[i])
+
     def test_exp(self):
         import math
         from numpy import array, exp
@@ -188,6 +207,10 @@
         for i in range(len(a)):
             assert b[i] == math.sin(a[i])
 
+        a = sin(array([True, False], dtype=bool))
+        assert a[0] == sin(1)
+        assert a[1] == 0.0
+
     def test_cos(self):
         import math
         from numpy import array, cos
@@ -211,7 +234,7 @@
         import math
         from numpy import array, arcsin
 
-        a = array([-1, -0.5, -0.33, 0, 0.33, 0.5, 1])        
+        a = array([-1, -0.5, -0.33, 0, 0.33, 0.5, 1])
         b = arcsin(a)
         for i in range(len(a)):
             assert b[i] == math.asin(a[i])
@@ -230,7 +253,7 @@
         for i in range(len(a)):
             assert b[i] == math.acos(a[i])
 
-        
+
         a = array([-10, -1.5, -1.01, 1.01, 1.5, 10, float('nan'), float('inf'), float('-inf')])
         b = arccos(a)
         for f in b:
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -1,34 +1,26 @@
 from pypy.jit.metainterp.test.support import LLJitMixin
+from pypy.module.micronumpy import interp_ufuncs, signature
+from pypy.module.micronumpy.compile import (numpy_compile, FakeSpace,
+    FloatObject)
+from pypy.module.micronumpy.interp_dtype import W_Float64Dtype, W_Int64Dtype
+from pypy.module.micronumpy.interp_numarray import (BaseArray, SingleDimArray,
+    SingleDimSlice, scalar_w)
+from pypy.rlib.nonconst import NonConstant
+from pypy.rpython.annlowlevel import llstr
 from pypy.rpython.test.test_llinterp import interpret
-from pypy.module.micronumpy.interp_numarray import (SingleDimArray, Signature,
-    FloatWrapper, Call2, SingleDimSlice, add, mul, Call1)
-from pypy.module.micronumpy.interp_ufuncs import negative
-from pypy.module.micronumpy.compile import numpy_compile
-from pypy.rlib.objectmodel import specialize
-from pypy.rlib.nonconst import NonConstant
 
-class FakeSpace(object):
-    w_ValueError = None
-
-    def issequence_w(self, w_obj):
-        return True
-
-    @specialize.argtype(1)
-    def wrap(self, w_obj):
-        return w_obj
-
-    def float_w(self, w_obj):
-        return float(w_obj)
 
 class TestNumpyJIt(LLJitMixin):
     def setup_class(cls):
         cls.space = FakeSpace()
+        cls.float64_dtype = cls.space.fromcache(W_Float64Dtype)
+        cls.int64_dtype = cls.space.fromcache(W_Int64Dtype)
 
     def test_add(self):
         def f(i):
-            ar = SingleDimArray(i)
-            v = Call2(add, ar, ar, Signature())
-            return v.get_concrete().storage[3]
+            ar = SingleDimArray(i, dtype=self.float64_dtype)
+            v = interp_ufuncs.add(self.space, ar, ar)
+            return v.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({'getarrayitem_raw': 2, 'float_add': 1,
@@ -38,9 +30,13 @@
 
     def test_floatadd(self):
         def f(i):
-            ar = SingleDimArray(i)
-            v = Call2(add, ar, FloatWrapper(4.5), Signature())
-            return v.get_concrete().storage[3]
+            ar = SingleDimArray(i, dtype=self.float64_dtype)
+            v = interp_ufuncs.add(self.space,
+                ar,
+                scalar_w(self.space, self.float64_dtype, self.space.wrap(4.5))
+            )
+            assert isinstance(v, BaseArray)
+            return v.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 1, "float_add": 1,
@@ -50,10 +46,18 @@
 
     def test_sum(self):
         space = self.space
+        float64_dtype = self.float64_dtype
+        int64_dtype = self.int64_dtype
 
         def f(i):
-            ar = SingleDimArray(i)
-            return ar.descr_add(space, ar).descr_sum(space)
+            if NonConstant(False):
+                dtype = int64_dtype
+            else:
+                dtype = float64_dtype
+            ar = SingleDimArray(i, dtype=dtype)
+            v = ar.descr_add(space, ar).descr_sum(space)
+            assert isinstance(v, FloatObject)
+            return v.floatval
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 2,
@@ -63,10 +67,18 @@
 
     def test_prod(self):
         space = self.space
+        float64_dtype = self.float64_dtype
+        int64_dtype = self.int64_dtype
 
         def f(i):
-            ar = SingleDimArray(i)
-            return ar.descr_add(space, ar).descr_prod(space)
+            if NonConstant(False):
+                dtype = int64_dtype
+            else:
+                dtype = float64_dtype
+            ar = SingleDimArray(i, dtype=dtype)
+            v = ar.descr_add(space, ar).descr_prod(space)
+            assert isinstance(v, FloatObject)
+            return v.floatval
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1,
@@ -76,32 +88,34 @@
 
     def test_max(self):
         space = self.space
+        float64_dtype = self.float64_dtype
 
         def f(i):
-            ar = SingleDimArray(i)
+            ar = SingleDimArray(i, dtype=NonConstant(float64_dtype))
             j = 0
             while j < i:
-                ar.get_concrete().storage[j] = float(j)
+                ar.get_concrete().setitem(j, float64_dtype.box(float(j)))
                 j += 1
-            return ar.descr_add(space, ar).descr_max(space)
+            return ar.descr_add(space, ar).descr_max(space).floatval
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1,
                           "float_gt": 1, "int_add": 1,
-                          "int_lt": 1, "guard_true": 1, 
+                          "int_lt": 1, "guard_true": 1,
                           "guard_false": 1, "jump": 1})
         assert result == f(5)
 
     def test_min(self):
         space = self.space
+        float64_dtype = self.float64_dtype
 
         def f(i):
-            ar = SingleDimArray(i)
+            ar = SingleDimArray(i, dtype=NonConstant(float64_dtype))
             j = 0
             while j < i:
-                ar.get_concrete().storage[j] = float(j)
+                ar.get_concrete().setitem(j, float64_dtype.box(float(j)))
                 j += 1
-            return ar.descr_add(space, ar).descr_min(space)
+            return ar.descr_add(space, ar).descr_min(space).floatval
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1,
@@ -112,14 +126,15 @@
 
     def test_argmin(self):
         space = self.space
+        float64_dtype = self.float64_dtype
 
         def f(i):
-            ar = SingleDimArray(i)
+            ar = SingleDimArray(i, dtype=NonConstant(float64_dtype))
             j = 0
             while j < i:
-                ar.get_concrete().storage[j] = float(j)
+                ar.get_concrete().setitem(j, float64_dtype.box(float(j)))
                 j += 1
-            return ar.descr_add(space, ar).descr_argmin(space)
+            return ar.descr_add(space, ar).descr_argmin(space).intval
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1,
@@ -130,14 +145,16 @@
 
     def test_all(self):
         space = self.space
+        float64_dtype = self.float64_dtype
 
         def f(i):
-            ar = SingleDimArray(i)
+            ar = SingleDimArray(i, dtype=NonConstant(float64_dtype))
             j = 0
             while j < i:
-                ar.get_concrete().storage[j] = 1.0
+                ar.get_concrete().setitem(j, float64_dtype.box(1.0))
                 j += 1
-            return ar.descr_add(space, ar).descr_all(space)
+            return ar.descr_add(space, ar).descr_all(space).boolval
+
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1,
                           "int_add": 1, "float_ne": 1,
@@ -146,10 +163,11 @@
 
     def test_any(self):
         space = self.space
+        float64_dtype = self.float64_dtype
 
         def f(i):
-            ar = SingleDimArray(i)
-            return ar.descr_add(space, ar).descr_any(space)
+            ar = SingleDimArray(i, dtype=NonConstant(float64_dtype))
+            return ar.descr_add(space, ar).descr_any(space).boolval
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1,
@@ -157,13 +175,17 @@
                           "int_lt": 1, "guard_true": 1, "jump": 1})
         assert result == f(5)
 
-    def test_already_forecd(self):
+    def test_already_forced(self):
+        space = self.space
+
         def f(i):
-            ar = SingleDimArray(i)
-            v1 = Call2(add, ar, FloatWrapper(4.5), Signature())
-            v2 = Call2(mul, v1, FloatWrapper(4.5), Signature())
+            ar = SingleDimArray(i, dtype=self.float64_dtype)
+            v1 = interp_ufuncs.add(space, ar, scalar_w(space, self.float64_dtype, space.wrap(4.5)))
+            assert isinstance(v1, BaseArray)
+            v2 = interp_ufuncs.multiply(space, v1, scalar_w(space, self.float64_dtype, space.wrap(4.5)))
             v1.force_if_needed()
-            return v2.get_concrete().storage[3]
+            assert isinstance(v2, BaseArray)
+            return v2.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         # This is the sum of the ops for both loops, however if you remove the
@@ -177,10 +199,10 @@
     def test_ufunc(self):
         space = self.space
         def f(i):
-            ar = SingleDimArray(i)
-            v1 = Call2(add, ar, ar, Signature())
-            v2 = negative(space, v1)
-            return v2.get_concrete().storage[3]
+            ar = SingleDimArray(i, dtype=self.float64_dtype)
+            v1 = interp_ufuncs.add(space, ar, ar)
+            v2 = interp_ufuncs.negative(space, v1)
+            return v2.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1, "float_neg": 1,
@@ -192,17 +214,15 @@
     def test_appropriate_specialization(self):
         space = self.space
         def f(i):
-            add_sig = Signature()
-            mul_sig = Signature()
-            ar = SingleDimArray(i)
+            ar = SingleDimArray(i, dtype=self.float64_dtype)
 
-            v1 = Call2(add, ar, ar, ar.signature.transition(add_sig))
-            v2 = negative(space, v1)
+            v1 = interp_ufuncs.add(space, ar, ar)
+            v2 = interp_ufuncs.negative(space, v1)
             v2.get_concrete()
 
             for i in xrange(5):
-                v1 = Call2(mul, ar, ar, ar.signature.transition(mul_sig))
-                v2 = negative(space, v1)
+                v1 = interp_ufuncs.multiply(space, ar, ar)
+                v2 = interp_ufuncs.negative(space, v1)
                 v2.get_concrete()
 
         self.meta_interp(f, [5], listops=True, backendopt=True)
@@ -212,10 +232,13 @@
     def test_slice(self):
         def f(i):
             step = 3
-            ar = SingleDimArray(step*i)
-            s = SingleDimSlice(0, step*i, step, i, ar, ar.signature.transition(SingleDimSlice.static_signature))
-            v = Call2(add, s, s, Signature())
-            return v.get_concrete().storage[3]
+            ar = SingleDimArray(step*i, dtype=self.float64_dtype)
+            new_sig = signature.Signature.find_sig([
+                SingleDimSlice.signature, ar.signature
+            ])
+            s = SingleDimSlice(0, step*i, step, i, ar, new_sig)
+            v = interp_ufuncs.add(self.space, s, s)
+            return v.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({'int_mul': 1, 'getarrayitem_raw': 2, 'float_add': 1,
@@ -227,11 +250,17 @@
         def f(i):
             step1 = 2
             step2 = 3
-            ar = SingleDimArray(step2*i)
-            s1 = SingleDimSlice(0, step1*i, step1, i, ar, ar.signature.transition(SingleDimSlice.static_signature))
-            s2 = SingleDimSlice(0, step2*i, step2, i, ar, ar.signature.transition(SingleDimSlice.static_signature))
-            v = Call2(add, s1, s2, Signature())
-            return v.get_concrete().storage[3]
+            ar = SingleDimArray(step2*i, dtype=self.float64_dtype)
+            new_sig = signature.Signature.find_sig([
+                SingleDimSlice.signature, ar.signature
+            ])
+            s1 = SingleDimSlice(0, step1*i, step1, i, ar, new_sig)
+            new_sig = signature.Signature.find_sig([
+                SingleDimSlice.signature, s1.signature
+            ])
+            s2 = SingleDimSlice(0, step2*i, step2, i, ar, new_sig)
+            v = interp_ufuncs.add(self.space, s1, s2)
+            return v.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({'int_mul': 2, 'getarrayitem_raw': 2, 'float_add': 1,
@@ -241,18 +270,16 @@
 
     def test_setslice(self):
         space = self.space
+        float64_dtype = self.float64_dtype
 
         def f(i):
             step = NonConstant(3)
-            ar = SingleDimArray(step*i)
-            ar2 = SingleDimArray(i)
-            ar2.storage[1] = 5.5
-            if NonConstant(False):
-                arg = ar2
-            else:
-                arg = ar2.descr_add(space, ar2)
+            ar = SingleDimArray(step*i, dtype=float64_dtype)
+            ar2 = SingleDimArray(i, dtype=float64_dtype)
+            ar2.get_concrete().setitem(1, float64_dtype.box(5.5))
+            arg = ar2.descr_add(space, ar2)
             ar.setslice(space, 0, step*i, step, i, arg)
-            return ar.get_concrete().storage[3]
+            return ar.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({'getarrayitem_raw': 2,
@@ -267,12 +294,11 @@
         x = x.compute()
         assert isinstance(x, SingleDimArray)
         assert x.size == 10
-        assert x.storage[0] == 0
-        assert x.storage[1] == ((1 + 1) * 1.2) / 1.2 - 1
-    
+        assert x.eval(0).val == 0
+        assert x.eval(1).val == ((1 + 1) * 1.2) / 1.2 - 1
+
     def test_translation(self):
         # we import main to check if the target compiles
         from pypy.translator.goal.targetnumpystandalone import main
-        from pypy.rpython.annlowlevel import llstr
-        
+
         interpret(main, [llstr('af+'), 100])
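
A side note on the NonConstant(...) wrappers that appear in the updated zjit tests above: pypy.rlib.nonconst.NonConstant returns its argument unchanged when run on top of CPython, but makes the annotator treat the value as a run-time variable, which here presumably keeps the dtype (or the False flag) from being constant-folded so the traced loops stay generic. A tiny sketch of the pattern, with the function f invented purely for illustration:

    from pypy.rlib.nonconst import NonConstant

    def f(i):
        # without NonConstant the annotator would see the condition as the
        # constant False and fold the branch away during translation
        if NonConstant(False):
            x = 1
        else:
            x = 2
        return x * i
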
diff --git a/pypy/module/posix/__init__.py b/pypy/module/posix/__init__.py
--- a/pypy/module/posix/__init__.py
+++ b/pypy/module/posix/__init__.py
@@ -161,6 +161,8 @@
         interpleveldefs['mknod'] = 'interp_posix.mknod'
     if hasattr(os, 'nice'):
         interpleveldefs['nice'] = 'interp_posix.nice'
+    if hasattr(os, 'getlogin'):
+        interpleveldefs['getlogin'] = 'interp_posix.getlogin'
 
     for name in ['setsid', 'getuid', 'geteuid', 'getgid', 'getegid', 'setuid',
                  'seteuid', 'setgid', 'setegid', 'getgroups', 'getpgrp', 
diff --git a/pypy/module/posix/interp_posix.py b/pypy/module/posix/interp_posix.py
--- a/pypy/module/posix/interp_posix.py
+++ b/pypy/module/posix/interp_posix.py
@@ -464,6 +464,15 @@
                              space.wrap("strerror() argument out of range"))
     return space.wrap(text)
 
+def getlogin(space):
+    """Return the currently logged in user."""
+    try:
+        cur = os.getlogin()
+    except OSError, e:
+        raise wrap_oserror(space, e)
+    else:
+        return space.wrap(cur)
+
 # ____________________________________________________________
 
 def getstatfields(space):
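
At application level the new posix.getlogin behaves like CPython's os.getlogin(): it returns the login name of the user on the controlling terminal as a str, and an OSError from the underlying call is passed through via wrap_oserror (for example when there is no controlling terminal). A minimal usage sketch; the LOGNAME fallback is only a common convention, not part of this change:

    import os

    try:
        user = os.getlogin()
    except OSError:
        # getlogin() fails without a controlling terminal, e.g. under cron
        user = os.environ.get('LOGNAME', 'unknown')
    print user
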
diff --git a/pypy/module/posix/test/test_posix2.py b/pypy/module/posix/test/test_posix2.py
--- a/pypy/module/posix/test/test_posix2.py
+++ b/pypy/module/posix/test/test_posix2.py
@@ -805,6 +805,16 @@
                 data = f.read()
                 assert data == "who cares?"
 
+    try:
+        os.getlogin()
+    except (AttributeError, OSError):
+        pass
+    else:
+        def test_getlogin(self):
+            assert isinstance(self.posix.getlogin(), str)
+            # How else could we test that getlogin is properly
+            # working?
+
     def test_tmpfile(self):
         os = self.posix
         f = os.tmpfile()
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -9,6 +9,7 @@
 
 from pypy.rpython.tool import rffi_platform
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
+from pypy.translator.platform import platform
 
 import sys
 import py
@@ -19,7 +20,9 @@
     libname = 'expat'
 eci = ExternalCompilationInfo(
     libraries=[libname],
+    library_dirs=platform.preprocess_library_dirs([]),
     includes=['expat.h'],
+    include_dirs=platform.preprocess_include_dirs([]),
     )
 
 eci = rffi_platform.configure_external_library(
diff --git a/pypy/module/pypyjit/interp_jit.py b/pypy/module/pypyjit/interp_jit.py
--- a/pypy/module/pypyjit/interp_jit.py
+++ b/pypy/module/pypyjit/interp_jit.py
@@ -24,6 +24,7 @@
                             'last_exception',
                             'lastblock',
                             'is_being_profiled',
+                            'w_globals',
                             ]
 
 JUMP_ABSOLUTE = opmap['JUMP_ABSOLUTE']
diff --git a/pypy/module/pypyjit/test_pypy_c/test_call.py b/pypy/module/pypyjit/test_pypy_c/test_call.py
--- a/pypy/module/pypyjit/test_pypy_c/test_call.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_call.py
@@ -67,24 +67,14 @@
         assert log.opnames(ops) == ["guard_value",
                                     "getfield_gc", "guard_value",
                                     "getfield_gc", "guard_value",
-                                    "getfield_gc", "guard_nonnull_class"]
-        # LOAD_GLOBAL of OFFSET but in different function partially folded
-        # away
-        # XXX could be improved
+                                    "guard_not_invalidated"]
         ops = entry_bridge.ops_by_id('add', opcode='LOAD_GLOBAL')
-        assert log.opnames(ops) == ["guard_value", "getfield_gc", "guard_value"]
+        assert log.opnames(ops) == ["guard_not_invalidated"]
         #
-        # two LOAD_GLOBAL of f, the second is folded away
         ops = entry_bridge.ops_by_id('call', opcode='LOAD_GLOBAL')
-        assert log.opnames(ops) == ["getfield_gc", "guard_nonnull_class"]
+        assert log.opnames(ops) == []
         #
         assert entry_bridge.match_by_id('call', """
-            p29 = getfield_gc(ConstPtr(ptr28), descr=<GcPtrFieldDescr pypy.objspace.std.celldict.ModuleCell.inst_w_value .*>)
-            guard_nonnull_class(p29, ConstClass(Function), descr=...)
-            p33 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_code .*>)
-            guard_value(p33, ConstPtr(ptr34), descr=...)
-            p35 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_w_func_globals .*>)
-            p36 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_closure .*>)
             p38 = call(ConstClass(getexecutioncontext), descr=<GcPtrCallDescr>)
             p39 = getfield_gc(p38, descr=<GcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_topframeref .*>)
             i40 = force_token()
@@ -100,19 +90,16 @@
         # -----------------------------
         loop, = log.loops_by_id('call')
         assert loop.match("""
-            i12 = int_lt(i5, i6)
-            guard_true(i12, descr=...)
+            guard_not_invalidated(descr=...)
+            i9 = int_lt(i5, i6)
+            guard_true(i9, descr=...)
+            i10 = force_token()
+            i12 = int_add(i5, 1)
             i13 = force_token()
-            i15 = int_add(i5, 1)
-            i16 = int_add_ovf(i15, i7)
-            guard_no_overflow(descr=...)
-            i18 = force_token()
-            i20 = int_add_ovf(i16, 1)
-            guard_no_overflow(descr=...)
-            i21 = int_add_ovf(i20, i7)
+            i15 = int_add_ovf(i12, 1)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, i21, i6, i7, p8, p9, p10, p11, descr=<Loop0>)
+            jump(p0, p1, p2, p3, p4, i15, i6, p7, p8, descr=<Loop0>)
         """)
 
     def test_method_call(self):
diff --git a/pypy/module/pypyjit/test_pypy_c/test_globals.py b/pypy/module/pypyjit/test_pypy_c/test_globals.py
--- a/pypy/module/pypyjit/test_pypy_c/test_globals.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_globals.py
@@ -20,11 +20,9 @@
             guard_value(p10, ConstPtr(ptr11), descr=...)
             p12 = getfield_gc(p10, descr=<GcPtrFieldDescr .*W_DictMultiObject.inst_strategy .*>)
             guard_value(p12, ConstPtr(ptr13), descr=...)
-            p15 = getfield_gc(ConstPtr(ptr14), descr=<GcPtrFieldDescr .*ModuleCell.inst_w_value .*>)
-            guard_isnull(p15, descr=...)
             guard_not_invalidated(descr=...)
             p19 = getfield_gc(ConstPtr(p17), descr=<GcPtrFieldDescr .*W_DictMultiObject.inst_strategy .*>)
             guard_value(p19, ConstPtr(ptr20), descr=...)
             p22 = getfield_gc(ConstPtr(ptr21), descr=<GcPtrFieldDescr .*ModuleCell.inst_w_value .*>)
             guard_nonnull(p22, descr=...)
-        """)
\ No newline at end of file
+        """)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_instance.py b/pypy/module/pypyjit/test_pypy_c/test_instance.py
--- a/pypy/module/pypyjit/test_pypy_c/test_instance.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_instance.py
@@ -52,7 +52,7 @@
             i10 = int_add_ovf(i5, i7)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, i10, i6, p7, i7, p8, descr=<Loop0>)
+            jump(p0, p1, p2, p3, p4, i10, i6, i7, p8, descr=<Loop0>)
         """)
 
     def test_getattr_with_dynamic_attribute(self):
@@ -151,6 +151,7 @@
         assert loop.match_by_id('loadattr',
         '''
         guard_not_invalidated(descr=...)
+        i16 = arraylen_gc(p10, descr=<GcPtrArrayDescr>)
         i19 = call(ConstClass(ll_dict_lookup), _, _, _, descr=...)
         guard_no_exception(descr=...)
         i21 = int_and(i19, _)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_math.py b/pypy/module/pypyjit/test_pypy_c/test_math.py
--- a/pypy/module/pypyjit/test_pypy_c/test_math.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_math.py
@@ -47,6 +47,7 @@
         assert loop.match("""
             i2 = int_lt(i0, i1)
             guard_true(i2, descr=...)
+            guard_not_invalidated(descr=...)
             f1 = cast_int_to_float(i0)
             i3 = float_eq(f1, inf)
             i4 = float_eq(f1, -inf)
@@ -60,4 +61,33 @@
             i7 = int_add(i0, f1)
             --TICK--
             jump(..., descr=)
+        """)
+
+    def test_fmod(self):
+        def main(n):
+            import math
+
+            s = 0
+            while n > 0:
+                s += math.fmod(n, 2.0)
+                n -= 1
+            return s
+        log = self.run(main, [500])
+        assert log.result == main(500)
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match("""
+            i1 = int_gt(i0, 0)
+            guard_true(i1, descr=...)
+            guard_not_invalidated(descr=...)
+            f1 = cast_int_to_float(i0)
+            i2 = float_eq(f1, inf)
+            i3 = float_eq(f1, -inf)
+            i4 = int_or(i2, i3)
+            i5 = int_is_true(i4)
+            guard_false(i5, descr=...)
+            f2 = call(ConstClass(fmod), f1, 2.0, descr=<FloatCallDescr>)
+            f3 = float_add(f0, f2)
+            i6 = int_sub(i0, 1)
+            --TICK--
+            jump(..., descr=)
         """)
\ No newline at end of file
diff --git a/pypy/module/pypyjit/test_pypy_c/test_misc.py b/pypy/module/pypyjit/test_pypy_c/test_misc.py
--- a/pypy/module/pypyjit/test_pypy_c/test_misc.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_misc.py
@@ -268,3 +268,18 @@
             return total
         #
         self.run_and_check(main, [])
+
+
+    def test_global(self):
+        log = self.run("""
+        i = 0
+        globalinc = 1
+        def main(n):
+            global i
+            while i < n:
+                l = globalinc # ID: globalread
+                i += l
+        """, [1000])
+
+        loop, = log.loops_by_id("globalread", is_entry_bridge=True)
+        assert len(loop.ops_by_id("globalread")) == 0
diff --git a/pypy/module/pypyjit/test_pypy_c/test_string.py b/pypy/module/pypyjit/test_pypy_c/test_string.py
--- a/pypy/module/pypyjit/test_pypy_c/test_string.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_string.py
@@ -34,9 +34,9 @@
             i25 = unicodegetitem(p13, i19)
             p27 = newstr(1)
             strsetitem(p27, 0, i23)
-            p30 = call(ConstClass(ll_str2unicode__rpy_stringPtr), p27, descr=<GcPtrCallDescr>)
+            p30 = call(ConstClass(ll_str2unicode__rpy_stringPtr), p27, descr=...)
             guard_no_exception(descr=...)
-            i32 = call(ConstClass(_ll_2_str_eq_checknull_char__rpy_unicodePtr_UniChar), p30, i25, descr=<SignedCallDescr>)
+            i32 = call(ConstClass(_ll_2_str_eq_checknull_char__rpy_unicodePtr_UniChar), p30, i25, descr=...)
             guard_true(i32, descr=...)
             i34 = int_add(i6, 1)
             --TICK--
@@ -105,5 +105,5 @@
             i58 = int_add_ovf(i6, i57)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, p5, i58, i7, i8, p9, p10, descr=<Loop4>)
+            jump(p0, p1, p2, p3, p4, p5, i58, i7, descr=<Loop4>)
         """)
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/_ctypes_test.c b/pypy/module/test_lib_pypy/ctypes_tests/_ctypes_test.c
--- a/pypy/module/test_lib_pypy/ctypes_tests/_ctypes_test.c
+++ b/pypy/module/test_lib_pypy/ctypes_tests/_ctypes_test.c
@@ -481,6 +481,16 @@
 	int a, b, c, d, e, f, g, h;
 } S8I;
 
+
+
+typedef int (*CALLBACK_RECT)(RECT rect);
+
+EXPORT(int) call_callback_with_rect(CALLBACK_RECT cb, RECT rect)
+{
+    return cb(rect);
+}
+
+
 EXPORT(S8I) ret_8i_func(S8I inp)
 {
 	inp.a *= 2;
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py b/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
--- a/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
+++ b/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
@@ -150,7 +150,6 @@
 class TestMoreCallbacks(BaseCTypesTestChecker):
 
     def test_callback_with_struct_argument(self):
-        py.test.skip("callbacks with struct arguments not implemented yet")
         class RECT(Structure):
             _fields_ = [("left", c_int), ("top", c_int),
                         ("right", c_int), ("bottom", c_int)]
@@ -167,6 +166,28 @@
 
         assert res == 1111
 
+    def test_callback_from_c_with_struct_argument(self):
+        import conftest
+        _ctypes_test = str(conftest.sofile)
+        dll = CDLL(_ctypes_test)
+
+        class RECT(Structure):
+            _fields_ = [("left", c_long), ("top", c_long),
+                        ("right", c_long), ("bottom", c_long)]
+
+        proto = CFUNCTYPE(c_int, RECT)
+        def callback(point):
+            return point.left+point.top+point.right+point.bottom
+
+        cbp = proto(callback)
+        rect = RECT(1000,100,10,1)
+
+        call_callback_with_rect = dll.call_callback_with_rect
+        call_callback_with_rect.restype = c_int
+        call_callback_with_rect.argtypes = [proto, RECT]
+        res = call_callback_with_rect(cbp, rect)
+        assert res == 1111
+
     def test_callback_unsupported_return_struct(self):
         class RECT(Structure):
             _fields_ = [("left", c_int), ("top", c_int),
diff --git a/pypy/module/test_lib_pypy/test_greenlet.py b/pypy/module/test_lib_pypy/test_greenlet.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/test_lib_pypy/test_greenlet.py
@@ -0,0 +1,233 @@
+from pypy.conftest import gettestobjspace
+
+
+class AppTestGreenlet:
+    def setup_class(cls):
+        cls.space = gettestobjspace(usemodules=['_continuation'])
+
+    def test_simple(self):
+        from greenlet import greenlet
+        lst = []
+        def f():
+            lst.append(1)
+            greenlet.getcurrent().parent.switch()
+            lst.append(3)
+        g = greenlet(f)
+        lst.append(0)
+        g.switch()
+        lst.append(2)
+        g.switch()
+        lst.append(4)
+        assert lst == range(5)
+
+    def test_parent(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        assert gmain.parent is None
+        g = greenlet(lambda: None)
+        assert g.parent is gmain
+
+    def test_pass_around(self):
+        from greenlet import greenlet
+        seen = []
+        def f(x, y):
+            seen.append((x, y))
+            seen.append(greenlet.getcurrent().parent.switch())
+            seen.append(greenlet.getcurrent().parent.switch(42))
+            return 44, 'z'
+        g = greenlet(f)
+        seen.append(g.switch(40, 'x'))
+        seen.append(g.switch(41, 'y'))
+        seen.append(g.switch(43))
+        #
+        def f2():
+            return 45
+        g = greenlet(f2)
+        seen.append(g.switch())
+        #
+        def f3():
+            pass
+        g = greenlet(f3)
+        seen.append(g.switch())
+        #
+        assert seen == [(40, 'x'), (), (41, 'y'), 42, 43, (44, 'z'), 45, None]
+
+    def test_exception_simple(self):
+        from greenlet import greenlet
+        #
+        def fmain():
+            raise ValueError
+        #
+        g1 = greenlet(fmain)
+        raises(ValueError, g1.switch)
+
+    def test_dead(self):
+        from greenlet import greenlet
+        #
+        def fmain():
+            assert g1 and not g1.dead
+        #
+        g1 = greenlet(fmain)
+        assert not g1 and not g1.dead
+        g1.switch()
+        assert not g1 and g1.dead
+        #
+        gmain = greenlet.getcurrent()
+        assert gmain and not gmain.dead
+
+    def test_GreenletExit(self):
+        from greenlet import greenlet, GreenletExit
+        #
+        def fmain(*args):
+            raise GreenletExit(*args)
+        #
+        g1 = greenlet(fmain)
+        res = g1.switch('foo', 'bar')
+        assert isinstance(res, GreenletExit) and res.args == ('foo', 'bar')
+
+    def test_throw_1(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f():
+            try:
+                gmain.switch()
+            except ValueError:
+                return "ok"
+        #
+        g = greenlet(f)
+        g.switch()
+        res = g.throw(ValueError)
+        assert res == "ok"
+
+    def test_throw_2(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f():
+            gmain.throw(ValueError)
+        #
+        g = greenlet(f)
+        raises(ValueError, g.switch)
+
+    def test_throw_3(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        raises(ValueError, gmain.throw, ValueError)
+
+    def test_throw_4(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f1():
+            g2.throw(ValueError)
+        #
+        def f2():
+            try:
+                gmain.switch()
+            except ValueError:
+                return "ok"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.switch()
+        res = g1.switch()
+        assert res == "ok"
+
+    def test_nondefault_parent(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            g2 = greenlet(f2)
+            res = g2.switch()
+            assert res == "from 2"
+            return "from 1"
+        #
+        def f2():
+            return "from 2"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "from 1"
+
+    def test_change_parent(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            res = g2.switch()
+            assert res == "from 2"
+            return "from 1"
+        #
+        def f2():
+            return "from 2"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        res = g1.switch()
+        assert res == "from 1"
+
+    def test_raises_through_parent_chain(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            raises(IndexError, g2.switch)
+            raise ValueError
+        #
+        def f2():
+            raise IndexError
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        raises(ValueError, g1.switch)
+
+    def test_switch_to_dead_1(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "ok"
+        res = g1.switch("goes to gmain instead")
+        assert res == "goes to gmain instead"
+
+    def test_switch_to_dead_2(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            g2 = greenlet(f2)
+            return g2.switch()
+        #
+        def f2():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "ok"
+        res = g1.switch("goes to gmain instead")
+        assert res == "goes to gmain instead"
+
+    def test_switch_to_dead_3(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f1():
+            res = g2.switch()
+            assert res == "ok"
+            res = gmain.switch("next step")
+            assert res == "goes to f1 instead"
+            return "all ok"
+        #
+        def f2():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        res = g1.switch()
+        assert res == "next step"
+        res = g2.switch("goes to f1 instead")
+        assert res == "all ok"
diff --git a/pypy/module/thread/os_thread.py b/pypy/module/thread/os_thread.py
--- a/pypy/module/thread/os_thread.py
+++ b/pypy/module/thread/os_thread.py
@@ -15,11 +15,6 @@
 # * The start-up data (the app-level callable and arguments) is
 #   stored in the global bootstrapper object.
 #
-# * The GC is notified that a new thread is about to start; in the
-#   framework GC with shadow stacks, this allocates a fresh new shadow
-#   stack (but doesn't use it yet).  See gc_thread_prepare().  This
-#   has no effect in asmgcc.
-#
 # * The new thread is launched at RPython level using an rffi call
 #   to the C function RPyThreadStart() defined in
 #   translator/c/src/thread*.h.  This RPython thread will invoke the
@@ -33,8 +28,8 @@
 #   operation is called (this is all done by gil.after_external_call(),
 #   called from the rffi-generated wrapper).  The gc_thread_run()
 #   operation will automatically notice that the current thread id was
-#   not seen before, and start using the freshly prepared shadow stack.
-#   Again, this has no effect in asmgcc.
+#   not seen before, and (in shadowstack) it will allocate and use a
+#   fresh new stack.  Again, this has no effect in asmgcc.
 #
 # * Only then does bootstrap() really run.  The first thing it does
 #   is grab the start-up information (app-level callable and args)
@@ -180,7 +175,7 @@
     bootstrapper.acquire(space, w_callable, args)
     try:
         try:
-            thread.gc_thread_prepare()
+            thread.gc_thread_prepare()     # (this has no effect any more)
             ident = thread.start_new_thread(bootstrapper.bootstrap, ())
         except Exception, e:
             bootstrapper.release()     # normally called by the new thread
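
For illustration only (not part of this changeset): the hand-off that the
comments above describe -- start-up data parked in a global bootstrapper,
grabbed by the new thread, released by whoever gets there -- looks roughly
like this in plain Python, with hypothetical names standing in for the
RPython machinery:

    import threading

    class Bootstrapper(object):
        # one global slot for the start-up data; the lock serializes
        # concurrent launches so the slot is never overwritten too early
        lock = threading.Lock()
        func = None
        args = None

        @classmethod
        def acquire(cls, func, args):
            cls.lock.acquire()
            cls.func = func
            cls.args = args

        @classmethod
        def release(cls):
            cls.func = None
            cls.args = None
            cls.lock.release()

        @classmethod
        def bootstrap(cls):
            # runs in the new thread: grab the start-up info, free the
            # slot, then actually call the user function
            func, args = cls.func, cls.args
            cls.release()
            func(*args)

    def start_new_thread(func, args):
        Bootstrapper.acquire(func, args)
        try:
            threading.Thread(target=Bootstrapper.bootstrap).start()
        except Exception:
            Bootstrapper.release()   # normally called by the new thread
            raise
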
diff --git a/pypy/objspace/descroperation.py b/pypy/objspace/descroperation.py
--- a/pypy/objspace/descroperation.py
+++ b/pypy/objspace/descroperation.py
@@ -724,13 +724,22 @@
         w_left_src, w_left_impl = space.lookup_in_type_where(w_typ1, left)
         w_first = w_obj1
         w_second = w_obj2
-
-        if _same_class_w(space, w_obj1, w_obj2, w_typ1, w_typ2):
+        #
+        if left == right and _same_class_w(space, w_obj1, w_obj2,
+                                           w_typ1, w_typ2):
+            # for __eq__ and __ne__, if the objects have the same
+            # (old-style or new-style) class, then don't try the
+            # opposite method, which is the same one.
             w_right_impl = None
         else:
-            w_right_src, w_right_impl = space.lookup_in_type_where(w_typ2, right)
-            # XXX see binop_impl
-            if space.is_true(space.issubtype(w_typ2, w_typ1)):
+            # in all other cases, try the opposite method.
+            w_right_src, w_right_impl = space.lookup_in_type_where(w_typ2,right)
+            if space.is_w(w_typ1, w_typ2):
+                # if the type is the same, *or* if both are old-style classes,
+                # then don't reverse: try left first, right next.
+                pass
+            elif space.is_true(space.issubtype(w_typ2, w_typ1)):
+                # for new-style classes, if typ2 is a subclass of typ1.
                 w_obj1, w_obj2 = w_obj2, w_obj1
                 w_left_impl, w_right_impl = w_right_impl, w_left_impl
 
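
For illustration only (not part of this changeset): the priority rule the
comments above describe can be sketched in plain Python (simplified -- it
ignores old-style classes and, like this hunk, only looks at the types):

    def comparison_order(obj1, obj2, left, right):
        # 'left' and 'right' are method names, e.g. '__lt__' and '__gt__',
        # or '__eq__' twice for the symmetric operators.
        typ1, typ2 = type(obj1), type(obj2)
        if left == right and typ1 is typ2:
            # __eq__/__ne__ between two objects of the same class: the
            # reflected method would be the very same one, so skip it.
            return [(obj1, left)]
        if typ1 is not typ2 and issubclass(typ2, typ1):
            # the subclass's reflected method gets the first try
            return [(obj2, right), (obj1, left)]
        return [(obj1, left), (obj2, right)]

    class A(object): pass
    class B(A): pass
    a, b = A(), B()
    assert comparison_order(a, a, '__eq__', '__eq__') == [(a, '__eq__')]
    assert comparison_order(a, b, '__lt__', '__gt__') == [(b, '__gt__'),
                                                          (a, '__lt__')]

This matches the new tests in test_descroperation.py further down: for
A() < B() with B a subclass of A, B.__gt__ is tried before A.__lt__.
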
diff --git a/pypy/objspace/std/celldict.py b/pypy/objspace/std/celldict.py
--- a/pypy/objspace/std/celldict.py
+++ b/pypy/objspace/std/celldict.py
@@ -1,50 +1,57 @@
-""" A very simple cell dict implementation. The dictionary maps keys to cell.
-This ensures that the function (dict, key) -> cell is pure. By itself, this
-optimization is not helping at all, but in conjunction with the JIT it can
-speed up global lookups a lot."""
+""" A very simple cell dict implementation using a version tag. The dictionary
+maps keys to objects. If a specific key is changed a lot, a level of
+indirection is introduced to make the version tag change less often.
+"""
 
+from pypy.interpreter.baseobjspace import W_Root
 from pypy.objspace.std.dictmultiobject import IteratorImplementation
 from pypy.objspace.std.dictmultiobject import DictStrategy, _never_equal_to_string
 from pypy.objspace.std.dictmultiobject import ObjectDictStrategy
 from pypy.rlib import jit, rerased
 
-class ModuleCell(object):
+class VersionTag(object):
+    pass
+
+class ModuleCell(W_Root):
     def __init__(self, w_value=None):
         self.w_value = w_value
 
-    def invalidate(self):
-        w_value = self.w_value
-        self.w_value = None
-        return w_value
-
     def __repr__(self):
         return "<ModuleCell: %s>" % (self.w_value, )
 
+def unwrap_cell(w_value):
+    if isinstance(w_value, ModuleCell):
+        return w_value.w_value
+    return w_value
+
 class ModuleDictStrategy(DictStrategy):
 
     erase, unerase = rerased.new_erasing_pair("modulecell")
     erase = staticmethod(erase)
     unerase = staticmethod(unerase)
 
+    _immutable_fields_ = ["version?"]
+
     def __init__(self, space):
         self.space = space
+        self.version = VersionTag()
 
     def get_empty_storage(self):
        return self.erase({})
 
-    def getcell(self, w_dict, key, makenew):
-        if makenew or jit.we_are_jitted():
-            # when we are jitting, we always go through the pure function
-            # below, to ensure that we have no residual dict lookup
-            w_dict = jit.promote(w_dict)
-            self = jit.promote(self)
-            return self._getcell_makenew(w_dict, key)
+    def mutated(self):
+       self.version = VersionTag()
+
+    def getdictvalue_no_unwrapping(self, w_dict, key):
+        # NB: it's important to promote self here, so that the read of
+        # self.version becomes a no-op due to the quasi-immutable field
+        self = jit.promote(self)
+        return self._getdictvalue_no_unwrapping_pure(self.version, w_dict, key)
+
+    @jit.elidable_promote('0,1,2')
+    def _getdictvalue_no_unwrapping_pure(self, version, w_dict, key):
         return self.unerase(w_dict.dstorage).get(key, None)
 
-    @jit.elidable
-    def _getcell_makenew(self, w_dict, key):
-        return self.unerase(w_dict.dstorage).setdefault(key, ModuleCell())
-
     def setitem(self, w_dict, w_key, w_value):
         space = self.space
         if space.is_w(space.type(w_key), space.w_str):
@@ -54,15 +61,24 @@
             w_dict.setitem(w_key, w_value)
 
     def setitem_str(self, w_dict, key, w_value):
-        self.getcell(w_dict, key, True).w_value = w_value
+        cell = self.getdictvalue_no_unwrapping(w_dict, key)
+        if isinstance(cell, ModuleCell):
+            cell.w_value = w_value
+            return
+        if cell is not None:
+            w_value = ModuleCell(w_value)
+        self.mutated()
+        self.unerase(w_dict.dstorage)[key] = w_value
 
     def setdefault(self, w_dict, w_key, w_default):
         space = self.space
         if space.is_w(space.type(w_key), space.w_str):
-            cell = self.getcell(w_dict, space.str_w(w_key), True)
-            if cell.w_value is None:
-                cell.w_value = w_default
-            return cell.w_value
+            key = space.str_w(w_key)
+            w_result = self.getitem_str(w_dict, key)
+            if w_result is not None:
+                return w_result
+            self.setitem_str(w_dict, key, w_default)
+            return w_default
         else:
             self.switch_to_object_strategy(w_dict)
             return w_dict.setdefault(w_key, w_default)
@@ -72,14 +88,13 @@
         w_key_type = space.type(w_key)
         if space.is_w(w_key_type, space.w_str):
             key = space.str_w(w_key)
-            cell = self.getcell(w_dict, key, False)
-            if cell is None or cell.w_value is None:
-                raise KeyError
-            # note that we don't remove the cell from self.content, to make
-            # sure that a key that was found at any point in the dict, still
-            # maps to the same cell later (even if this cell no longer
-            # represents a key)
-            cell.invalidate()
+            dict_w = self.unerase(w_dict.dstorage)
+            try:
+                del dict_w[key]
+            except KeyError:
+                raise
+            else:
+                self.mutated()
         elif _never_equal_to_string(space, w_key_type):
             raise KeyError
         else:
@@ -87,12 +102,7 @@
             w_dict.delitem(w_key)
 
     def length(self, w_dict):
-        # inefficient, but do we care?
-        res = 0
-        for cell in self.unerase(w_dict.dstorage).itervalues():
-            if cell.w_value is not None:
-                res += 1
-        return res
+        return len(self.unerase(w_dict.dstorage))
 
     def getitem(self, w_dict, w_key):
         space = self.space
@@ -107,11 +117,8 @@
             return w_dict.getitem(w_key)
 
     def getitem_str(self, w_dict, key):
-        res = self.getcell(w_dict, key, False)
-        if res is None:
-            return None
-        # note that even if the res.w_value is None, the next line is fine
-        return res.w_value
+        w_res = self.getdictvalue_no_unwrapping(w_dict, key)
+        return unwrap_cell(w_res)
 
     def iter(self, w_dict):
         return ModuleDictIteratorImplementation(self.space, self, w_dict)
@@ -119,44 +126,34 @@
     def keys(self, w_dict):
         space = self.space
         iterator = self.unerase(w_dict.dstorage).iteritems
-        return [space.wrap(key) for key, cell in iterator()
-                    if cell.w_value is not None]
+        return [space.wrap(key) for key, cell in iterator()]
 
     def values(self, w_dict):
         iterator = self.unerase(w_dict.dstorage).itervalues
-        return [cell.w_value for cell in iterator()
-                    if cell.w_value is not None]
+        return [unwrap_cell(cell) for cell in iterator()]
 
     def items(self, w_dict):
         space = self.space
         iterator = self.unerase(w_dict.dstorage).iteritems
-        return [space.newtuple([space.wrap(key), cell.w_value])
-                    for (key, cell) in iterator()
-                        if cell.w_value is not None]
+        return [space.newtuple([space.wrap(key), unwrap_cell(cell)])
+                    for key, cell in iterator()]
 
     def clear(self, w_dict):
-        iterator = self.unerase(w_dict.dstorage).iteritems
-        for k, cell in iterator():
-            cell.invalidate()
+        self.unerase(w_dict.dstorage).clear()
+        self.mutated()
 
     def popitem(self, w_dict):
-        # This is O(n) if called repeatadly, you probably shouldn't be on a
-        # Module's dict though
-        for k, cell in self.unerase(w_dict.dstorage).iteritems():
-            if cell.w_value is not None:
-                w_value = cell.w_value
-                cell.invalidate()
-                return self.space.wrap(k), w_value
-        else:
-            raise KeyError
+        d = self.unerase(w_dict.dstorage)
+        key, w_value = d.popitem()
+        self.mutated()
+        return self.space.wrap(key), unwrap_cell(w_value)
 
     def switch_to_object_strategy(self, w_dict):
         d = self.unerase(w_dict.dstorage)
         strategy = self.space.fromcache(ObjectDictStrategy)
         d_new = strategy.unerase(strategy.get_empty_storage())
         for key, cell in d.iteritems():
-            if cell.w_value is not None:
-                d_new[self.space.wrap(key)] = cell.w_value
+            d_new[self.space.wrap(key)] = unwrap_cell(cell)
         w_dict.strategy = strategy
         w_dict.dstorage = strategy.erase(d_new)
 
@@ -168,7 +165,6 @@
 
     def next_entry(self):
         for key, cell in self.iterator:
-            if cell.w_value is not None:
-                return (self.space.wrap(key), cell.w_value)
+            return (self.space.wrap(key), unwrap_cell(cell))
         else:
             return None, None
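
For illustration only (not part of this changeset), the version-tag scheme
described in the new docstring can be sketched in plain Python; the names
below are hypothetical:

    class VersionTag(object):
        pass

    class Cell(object):
        # the extra indirection used for keys that are written repeatedly
        def __init__(self, value):
            self.value = value

    class VersionedDict(object):
        def __init__(self):
            self.content = {}
            self.version = VersionTag()

        def _mutated(self):
            self.version = VersionTag()

        def setitem(self, key, value):
            cur = self.content.get(key)
            if isinstance(cur, Cell):
                # hot key: mutate the cell, the version tag stays the same
                cur.value = value
                return
            if cur is not None:
                # second write to an existing key: add the indirection
                value = Cell(value)
            self._mutated()
            self.content[key] = value

        def getitem(self, key):
            res = self.content.get(key)
            if isinstance(res, Cell):
                return res.value
            return res

        def delitem(self, key):
            del self.content[key]
            self._mutated()

    d = VersionedDict()
    v1 = d.version; d.setitem("a", 1)
    v2 = d.version; d.setitem("a", 2)
    v3 = d.version; d.setitem("a", 3)
    v4 = d.version
    assert v1 is not v2 and v2 is not v3 and v3 is v4
    assert d.getitem("a") == 3

Keeping the version stable for hot keys is what lets a lookup keyed on
(version, key) -- the role played by _getdictvalue_no_unwrapping_pure()
above -- stay valid across repeated writes to the same key.
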
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -38,7 +38,9 @@
         if space.config.objspace.std.withcelldict and module:
             from pypy.objspace.std.celldict import ModuleDictStrategy
             assert w_type is None
-            strategy = space.fromcache(ModuleDictStrategy)
+            # every module needs its own strategy, because the strategy stores
+            # the version tag
+            strategy = ModuleDictStrategy(space)
 
         elif instance or strdict or module:
             assert w_type is None
diff --git a/pypy/objspace/std/test/test_celldict.py b/pypy/objspace/std/test/test_celldict.py
--- a/pypy/objspace/std/test/test_celldict.py
+++ b/pypy/objspace/std/test/test_celldict.py
@@ -2,42 +2,110 @@
 from pypy.conftest import gettestobjspace, option
 from pypy.objspace.std.dictmultiobject import W_DictMultiObject
 from pypy.objspace.std.celldict import ModuleCell, ModuleDictStrategy
-from pypy.objspace.std.test.test_dictmultiobject import FakeSpace
+from pypy.objspace.std.test.test_dictmultiobject import FakeSpace, \
+        BaseTestRDictImplementation, BaseTestDevolvedDictImplementation
 from pypy.interpreter import gateway
 
 space = FakeSpace()
 
 class TestCellDict(object):
-    def test_basic_property(self):
+    def test_basic_property_cells(self):
         strategy = ModuleDictStrategy(space)
         storage = strategy.get_empty_storage()
         d = W_DictMultiObject(space, strategy, storage)
 
-        # replace getcell with getcell from strategy
-        def f(key, makenew):
-            return strategy.getcell(d, key, makenew)
-        d.getcell = f
+        v1 = strategy.version
+        d.setitem("a", 1)
+        v2 = strategy.version
+        assert v1 is not v2
+        assert d.getitem("a") == 1
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a") == 1
 
-        d.setitem("a", 1)
-        assert d.getcell("a", False) is d.getcell("a", False)
-        acell = d.getcell("a", False)
-        d.setitem("b", 2)
-        assert d.getcell("b", False) is d.getcell("b", False)
-        assert d.getcell("c", True) is d.getcell("c", True)
+        d.setitem("a", 2)
+        v3 = strategy.version
+        assert v2 is not v3
+        assert d.getitem("a") == 2
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a").w_value == 2
 
-        assert d.getitem("a") == 1
-        assert d.getitem("b") == 2
+        d.setitem("a", 3)
+        v4 = strategy.version
+        assert v3 is v4
+        assert d.getitem("a") == 3
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a").w_value == 3
 
         d.delitem("a")
-        py.test.raises(KeyError, d.delitem, "a")
+        v5 = strategy.version
+        assert v5 is not v4
         assert d.getitem("a") is None
-        assert d.getcell("a", False) is acell
-        assert d.length() == 1
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a") is None
 
-        d.clear()
-        assert d.getitem("a") is None
-        assert d.getcell("a", False) is acell
-        assert d.length() == 0
+class AppTestModuleDict(object):
+    def setup_class(cls):
+        cls.space = gettestobjspace(**{"objspace.std.withcelldict": True})
+        cls.w_runappdirect = cls.space.wrap(option.runappdirect)
+
+    def w_impl_used(self, obj):
+        if self.runappdirect:
+            skip("__repr__ doesn't work on appdirect")
+        import __pypy__
+        assert "ModuleDictStrategy" in __pypy__.internal_repr(obj)
+
+    def test_check_module_uses_module_dict(self):
+        m = type(__builtins__)("abc")
+        self.impl_used(m.__dict__)
+
+    def test_key_not_there(self):
+        d = type(__builtins__)("abc").__dict__
+        raises(KeyError, "d['def']")
+
+    def test_fallback_evil_key(self):
+        class F(object):
+            def __hash__(self):
+                return hash("s")
+            def __eq__(self, other):
+                return other == "s"
+        d = type(__builtins__)("abc").__dict__
+        d["s"] = 12
+        assert d["s"] == 12
+        assert d[F()] == d["s"]
+
+        d = type(__builtins__)("abc").__dict__
+        x = d.setdefault("s", 12)
+        assert x == 12
+        x = d.setdefault(F(), 12)
+        assert x == 12
+
+        d = type(__builtins__)("abc").__dict__
+        x = d.setdefault(F(), 12)
+        assert x == 12
+
+        d = type(__builtins__)("abc").__dict__
+        d["s"] = 12
+        del d[F()]
+
+        assert "s" not in d
+        assert F() not in d
+
+
+class TestModuleDictImplementation(BaseTestRDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+class TestModuleDictImplementationWithBuiltinNames(BaseTestRDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+    string = "int"
+    string2 = "isinstance"
+
+
+class TestDevolvedModuleDictImplementation(BaseTestDevolvedDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+class TestDevolvedModuleDictImplementationWithBuiltinNames(BaseTestDevolvedDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+    string = "int"
+    string2 = "isinstance"
+
 
 class AppTestCellDict(object):
     OPTIONS = {"objspace.std.withcelldict": True}
@@ -67,4 +135,4 @@
         d["a"] = 3
         del d["a"]
         d[object()] = 5
-        assert d.values() == [5]
\ No newline at end of file
+        assert d.values() == [5]
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -5,7 +5,6 @@
      W_DictMultiObject, setitem__DictMulti_ANY_ANY, getitem__DictMulti_ANY, \
      StringDictStrategy, ObjectDictStrategy
 
-from pypy.objspace.std.celldict import ModuleDictStrategy
 from pypy.conftest import gettestobjspace
 from pypy.conftest import option
 
@@ -731,52 +730,6 @@
                 set([('a', 1), ('b', 2), ('d', 4), ('e', 5)]))
 
 
-class AppTestModuleDict(object):
-    def setup_class(cls):
-        cls.space = gettestobjspace(**{"objspace.std.withcelldict": True})
-        if option.runappdirect:
-            py.test.skip("__repr__ doesn't work on appdirect")
-
-    def w_impl_used(self, obj):
-        import __pypy__
-        assert "ModuleDictStrategy" in __pypy__.internal_repr(obj)
-
-    def test_check_module_uses_module_dict(self):
-        m = type(__builtins__)("abc")
-        self.impl_used(m.__dict__)
-
-    def test_key_not_there(self):
-        d = type(__builtins__)("abc").__dict__
-        raises(KeyError, "d['def']")
-
-    def test_fallback_evil_key(self):
-        class F(object):
-            def __hash__(self):
-                return hash("s")
-            def __eq__(self, other):
-                return other == "s"
-        d = type(__builtins__)("abc").__dict__
-        d["s"] = 12
-        assert d["s"] == 12
-        assert d[F()] == d["s"]
-
-        d = type(__builtins__)("abc").__dict__
-        x = d.setdefault("s", 12)
-        assert x == 12
-        x = d.setdefault(F(), 12)
-        assert x == 12
-
-        d = type(__builtins__)("abc").__dict__
-        x = d.setdefault(F(), 12)
-        assert x == 12
-
-        d = type(__builtins__)("abc").__dict__
-        d["s"] = 12
-        del d[F()]
-
-        assert "s" not in d
-        assert F() not in d
-
 class AppTestStrategies(object):
     def setup_class(cls):
         if option.runappdirect:
@@ -1071,16 +1024,6 @@
 ##     ImplementionClass = MeasuringDictImplementation
 ##     DevolvedClass = MeasuringDictImplementation
 
-class TestModuleDictImplementation(BaseTestRDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-class TestModuleDictImplementationWithBuiltinNames(BaseTestRDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-    string = "int"
-    string2 = "isinstance"
-
-
 class BaseTestDevolvedDictImplementation(BaseTestRDictImplementation):
     def fill_impl(self):
         BaseTestRDictImplementation.fill_impl(self)
@@ -1092,15 +1035,6 @@
 class TestDevolvedStrDictImplementation(BaseTestDevolvedDictImplementation):
     StrategyClass = StringDictStrategy
 
-class TestDevolvedModuleDictImplementation(BaseTestDevolvedDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-class TestDevolvedModuleDictImplementationWithBuiltinNames(BaseTestDevolvedDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-    string = "int"
-    string2 = "isinstance"
-
 
 def test_module_uses_strdict():
     fakespace = FakeSpace()
diff --git a/pypy/objspace/test/test_descroperation.py b/pypy/objspace/test/test_descroperation.py
--- a/pypy/objspace/test/test_descroperation.py
+++ b/pypy/objspace/test/test_descroperation.py
@@ -377,7 +377,26 @@
 
         setattr(P, "__weakref__", 0)
 
+    def test_subclass_addition(self):
+        # the __radd__ is never called (compare with the next test)
+        l = []
+        class A(object):
+            def __add__(self, other):
+                l.append(self.__class__)
+                l.append(other.__class__)
+                return 123
+            def __radd__(self, other):
+                # should never be called!
+                return 456
+        class B(A):
+            pass
+        res1 = A() + B()
+        res2 = B() + A()
+        assert res1 == res2 == 123
+        assert l == [A, B, B, A]
+
     def test_subclass_comparison(self):
+        # the __eq__ *is* called with reversed arguments
         l = []
         class A(object):
             def __eq__(self, other):
@@ -395,7 +414,27 @@
 
         A() == B()
         A() < B()
-        assert l == [B, A, A, B]
+        B() < A()
+        assert l == [B, A, A, B, B, A]
+
+    def test_subclass_comparison_more(self):
+        # similarly, __gt__(b,a) is called instead of __lt__(a,b)
+        l = []
+        class A(object):
+            def __lt__(self, other):
+                l.append(self.__class__)
+                l.append(other.__class__)
+                return '<'
+            def __gt__(self, other):
+                l.append(self.__class__)
+                l.append(other.__class__)
+                return '>'
+        class B(A):
+            pass
+        res1 = A() < B()
+        res2 = B() < A()
+        assert res1 == '>' and res2 == '<'
+        assert l == [B, A, B, A]
 
     def test_rich_comparison(self):
         # Old-style
@@ -434,6 +473,84 @@
         assert not(C(1) == D(2))
         assert not(D(1) == C(2))
 
+    def test_partial_ordering(self):
+        class A(object):
+            def __lt__(self, other):
+                return self
+        a1 = A()
+        a2 = A()
+        assert (a1 < a2) is a1
+        assert (a1 > a2) is a2
+
+    def test_eq_order(self):
+        class A(object):
+            def __eq__(self, other): return self.__class__.__name__+':A.eq'
+            def __ne__(self, other): return self.__class__.__name__+':A.ne'
+            def __lt__(self, other): return self.__class__.__name__+':A.lt'
+            def __le__(self, other): return self.__class__.__name__+':A.le'
+            def __gt__(self, other): return self.__class__.__name__+':A.gt'
+            def __ge__(self, other): return self.__class__.__name__+':A.ge'
+        class B(object):
+            def __eq__(self, other): return self.__class__.__name__+':B.eq'
+            def __ne__(self, other): return self.__class__.__name__+':B.ne'
+            def __lt__(self, other): return self.__class__.__name__+':B.lt'
+            def __le__(self, other): return self.__class__.__name__+':B.le'
+            def __gt__(self, other): return self.__class__.__name__+':B.gt'
+            def __ge__(self, other): return self.__class__.__name__+':B.ge'
+        #
+        assert (A() == B()) == 'A:A.eq'
+        assert (A() != B()) == 'A:A.ne'
+        assert (A() <  B()) == 'A:A.lt'
+        assert (A() <= B()) == 'A:A.le'
+        assert (A() >  B()) == 'A:A.gt'
+        assert (A() >= B()) == 'A:A.ge'
+        #
+        assert (B() == A()) == 'B:B.eq'
+        assert (B() != A()) == 'B:B.ne'
+        assert (B() <  A()) == 'B:B.lt'
+        assert (B() <= A()) == 'B:B.le'
+        assert (B() >  A()) == 'B:B.gt'
+        assert (B() >= A()) == 'B:B.ge'
+        #
+        class C(A):
+            def __eq__(self, other): return self.__class__.__name__+':C.eq'
+            def __ne__(self, other): return self.__class__.__name__+':C.ne'
+            def __lt__(self, other): return self.__class__.__name__+':C.lt'
+            def __le__(self, other): return self.__class__.__name__+':C.le'
+            def __gt__(self, other): return self.__class__.__name__+':C.gt'
+            def __ge__(self, other): return self.__class__.__name__+':C.ge'
+        #
+        assert (A() == C()) == 'C:C.eq'
+        assert (A() != C()) == 'C:C.ne'
+        assert (A() <  C()) == 'C:C.gt'
+        assert (A() <= C()) == 'C:C.ge'
+        assert (A() >  C()) == 'C:C.lt'
+        assert (A() >= C()) == 'C:C.le'
+        #
+        assert (C() == A()) == 'C:C.eq'
+        assert (C() != A()) == 'C:C.ne'
+        assert (C() <  A()) == 'C:C.lt'
+        assert (C() <= A()) == 'C:C.le'
+        assert (C() >  A()) == 'C:C.gt'
+        assert (C() >= A()) == 'C:C.ge'
+        #
+        class D(A):
+            pass
+        #
+        assert (A() == D()) == 'D:A.eq'
+        assert (A() != D()) == 'D:A.ne'
+        assert (A() <  D()) == 'D:A.gt'
+        assert (A() <= D()) == 'D:A.ge'
+        assert (A() >  D()) == 'D:A.lt'
+        assert (A() >= D()) == 'D:A.le'
+        #
+        assert (D() == A()) == 'D:A.eq'
+        assert (D() != A()) == 'D:A.ne'
+        assert (D() <  A()) == 'D:A.lt'
+        assert (D() <= A()) == 'D:A.le'
+        assert (D() >  A()) == 'D:A.gt'
+        assert (D() >= A()) == 'D:A.ge'
+
     def test_addition(self):
         # Old-style
         class A:
diff --git a/pypy/pytest-A-stackless.cfg b/pypy/pytest-A-stackless.cfg
deleted file mode 100644
--- a/pypy/pytest-A-stackless.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-# run for some directories a file at a time
-
-def collect_one_testdir(testdirs, reldir, tests):
-    if (reldir.startswith('module/_stackless/') or
-        reldir.startswith('lib')):
-        testdirs.extend(tests)
-    else:     
-        testdirs.append(reldir)
-
-    
diff --git a/pypy/rlib/_rffi_stacklet.py b/pypy/rlib/_rffi_stacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_rffi_stacklet.py
@@ -0,0 +1,49 @@
+import py
+from pypy.tool.autopath import pypydir
+from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.translator.tool.cbuild import ExternalCompilationInfo
+from pypy.rpython.tool import rffi_platform
+
+
+cdir = py.path.local(pypydir) / 'translator' / 'c'
+
+
+eci = ExternalCompilationInfo(
+    include_dirs = [cdir],
+    includes = ['src/stacklet/stacklet.h'],
+    separate_module_sources = ['#include "src/stacklet/stacklet.c"\n'],
+)
+rffi_platform.verify_eci(eci.convert_sources_to_files())
+
+def llexternal(name, args, result, **kwds):
+    return rffi.llexternal(name, args, result, compilation_info=eci,
+                           _nowrapper=True, **kwds)
+
+# ----- types -----
+
+handle = rffi.COpaquePtr(typedef='stacklet_handle', compilation_info=eci)
+thread_handle = rffi.COpaquePtr(typedef='stacklet_thread_handle',
+                                compilation_info=eci)
+run_fn = lltype.Ptr(lltype.FuncType([handle, llmemory.Address], handle))
+
+# ----- constants -----
+
+null_handle = lltype.nullptr(handle.TO)
+
+def is_empty_handle(h):
+    return rffi.cast(lltype.Signed, h) == -1
+
+# ----- functions -----
+
+newthread = llexternal('stacklet_newthread', [], thread_handle)
+deletethread = llexternal('stacklet_deletethread',[thread_handle], lltype.Void)
+
+new = llexternal('stacklet_new', [thread_handle, run_fn, llmemory.Address],
+                 handle, random_effects_on_gcobjs=True)
+switch = llexternal('stacklet_switch', [thread_handle, handle], handle,
+                    random_effects_on_gcobjs=True)
+destroy = llexternal('stacklet_destroy', [thread_handle, handle], lltype.Void)
+
+_translate_pointer = llexternal("_stacklet_translate_pointer",
+                                [llmemory.Address, llmemory.Address],
+                                llmemory.Address)
diff --git a/pypy/rlib/_rsocket_rffi.py b/pypy/rlib/_rsocket_rffi.py
--- a/pypy/rlib/_rsocket_rffi.py
+++ b/pypy/rlib/_rsocket_rffi.py
@@ -489,10 +489,10 @@
 getnameinfo = external('getnameinfo', [sockaddr_ptr, socklen_t, CCHARP,
                        size_t, CCHARP, size_t, rffi.INT], rffi.INT)
 
-htonl = external('htonl', [rffi.UINT], rffi.UINT)
-htons = external('htons', [rffi.USHORT], rffi.USHORT)
-ntohl = external('ntohl', [rffi.UINT], rffi.UINT)
-ntohs = external('ntohs', [rffi.USHORT], rffi.USHORT)
+htonl = external('htonl', [rffi.UINT], rffi.UINT, threadsafe=False)
+htons = external('htons', [rffi.USHORT], rffi.USHORT, threadsafe=False)
+ntohl = external('ntohl', [rffi.UINT], rffi.UINT, threadsafe=False)
+ntohs = external('ntohs', [rffi.USHORT], rffi.USHORT, threadsafe=False)
 
 if _POSIX:
     inet_aton = external('inet_aton', [CCHARP, lltype.Ptr(in_addr)],
diff --git a/pypy/rlib/_stacklet_asmgcc.py b/pypy/rlib/_stacklet_asmgcc.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_asmgcc.py
@@ -0,0 +1,277 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rlib.debug import ll_assert
+from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.rpython.lltypesystem.lloperation import llop
+from pypy.rpython.annlowlevel import llhelper
+
+
+_asmstackrootwalker = None    # BIG HACK: monkey-patched by asmgcroot.py
+_stackletrootwalker = None
+
+def get_stackletrootwalker():
+    # lazily called, to make the following imports lazy
+    global _stackletrootwalker
+    if _stackletrootwalker is not None:
+        return _stackletrootwalker
+
+    from pypy.rpython.memory.gctransform.asmgcroot import (
+        WALKFRAME, CALLEE_SAVED_REGS, INDEX_OF_EBP, sizeofaddr)
+
+    assert _asmstackrootwalker is not None, "should have been monkey-patched"
+    basewalker = _asmstackrootwalker
+
+    class StackletRootWalker(object):
+        _alloc_flavor_ = "raw"
+
+        enumerating = False
+
+        def setup(self, obj):
+            # initialization: read the SUSPSTACK object
+            p = llmemory.cast_adr_to_ptr(obj, lltype.Ptr(SUSPSTACK))
+            if not p.handle:
+                return False
+            self.context = llmemory.cast_ptr_to_adr(p.handle)
+            anchor = p.anchor
+            del p
+            self.curframe = lltype.malloc(WALKFRAME, flavor='raw')
+            self.otherframe = lltype.malloc(WALKFRAME, flavor='raw')
+            self.fill_initial_frame(self.curframe, anchor)
+            return True
+
+        def fill_initial_frame(self, curframe, initialframedata):
+            # Copy&paste :-(
+            initialframedata += 2*sizeofaddr
+            reg = 0
+            while reg < CALLEE_SAVED_REGS:
+                curframe.regs_stored_at[reg] = initialframedata+reg*sizeofaddr
+                reg += 1
+            retaddraddr = initialframedata + CALLEE_SAVED_REGS * sizeofaddr
+            retaddraddr = self.translateptr(retaddraddr)
+            curframe.frame_address = retaddraddr.address[0]
+
+        def teardown(self):
+            lltype.free(self.curframe, flavor='raw')
+            lltype.free(self.otherframe, flavor='raw')
+            self.context = llmemory.NULL
+            return llmemory.NULL
+
+        def next(self, obj, prev):
+            #
+            # Pointers to the stack can be "translated" or not:
+            #
+            #   * Non-translated pointers point to where the data would be
+            #     if the stack was installed and running.
+            #
+            #   * Translated pointers correspond to where the data
+            #     is now really in memory.
+            #
+            # Note that 'curframe' contains non-translated pointers, and
+            # of course the stack itself is full of non-translated pointers.
+            #
+            while True:
+                if not self.enumerating:
+                    if not prev:
+                        if not self.setup(obj):      # one-time initialization
+                            return llmemory.NULL
+                        prev = obj   # random value, but non-NULL
+                    callee = self.curframe
+                    retaddraddr = self.translateptr(callee.frame_address)
+                    retaddr = retaddraddr.address[0]
+                    basewalker.locate_caller_based_on_retaddr(retaddr)
+                    self.enumerating = True
+                #
+                # not really a loop, but kept this way for similarity
+                # with asmgcroot:
+                callee = self.curframe
+                ebp_in_caller = callee.regs_stored_at[INDEX_OF_EBP]
+                ebp_in_caller = self.translateptr(ebp_in_caller)
+                ebp_in_caller = ebp_in_caller.address[0]
+                while True:
+                    location = basewalker._shape_decompressor.next()
+                    if location == 0:
+                        break
+                    addr = basewalker.getlocation(callee, ebp_in_caller,
+                                                  location)
+                    # yield the translated addr of the next GCREF in the stack
+                    return self.translateptr(addr)
+                #
+                self.enumerating = False
+                caller = self.otherframe
+                reg = CALLEE_SAVED_REGS - 1
+                while reg >= 0:
+                    location = basewalker._shape_decompressor.next()
+                    addr = basewalker.getlocation(callee, ebp_in_caller,
+                                                  location)
+                    caller.regs_stored_at[reg] = addr   # non-translated
+                    reg -= 1
+
+                location = basewalker._shape_decompressor.next()
+                caller.frame_address = basewalker.getlocation(callee,
+                                                              ebp_in_caller,
+                                                              location)
+                # ^^^ non-translated
+                if caller.frame_address == llmemory.NULL:
+                    return self.teardown()    # completely done with this stack
+                #
+                self.otherframe = callee
+                self.curframe = caller
+                # loop back
+
+        def translateptr(self, addr):
+            return _c._translate_pointer(self.context, addr)
+
+    _stackletrootwalker = StackletRootWalker()
+    return _stackletrootwalker
+get_stackletrootwalker._annspecialcase_ = 'specialize:memo'
+
+
+def customtrace(obj, prev):
+    stackletrootwalker = get_stackletrootwalker()
+    return stackletrootwalker.next(obj, prev)
+
+
+SUSPSTACK = lltype.GcStruct('SuspStack',
+                            ('handle', _c.handle),
+                            ('anchor', llmemory.Address),
+                            rtti=True)
+NULL_SUSPSTACK = lltype.nullptr(SUSPSTACK)
+CUSTOMTRACEFUNC = lltype.FuncType([llmemory.Address, llmemory.Address],
+                                  llmemory.Address)
+customtraceptr = llhelper(lltype.Ptr(CUSTOMTRACEFUNC), customtrace)
+lltype.attachRuntimeTypeInfo(SUSPSTACK, customtraceptr=customtraceptr)
+
+ASM_FRAMEDATA_HEAD_PTR = lltype.Ptr(lltype.ForwardReference())
+ASM_FRAMEDATA_HEAD_PTR.TO.become(lltype.Struct('ASM_FRAMEDATA_HEAD',
+        ('prev', ASM_FRAMEDATA_HEAD_PTR),
+        ('next', ASM_FRAMEDATA_HEAD_PTR)
+    ))
+alternateanchor = lltype.malloc(ASM_FRAMEDATA_HEAD_PTR.TO,
+                                immortal=True)
+alternateanchor.prev = alternateanchor
+alternateanchor.next = alternateanchor
+
+FUNCNOARG_P = lltype.Ptr(lltype.FuncType([], _c.handle))
+pypy_asm_stackwalk2 = rffi.llexternal('pypy_asm_stackwalk',
+                                      [FUNCNOARG_P,
+                                       ASM_FRAMEDATA_HEAD_PTR],
+                                      _c.handle, sandboxsafe=True,
+                                      _nowrapper=True)
+
+
+def _new_callback():
+    # Here, we just closed the stack.  Get the stack anchor, store
+    # it in the gcrootfinder.suspstack.anchor, and create a new
+    # stacklet with stacklet_new().  If this call fails, then we
+    # are just returning NULL.
+    _stack_just_closed()
+    return _c.new(gcrootfinder.thrd, llhelper(_c.run_fn, _new_runfn),
+                  llmemory.NULL)
+
+def _stack_just_closed():
+    # Immediately unlink the new stackanchor from the doubly-linked
+    # chained list.  When returning from pypy_asm_stackwalk2, the
+    # assembler code will try to unlink it again, which should be
+    # a no-op given that the doubly-linked list is empty.
+    stackanchor = llmemory.cast_ptr_to_adr(alternateanchor.next)
+    gcrootfinder.suspstack.anchor = stackanchor
+    alternateanchor.prev = alternateanchor
+    alternateanchor.next = alternateanchor
+
+def _new_runfn(h, _):
+    # Here, we are in a fresh new stacklet.
+    llop.gc_stack_bottom(lltype.Void)   # marker for trackgcroot.py
+    #
+    # There is a fresh suspstack object waiting on the gcrootfinder,
+    # so populate it with data that represents the parent suspended
+    # stacklet and detach the suspstack object from gcrootfinder.
+    suspstack = gcrootfinder.attach_handle_on_suspstack(h)
+    #
+    # Call the main function provided by the (RPython) user.
+    suspstack = gcrootfinder.runfn(suspstack, gcrootfinder.arg)
+    #
+    # Here, suspstack points to the target stacklet to which we want
+    # to jump to next.  Read the 'handle' and forget about the
+    # suspstack object.
+    return _consume_suspstack(suspstack)
+
+def _consume_suspstack(suspstack):
+    h = suspstack.handle
+    ll_assert(bool(h), "_consume_suspstack: null handle")
+    suspstack.handle = _c.null_handle
+    return h
+
+def _switch_callback():
+    # Here, we just closed the stack.  Get the stack anchor, store
+    # it in the gcrootfinder.suspstack.anchor, and switch to this
+    # suspstack with stacklet_switch().  If this call fails, then we
+    # are just returning NULL.
+    oldanchor = gcrootfinder.suspstack.anchor
+    _stack_just_closed()
+    h = _consume_suspstack(gcrootfinder.suspstack)
+    #
+    # gcrootfinder.suspstack.anchor is left with the anchor of the
+    # previous place (i.e. before the call to switch()).
+    h2 = _c.switch(gcrootfinder.thrd, h)
+    #
+    if not h2:    # MemoryError: restore
+        gcrootfinder.suspstack.anchor = oldanchor
+        gcrootfinder.suspstack.handle = h
+    return h2
+
+
+class StackletGcRootFinder(object):
+    suspstack = NULL_SUSPSTACK
+
+    def new(self, thrd, callback, arg):
+        self.thrd = thrd._thrd
+        self.runfn = callback
+        self.arg = arg
+        # make a fresh new clean SUSPSTACK
+        newsuspstack = lltype.malloc(SUSPSTACK)
+        newsuspstack.handle = _c.null_handle
+        self.suspstack = newsuspstack
+        # Invoke '_new_callback' by closing the stack
+        h = pypy_asm_stackwalk2(llhelper(FUNCNOARG_P, _new_callback),
+                                alternateanchor)
+        return self.get_result_suspstack(h)
+
+    def switch(self, thrd, suspstack):
+        self.thrd = thrd._thrd
+        self.suspstack = suspstack
+        h = pypy_asm_stackwalk2(llhelper(FUNCNOARG_P, _switch_callback),
+                                alternateanchor)
+        return self.get_result_suspstack(h)
+
+    def attach_handle_on_suspstack(self, handle):
+        s = self.suspstack
+        self.suspstack = NULL_SUSPSTACK
+        ll_assert(bool(s.anchor), "s.anchor should not be null")
+        s.handle = handle
+        llop.gc_assume_young_pointers(lltype.Void, llmemory.cast_ptr_to_adr(s))
+        return s
+
+    def get_result_suspstack(self, h):
+        #
+        # Return from a new() or a switch(): 'h' is a handle, possibly
+        # an empty one, that says from where we switched to.
+        if not h:
+            raise MemoryError
+        elif _c.is_empty_handle(h):
+            return NULL_SUSPSTACK
+        else:
+            # This is a return that gave us a real handle.  Store it.
+            return self.attach_handle_on_suspstack(h)
+
+    def destroy(self, thrd, suspstack):
+        h = suspstack.handle
+        suspstack.handle = _c.null_handle
+        _c.destroy(thrd._thrd, h)
+
+    def is_empty_handle(self, suspstack):
+        return not suspstack
+
+    def get_null_handle(self):
+        return NULL_SUSPSTACK
+
+
+gcrootfinder = StackletGcRootFinder()
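
For illustration only (not part of this changeset): get_result_suspstack()
above, together with is_empty_handle() in _rffi_stacklet.py, encodes a
three-way convention for the handles returned by new()/switch().  A rough
plain-Python sketch of that decision, with hypothetical names:

    EMPTY_HANDLE = -1        # the value is_empty_handle() tests for

    def classify_handle(h):
        if not h:
            raise MemoryError    # null handle: the C call failed
        if h == EMPTY_HANDLE:
            return None          # empty handle: the other side has finished
        return h                 # a live handle, kept for a later switch
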
diff --git a/pypy/rlib/_stacklet_n_a.py b/pypy/rlib/_stacklet_n_a.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_n_a.py
@@ -0,0 +1,31 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rpython.annlowlevel import llhelper
+from pypy.tool.staticmethods import StaticMethods
+
+
+class StackletGcRootFinder:
+    __metaclass__ = StaticMethods
+
+    def new(thrd, callback, arg):
+        h = _c.new(thrd._thrd, llhelper(_c.run_fn, callback), arg)
+        if not h:
+            raise MemoryError
+        return h
+    new._annspecialcase_ = 'specialize:arg(1)'
+
+    def switch(thrd, h):
+        h = _c.switch(thrd._thrd, h)
+        if not h:
+            raise MemoryError
+        return h
+
+    def destroy(thrd, h):
+        _c.destroy(thrd._thrd, h)
+
+    is_empty_handle = _c.is_empty_handle
+
+    def get_null_handle():
+        return _c.null_handle
+
+
+gcrootfinder = StackletGcRootFinder    # class object
diff --git a/pypy/rlib/_stacklet_shadowstack.py b/pypy/rlib/_stacklet_shadowstack.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_shadowstack.py
@@ -0,0 +1,110 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rlib.debug import ll_assert
+from pypy.rpython.annlowlevel import llhelper
+from pypy.rpython.lltypesystem import lltype, llmemory
+from pypy.rpython.lltypesystem.lloperation import llop
+from pypy.tool.staticmethods import StaticMethods
+
+
+NULL_SUSPSTACK = lltype.nullptr(llmemory.GCREF.TO)
+
+
+def _new_callback(h, arg):
+    # We still have the old shadowstack active at this point; save it
+    # away, and start a fresh new one
+    oldsuspstack = gcrootfinder.oldsuspstack
+    h = llmemory.cast_ptr_to_adr(h)
+    llop.gc_save_current_state_away(lltype.Void,
+                                    oldsuspstack, h)
+    llop.gc_start_fresh_new_state(lltype.Void)
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    #
+    newsuspstack = gcrootfinder.callback(oldsuspstack, arg)
+    #
+    # Finishing this stacklet.
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    gcrootfinder.newsuspstack = newsuspstack
+    h = llop.gc_shadowstackref_context(llmemory.Address, newsuspstack)
+    return llmemory.cast_adr_to_ptr(h, _c.handle)
+
+def prepare_old_suspstack():
+    if not gcrootfinder.oldsuspstack:   # else reuse the one still there
+        _allocate_old_suspstack()
+
+def _allocate_old_suspstack():
+    suspstack = llop.gc_shadowstackref_new(llmemory.GCREF)
+    gcrootfinder.oldsuspstack = suspstack
+_allocate_old_suspstack._dont_inline_ = True
+
+def get_result_suspstack(h):
+    # Now we are in the target, after the switch() or the new().
+    # Note that this whole module was carefully written in such a way as
+    # not to invoke pushing/popping things off the shadowstack at
+    # unexpected moments...
+    oldsuspstack = gcrootfinder.oldsuspstack
+    newsuspstack = gcrootfinder.newsuspstack
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    gcrootfinder.newsuspstack = NULL_SUSPSTACK
+    if not h:
+        raise MemoryError
+    # We still have the old shadowstack active at this point; save it
+    # away, and restore the new one
+    if oldsuspstack:
+        ll_assert(not _c.is_empty_handle(h),"unexpected empty stacklet handle")
+        h = llmemory.cast_ptr_to_adr(h)
+        llop.gc_save_current_state_away(lltype.Void, oldsuspstack, h)
+    else:
+        ll_assert(_c.is_empty_handle(h),"unexpected non-empty stacklet handle")
+        llop.gc_forget_current_state(lltype.Void)
+    #
+    llop.gc_restore_state_from(lltype.Void, newsuspstack)
+    #
+    # From this point on, 'newsuspstack' is consumed and done, its
+    # shadow stack installed as the current one.  It should not be
+    # used any more.  For performance, we avoid it being deallocated
+    # by letting it be reused on the next switch.
+    gcrootfinder.oldsuspstack = newsuspstack
+    # Return.
+    return oldsuspstack
+
+
+class StackletGcRootFinder:
+    __metaclass__ = StaticMethods
+
+    def new(thrd, callback, arg):
+        gcrootfinder.callback = callback
+        thread_handle = thrd._thrd
+        prepare_old_suspstack()
+        h = _c.new(thread_handle, llhelper(_c.run_fn, _new_callback), arg)
+        return get_result_suspstack(h)
+    new._dont_inline_ = True
+
+    def switch(thrd, suspstack):
+        # suspstack has a handle to target, i.e. where to switch to
+        ll_assert(suspstack != gcrootfinder.oldsuspstack,
+                  "stacklet: invalid use")
+        gcrootfinder.newsuspstack = suspstack
+        thread_handle = thrd._thrd
+        h = llop.gc_shadowstackref_context(llmemory.Address, suspstack)
+        h = llmemory.cast_adr_to_ptr(h, _c.handle)
+        prepare_old_suspstack()
+        h = _c.switch(thread_handle, h)
+        return get_result_suspstack(h)
+    switch._dont_inline_ = True
+
+    def destroy(thrd, suspstack):
+        h = llop.gc_shadowstackref_context(llmemory.Address, suspstack)
+        h = llmemory.cast_adr_to_ptr(h, _c.handle)
+        llop.gc_shadowstackref_destroy(lltype.Void, suspstack)
+        _c.destroy(thrd._thrd, h)
+
+    def is_empty_handle(suspstack):
+        return not suspstack
+
+    def get_null_handle():
+        return NULL_SUSPSTACK
+
+
+gcrootfinder = StackletGcRootFinder()
+gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+gcrootfinder.newsuspstack = NULL_SUSPSTACK
diff --git a/pypy/rlib/debug.py b/pypy/rlib/debug.py
--- a/pypy/rlib/debug.py
+++ b/pypy/rlib/debug.py
@@ -26,6 +26,7 @@
         llop.debug_print_traceback(lltype.Void)
     llop.debug_fatalerror(lltype.Void, msg)
 fatalerror._dont_inline_ = True
+fatalerror._annspecialcase_ = 'specialize:arg(1)'
 
 
 class DebugLog(list):
diff --git a/pypy/rlib/nonconst.py b/pypy/rlib/nonconst.py
--- a/pypy/rlib/nonconst.py
+++ b/pypy/rlib/nonconst.py
@@ -24,6 +24,12 @@
     def __add__(self, other):
         return self.__dict__['constant'] + other
 
+    def __radd__(self, other):
+        return other + self.__dict__['constant']
+
+    def __mul__(self, other):
+        return self.__dict__['constant'] * other
+
 class EntryNonConstant(ExtRegistryEntry):
     _about_ = NonConstant
 
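
For illustration only (not part of this changeset): like the existing
__add__, the new methods simply forward the operation to the value stored
in self.__dict__['constant'].  A tiny stand-alone sketch of the pattern,
using a hypothetical wrapper class:

    class Wrapper(object):
        def __init__(self, constant):
            self.__dict__['constant'] = constant

        def __add__(self, other):
            return self.__dict__['constant'] + other

        def __radd__(self, other):
            # needed when the wrapper sits on the right-hand side
            return other + self.__dict__['constant']

        def __mul__(self, other):
            return self.__dict__['constant'] * other

    assert Wrapper(4) + 3 == 7
    assert 3 + Wrapper(4) == 7    # goes through __radd__
    assert Wrapper(4) * 2 == 8
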
diff --git a/pypy/rlib/parsing/makepackrat.py b/pypy/rlib/parsing/makepackrat.py
--- a/pypy/rlib/parsing/makepackrat.py
+++ b/pypy/rlib/parsing/makepackrat.py
@@ -251,9 +251,11 @@
         return "ErrorInformation(%s, %s)" % (self.pos, self.expected)
 
     def get_line_column(self, source):
-        uptoerror = source[:self.pos]
+        pos = self.pos
+        assert pos >= 0
+        uptoerror = source[:pos]
         lineno = uptoerror.count("\n")
-        columnno = self.pos - uptoerror.rfind("\n")
+        columnno = pos - uptoerror.rfind("\n")
         return lineno, columnno
 
     def nice_error_message(self, filename='<filename>', source=""):
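
For illustration only (not part of this changeset), the arithmetic in the
fixed get_line_column() on made-up input:

    source = "ab\ncd\nef"
    pos = 7                                  # points at the final "f"
    uptoerror = source[:pos]                 # "ab\ncd\ne"
    lineno = uptoerror.count("\n")           # 2 (third line, counting from 0)
    columnno = pos - uptoerror.rfind("\n")   # 7 - 5 == 2
    assert (lineno, columnno) == (2, 2)
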
diff --git a/pypy/rlib/rcoroutine.py b/pypy/rlib/rcoroutine.py
--- a/pypy/rlib/rcoroutine.py
+++ b/pypy/rlib/rcoroutine.py
@@ -29,6 +29,11 @@
 The type of a switch is determined by the target's costate.
 """
 
+import py; py.test.skip("fixme: rewrite using rlib.rstacklet")
+# XXX ^^^ the reason it is not done is that pypy.rlib.rcoroutine
+# plus pypy/module/_stackless look like faaaaaar too much code
+# to me :-(
+
 from pypy.rlib.rstack import yield_current_frame_to_caller
 from pypy.rlib.objectmodel import we_are_translated
 
diff --git a/pypy/rlib/rerased.py b/pypy/rlib/rerased.py
--- a/pypy/rlib/rerased.py
+++ b/pypy/rlib/rerased.py
@@ -117,6 +117,10 @@
 
     return erase, unerase
 
+def new_static_erasing_pair(name):
+    erase, unerase = new_erasing_pair(name)
+    return staticmethod(erase), staticmethod(unerase)
+
 
 # ---------- implementation-specific ----------
 
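
For illustration only (not part of this changeset): new_static_erasing_pair()
packages the pattern used by ModuleDictStrategy earlier in this diff --
create an erasing pair, then wrap both halves in staticmethod so they can
live on a class -- into a single call.  A usage sketch with a hypothetical
strategy class:

    from pypy.rlib import rerased

    class SomeStrategy(object):
        # equivalent to:
        #     erase, unerase = rerased.new_erasing_pair("demo")
        #     erase = staticmethod(erase)
        #     unerase = staticmethod(unerase)
        erase, unerase = rerased.new_static_erasing_pair("demo")

        def get_empty_storage(self):
            return self.erase({})
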
diff --git a/pypy/rlib/rgc.py b/pypy/rlib/rgc.py
--- a/pypy/rlib/rgc.py
+++ b/pypy/rlib/rgc.py
@@ -15,132 +15,8 @@
     pass
 
 # ____________________________________________________________
-# Framework GC features
-
-class GcPool(object):
-    pass
-
-def gc_swap_pool(newpool):
-    """Set newpool as the current pool (create one if newpool is None).
-    All malloc'ed objects are put into the current pool;this is a
-    way to separate objects depending on when they were allocated.
-    """
-    raise NotImplementedError("only works in stacklessgc translated versions")
-
-def gc_clone(gcobject, pool):
-    """Recursively clone the gcobject and everything it points to,
-    directly or indirectly -- but stops at objects that are not
-    in the specified pool.  Pool can be None to mean the current one.
-    A new pool is built to contain the copies.  Return (newobject, newpool).
-    """
-    raise NotImplementedError("only works in stacklessgc translated versions")
-
-# ____________________________________________________________
 # Annotation and specialization
 
-class GcPoolEntry(ExtRegistryEntry):
-    "Link GcPool to its Repr."
-    _type_ = GcPool
-
-    def get_repr(self, rtyper, s_pool):
-        config = rtyper.getconfig()
-        # if the gc policy doesn't support allocation pools, lltype
-        # pools as Void.
-        if config.translation.gc != 'marksweep':
-            from pypy.annotation.model import s_None
-            return rtyper.getrepr(s_None)
-        else:
-            from pypy.rpython.rmodel import SimplePointerRepr
-            from pypy.rpython.memory.gc.marksweep import X_POOL_PTR
-            return SimplePointerRepr(X_POOL_PTR)
-
-
-class SwapPoolFnEntry(ExtRegistryEntry):
-    "Annotation and specialization of gc_swap_pool()."
-    _about_ = gc_swap_pool
-
-    def compute_result_annotation(self, s_newpool):
-        from pypy.annotation import model as annmodel
-        return annmodel.SomeExternalObject(GcPool)
-
-    def specialize_call(self, hop):
-        from pypy.annotation import model as annmodel
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-
-        opname = 'gc_x_swap_pool'
-        config = hop.rtyper.getconfig()
-        if config.translation.gc != 'marksweep':
-            # when the gc policy doesn't support pools, just return
-            # the argument (which is lltyped as Void anyway)
-            opname = 'same_as'
-            
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-        vlist = hop.inputargs(r_pool_ptr)
-        return hop.genop(opname, vlist, resulttype = r_pool_ptr)
-
-def _raise():
-    raise RuntimeError
-
-class CloneFnEntry(ExtRegistryEntry):
-    "Annotation and specialization of gc_clone()."
-    _about_ = gc_clone
-
-    def compute_result_annotation(self, s_gcobject, s_pool):
-        from pypy.annotation import model as annmodel
-        return annmodel.SomeTuple([s_gcobject,
-                                   annmodel.SomeExternalObject(GcPool)])
-
-    def specialize_call(self, hop):
-        from pypy.rpython.error import TyperError
-        from pypy.rpython.lltypesystem import rtuple
-        from pypy.annotation import model as annmodel
-        from pypy.rpython.memory.gc.marksweep import X_CLONE, X_CLONE_PTR
-
-        config = hop.rtyper.getconfig()
-        if config.translation.gc != 'marksweep':
-            # if the gc policy does not support allocation pools,
-            # gc_clone always raises RuntimeError
-            hop.exception_is_here()
-            hop.gendirectcall(_raise)
-            s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-            r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-            r_tuple = hop.r_result
-            v_gcobject, v_pool = hop.inputargs(hop.args_r[0], r_pool_ptr)
-            return rtuple.newtuple(hop.llops, r_tuple, [v_gcobject, v_pool])
-
-        r_gcobject = hop.args_r[0]
-        if (not isinstance(r_gcobject.lowleveltype, lltype.Ptr) or
-            r_gcobject.lowleveltype.TO._gckind != 'gc'):
-            raise TyperError("gc_clone() can only clone a dynamically "
-                             "allocated object;\ngot %r" % (r_gcobject,))
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-        r_tuple = hop.r_result
-
-        c_CLONE       = hop.inputconst(lltype.Void, X_CLONE)
-        c_flags       = hop.inputconst(lltype.Void, {'flavor': 'gc'})
-        c_gcobjectptr = hop.inputconst(lltype.Void, "gcobjectptr")
-        c_pool        = hop.inputconst(lltype.Void, "pool")
-
-        v_gcobject, v_pool = hop.inputargs(hop.args_r[0], r_pool_ptr)
-        v_gcobjectptr = hop.genop('cast_opaque_ptr', [v_gcobject],
-                                  resulttype = llmemory.GCREF)
-        v_clonedata = hop.genop('malloc', [c_CLONE, c_flags],
-                                resulttype = X_CLONE_PTR)
-        hop.genop('setfield', [v_clonedata, c_gcobjectptr, v_gcobjectptr])
-        hop.genop('setfield', [v_clonedata, c_pool, v_pool])
-        hop.exception_is_here()
-        hop.genop('gc_x_clone', [v_clonedata])
-        v_gcobjectptr = hop.genop('getfield', [v_clonedata, c_gcobjectptr],
-                                  resulttype = llmemory.GCREF)
-        v_pool        = hop.genop('getfield', [v_clonedata, c_pool],
-                                  resulttype = r_pool_ptr)
-        v_gcobject = hop.genop('cast_opaque_ptr', [v_gcobjectptr],
-                               resulttype = r_tuple.items_r[0])
-        return rtuple.newtuple(hop.llops, r_tuple, [v_gcobject, v_pool])
-
 # Support for collection.
 
 class CollectEntry(ExtRegistryEntry):
diff --git a/pypy/rlib/rstack.py b/pypy/rlib/rstack.py
--- a/pypy/rlib/rstack.py
+++ b/pypy/rlib/rstack.py
@@ -14,25 +14,6 @@
 from pypy.rpython.controllerentry import Controller, SomeControlledInstance
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
 
-def stack_unwind():
-    if we_are_translated():
-        return llop.stack_unwind(lltype.Void)
-    raise RuntimeError("cannot unwind stack in non-translated versions")
-
-
-def stack_capture():
-    if we_are_translated():
-        ptr = llop.stack_capture(OPAQUE_STATE_HEADER_PTR)
-        return frame_stack_top_controller.box(ptr)
-    raise RuntimeError("cannot unwind stack in non-translated versions")
-
-
-def stack_frames_depth():
-    if we_are_translated():
-        return llop.stack_frames_depth(lltype.Signed)
-    else:
-        return len(inspect.stack())
-
 # ____________________________________________________________
 
 compilation_info = ExternalCompilationInfo(includes=['src/stack.h'])
@@ -88,78 +69,6 @@
 @rgc.no_collect
 def stack_check_slowpath(current):
     if ord(_stack_too_big_slowpath(current)):
-        # Now we are sure that the stack is really too big.  Note that the
-        # stack_unwind implementation is different depending on if stackless
-        # is enabled. If it is it unwinds the stack, otherwise it simply
-        # raises a RuntimeError.
-        stack_unwind()
+        from pypy.rlib.rstackovf import _StackOverflow
+        raise _StackOverflow
 stack_check_slowpath._dont_inline_ = True
-
-# ____________________________________________________________
-
-def yield_current_frame_to_caller():
-    raise NotImplementedError("only works in translated versions")
-
-
-class frame_stack_top(object):
-    def switch(self):
-        raise NotImplementedError("only works in translated versions")
-
-
-class BoundSwitchOfFrameStackTop(object): pass
-class BoundSwitchOfFrameStackTopController(Controller):
-    knowntype = BoundSwitchOfFrameStackTop
-    def call(self, real_object):
-        from pypy.rpython.lltypesystem.lloperation import llop
-        ptr = llop.stack_switch(OPAQUE_STATE_HEADER_PTR, real_object)
-        return frame_stack_top_controller.box(ptr)
-
-
-class FrameStackTopController(Controller):
-    knowntype = frame_stack_top
-    can_be_None = True
-
-    def is_true(self, real_object):
-        return bool(real_object)
-
-    def get_switch(self, real_object):
-        return bound_switch_of_frame_stack_top_controller.box(real_object)
-
-    def convert(self, obj):
-        assert obj is None
-        return lltype.nullptr(OPAQUE_STATE_HEADER_PTR.TO)
-
-frame_stack_top_controller = FrameStackTopController()
-bound_switch_of_frame_stack_top_controller = BoundSwitchOfFrameStackTopController()
-OPAQUE_STATE_HEADER = lltype.GcOpaqueType("OPAQUE_STATE_HEADER", hints={"render_structure": True})
-OPAQUE_STATE_HEADER_PTR = lltype.Ptr(OPAQUE_STATE_HEADER)
-
-
-
-class FrameStackTopReturningFnEntry(ExtRegistryEntry):
-    def compute_result_annotation(self):
-        from pypy.annotation import model as annmodel
-        return SomeControlledInstance(annmodel.lltype_to_annotation(OPAQUE_STATE_HEADER_PTR), frame_stack_top_controller)
-
-
-class YieldCurrentFrameToCallerFnEntry(FrameStackTopReturningFnEntry):
-    _about_ = yield_current_frame_to_caller
-
-    def specialize_call(self, hop):
-        var = hop.genop("yield_current_frame_to_caller", [], hop.r_result.lowleveltype)
-        return var
-
-
-# ____________________________________________________________
-
-def get_stack_depth_limit():
-    if we_are_translated():
-        from pypy.rpython.lltypesystem.lloperation import llop
-        return llop.get_stack_depth_limit(lltype.Signed)
-    raise RuntimeError("no stack depth limit in non-translated versions")
-
-def set_stack_depth_limit(limit):
-    if we_are_translated():
-        from pypy.rpython.lltypesystem.lloperation import llop
-        return llop.set_stack_depth_limit(lltype.Void, limit)
-    raise RuntimeError("no stack depth limit in non-translated versions")
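
The rstack.py hunk above drops the stackless-era unwinding helpers: when the C stack really is too big, stack_check_slowpath() now raises the dedicated exception from pypy.rlib.rstackovf instead of calling stack_unwind(). A minimal self-contained sketch of that pattern follows; the names StackOverflowGuard, MAX_DEPTH, stack_check and guarded_recurse are invented for the illustration and are not PyPy APIs (the real check measures actual C stack usage, not a Python-level depth counter):

    class StackOverflowGuard(Exception):
        # stands in for pypy.rlib.rstackovf._StackOverflow in this sketch
        pass

    MAX_DEPTH = 1000   # hypothetical limit for the example

    def stack_check(depth):
        if depth > MAX_DEPTH:
            # slow path: instead of unwinding, raise and let a caller
            # near the top of the recursion handle it
            raise StackOverflowGuard

    def guarded_recurse(n, depth=0):
        stack_check(depth)
        if n == 0:
            return 0
        return 1 + guarded_recurse(n - 1, depth + 1)

    try:
        guarded_recurse(5000)
    except StackOverflowGuard:
        print 'recursion too deep, reported as an exception'
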
diff --git a/pypy/rlib/rstacklet.py b/pypy/rlib/rstacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/rstacklet.py
@@ -0,0 +1,58 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rpython.lltypesystem import lltype, llmemory
+
+
+class StackletThread(object):
+
+    def __init__(self, config):
+        self._gcrootfinder = _getgcrootfinder(config)
+        self._thrd = _c.newthread()
+        if not self._thrd:
+            raise MemoryError
+        self._thrd_deleter = StackletThreadDeleter(self._thrd)
+
+    def new(self, callback, arg=llmemory.NULL):
+        return self._gcrootfinder.new(self, callback, arg)
+    new._annspecialcase_ = 'specialize:arg(1)'
+
+    def switch(self, stacklet):
+        return self._gcrootfinder.switch(self, stacklet)
+
+    def destroy(self, stacklet):
+        self._gcrootfinder.destroy(self, stacklet)
+
+    def is_empty_handle(self, stacklet):
+        # note that "being an empty handle" and being equal to
+        # "get_null_handle()" may be the same, or not; don't rely on it
+        return self._gcrootfinder.is_empty_handle(stacklet)
+
+    def get_null_handle(self):
+        return self._gcrootfinder.get_null_handle()
+
+
+class StackletThreadDeleter(object):
+    # quick hack: the __del__ is on another object, so that
+    # if the main StackletThread ends up in random circular
+    # references, on pypy deletethread() is only called
+    # when all that circular reference mess is gone.
+    def __init__(self, thrd):
+        self._thrd = thrd
+    def __del__(self):
+        thrd = self._thrd
+        if thrd:
+            self._thrd = lltype.nullptr(_c.thread_handle.TO)
+            _c.deletethread(thrd)
+
+# ____________________________________________________________
+
+def _getgcrootfinder(config):
+    if (config is None or
+        config.translation.gc in ('ref', 'boehm', 'none')):   # for tests
+        gcrootfinder = 'n/a'
+    else:
+        gcrootfinder = config.translation.gcrootfinder
+    gcrootfinder = gcrootfinder.replace('/', '_')
+    module = __import__('pypy.rlib._stacklet_%s' % gcrootfinder,
+                        None, None, ['__doc__'])
+    return module.gcrootfinder
+_getgcrootfinder._annspecialcase_ = 'specialize:memo'
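
For orientation, here is the call pattern of the new StackletThread API, modelled on pypy/rlib/test/test_rstacklet.py further down in this changeset. This is only a sketch: it assumes a checkout where _rffi_stacklet compiles and uses config=None, which the tests treat as the 'n/a' root finder; the callback name and the argument 42 are made up for the example.

    from pypy.rpython.lltypesystem import lltype, llmemory, rffi
    from pypy.rlib import rstacklet

    sthread = rstacklet.StackletThread(None)   # None config: test-only setup

    def switch_back_once(h, arg):
        # runs on the new stacklet; h is a handle back to the creator,
        # arg is the raw address that was passed to new()
        print 'in stacklet, arg =', rffi.cast(lltype.Signed, arg)
        h = sthread.switch(h)     # go back to the creator, resume here later
        return h                  # returning the handle ends this stacklet

    h = sthread.new(switch_back_once, rffi.cast(llmemory.Address, 42))
    assert not sthread.is_empty_handle(h)      # callback is suspended
    h = sthread.switch(h)                      # let the callback finish
    assert sthread.is_empty_handle(h)          # nothing left to switch to
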
diff --git a/pypy/rlib/streamio.py b/pypy/rlib/streamio.py
--- a/pypy/rlib/streamio.py
+++ b/pypy/rlib/streamio.py
@@ -496,29 +496,24 @@
         if bufsize == -1:     # Get default from the class
             bufsize = self.bufsize
         self.bufsize = bufsize  # buffer size (hint only)
-        self.lines = []         # ready-made lines (sans "\n")
-        self.buf = ""           # raw data (may contain "\n")
-        # Invariant: readahead == "\n".join(self.lines + [self.buf])
-        # self.lines contains no "\n"
-        # self.buf may contain "\n"
+        self.buf = ""           # raw data
+        self.pos = 0
 
     def flush_buffers(self):
-        if self.lines or self.buf:
+        if self.buf:
             try:
                 self.do_seek(self.tell(), 0)
             except MyNotImplementedError:
                 pass
             else:
-                self.lines = []
                 self.buf = ""
+                self.pos = 0
 
     def tell(self):
-        bytes = self.do_tell()  # This may fail
-        offset = len(self.buf)
-        for line in self.lines:
-            offset += len(line) + 1
-        assert bytes >= offset #, (locals(), self.__dict__)
-        return bytes - offset
+        tellpos = self.do_tell()  # This may fail
+        offset = len(self.buf) - self.pos
+        assert tellpos >= offset #, (locals(), self.__dict__)
+        return tellpos - offset
 
     def seek(self, offset, whence):
         # This may fail on the do_seek() or do_tell() call.
@@ -526,32 +521,25 @@
         # Nor on a seek to the very end.
         if whence == 0:
             self.do_seek(offset, 0)
-            self.lines = []
             self.buf = ""
+            self.pos = 0
             return
         if whence == 1:
+            currentsize = len(self.buf) - self.pos
             if offset < 0:
-                self.do_seek(self.tell() + offset, 0)
-                self.lines = []
-                self.buf = ""
+                if self.pos + offset >= 0:
+                    self.pos += offset
+                else:
+                    self.do_seek(self.tell() + offset, 0)
+                    self.pos = 0
+                    self.buf = ""
                 return
-            while self.lines:
-                line = self.lines[-1]
-                if offset <= len(line):
-                    intoffset = intmask(offset)
-                    assert intoffset >= 0
-                    self.lines[-1] = line[intoffset:]
-                    return
-                offset -= len(self.lines[-1]) - 1
-                self.lines.pop()
-            assert not self.lines
-            if offset <= len(self.buf):
-                intoffset = intmask(offset)
-                assert intoffset >= 0
-                self.buf = self.buf[intoffset:]
+            elif offset <= currentsize:
+                self.pos += offset
                 return
-            offset -= len(self.buf)
             self.buf = ""
+            self.pos = 0
+            offset -= currentsize
             try:
                 self.do_seek(offset, 1)
             except MyNotImplementedError:
@@ -564,18 +552,18 @@
             except MyNotImplementedError:
                 pass
             else:
-                self.lines = []
+                self.pos = 0
                 self.buf = ""
                 return
             # Skip relative to EOF by reading and saving only just as
             # much as needed
             intoffset = offset2int(offset)
-            self.lines.reverse()
-            data = "\n".join(self.lines + [self.buf])
-            total = len(data)
-            buffers = [data]
-            self.lines = []
+            pos = self.pos
+            assert pos >= 0
+            buffers = [self.buf[pos:]]
+            total = len(buffers[0])
             self.buf = ""
+            self.pos = 0
             while 1:
                 data = self.do_read(self.bufsize)
                 if not data:
@@ -589,157 +577,101 @@
             if cutoff < 0:
                 raise StreamError("cannot seek back")
             if buffers:
+                assert cutoff >= 0
                 buffers[0] = buffers[0][cutoff:]
             self.buf = "".join(buffers)
-            self.lines = []
             return
+
         raise StreamError("whence should be 0, 1 or 2")
 
     def readall(self):
-        self.lines.reverse()
-        self.lines.append(self.buf)
-        more = ["\n".join(self.lines)]
-        self.lines = []
+        pos = self.pos
+        assert pos >= 0
+        chunks = [self.buf[pos:]]
         self.buf = ""
+        self.pos = 0
         bufsize = self.bufsize
         while 1:
             data = self.do_read(bufsize)
             if not data:
                 break
-            more.append(data)
+            chunks.append(data)
             bufsize = min(bufsize*2, self.bigsize)
-        return "".join(more)
+        return "".join(chunks)
 
-    def read(self, n):
+    def read(self, n=-1):
         assert isinstance(n, int)
-        assert n >= 0
-        if self.lines:
-            # See if this can be satisfied from self.lines[0]
-            line = self.lines[-1]
-            if len(line) >= n:
-                self.lines[-1] = line[n:]
-                return line[:n]
-
-            # See if this can be satisfied *without exhausting* self.lines
-            k = 0
-            i = 0
-            lgt = len(self.lines)
-            for linenum in range(lgt-1,-1,-1):
-                line = self.lines[linenum]
-                k += len(line)
-                if k >= n:
-                    lines = self.lines[linenum + 1:]
-                    data = self.lines[linenum]
-                    cutoff = len(data) - (k-n)
-                    assert cutoff >= 0
-                    lines.reverse()
-                    lines.append(data[:cutoff])
-                    del self.lines[linenum:]
-                    self.lines.append(data[cutoff:])
-                    return "\n".join(lines)
-                k += 1
-
-            # See if this can be satisfied from self.lines plus self.buf
-            if k + len(self.buf) >= n:
-                lines = self.lines
-                lines.reverse()
-                self.lines = []
-                cutoff = n - k
-                assert cutoff >= 0
-                lines.append(self.buf[:cutoff])
-                self.buf = self.buf[cutoff:]
-                return "\n".join(lines)
-
+        if n < 0:
+            return self.readall()
+        currentsize = len(self.buf) - self.pos
+        start = self.pos
+        assert start >= 0
+        if n <= currentsize:
+            stop = start + n
+            assert stop >= 0
+            result = self.buf[start:stop]
+            self.pos += n
+            return result
         else:
-            # See if this can be satisfied from self.buf
-            data = self.buf
-            k = len(data)
-            if k >= n:
-                cutoff = len(data) - (k-n)
-                assert cutoff >= 0
-                assert len(data) >= cutoff
-                self.buf = data[cutoff:]
-                return data[:cutoff]
-
-        lines = self.lines
-        lines.reverse()
-        self.lines = []
-        lines.append(self.buf)
-        self.buf = ""
-        data = "\n".join(lines)
-        more = [data]
-        k = len(data)
-        while k < n:
-            data = self.do_read(max(self.bufsize, n-k))
-            k += len(data)
-            more.append(data)
-            if not data:
-                break
-        cutoff = len(data) - (k-n)
-        assert cutoff >= 0
-        if len(data) <= cutoff:
-            self.buf = ""
-        else:
-            self.buf = data[cutoff:]
-            more[-1] = data[:cutoff]
-        return "".join(more)
-
-    # read_next_bunch is generally this, version below is slightly faster
-    #def _read_next_bunch(self):
-    #    self.lines = self.buf.split("\n")
-    #    self.buf = self.lines.pop()
-    #    self.lines.reverse()
-
-    def _read_next_bunch(self):
-        numlines = self.buf.count("\n")
-        self.lines = [None] * numlines
-        last = -1
-        num = numlines - 1
-        while True:
-            start = last + 1
-            assert start >= 0
-            next = self.buf.find("\n", start)
-            if next == -1:
-                if last != -1:
-                    self.buf = self.buf[start:]
-                break
-            assert next >= 0
-            self.lines[num] = self.buf[start:next]
-            last = next
-            num -= 1
+            chunks = [self.buf[start:]]
+            while 1:
+                self.buf = self.do_read(self.bufsize)
+                if not self.buf:
+                    self.pos = 0
+                    break
+                currentsize += len(self.buf)
+                if currentsize >= n:
+                    self.pos = len(self.buf) - (currentsize - n)
+                    stop = self.pos
+                    assert stop >= 0
+                    chunks.append(self.buf[:stop])
+                    break
+                chunks.append(self.buf)
+            return ''.join(chunks)
 
     def readline(self):
-        if self.lines:
-            return self.lines.pop() + "\n"
-
-        # This block is needed because read() can leave self.buf
-        # containing newlines
-        self._read_next_bunch()
-        if self.lines:
-            return self.lines.pop() + "\n"
-
-        if self.buf:
-            buf = [self.buf]
-        else:
-            buf = []
+        pos = self.pos
+        assert pos >= 0
+        i = self.buf.find("\n", pos)
+        start = self.pos
+        assert start >= 0
+        if i >= 0: # new line found
+            i += 1
+            result = self.buf[start:i]
+            self.pos = i
+            return result
+        temp = self.buf[start:]
+        # read one more buffer; most of the time a newline will be found there
+        self.buf = self.do_read(self.bufsize)
+        i = self.buf.find("\n")
+        if i >= 0: # new line found
+            i += 1
+            result = temp + self.buf[:i]
+            self.pos = i
+            return result
+        if not self.buf:
+            self.pos = 0
+            return temp
+        # need to keep getting data until we find a new line
+        chunks = [temp, self.buf]
         while 1:
             self.buf = self.do_read(self.bufsize)
-            self._read_next_bunch()
-            if self.lines:
-                buf.append(self.lines.pop())
-                buf.append("\n")
+            if not self.buf:
+                self.pos = 0
                 break
-            if not self.buf:
+            i = self.buf.find("\n")
+            if i >= 0:
+                i += 1
+                chunks.append(self.buf[:i])
+                self.pos = i
                 break
-            buf.append(self.buf)
-
-        return "".join(buf)
+            chunks.append(self.buf)
+        return "".join(chunks)
 
     def peek(self):
-        if self.lines:
-            return self.lines[-1] + "\n"
-        else:
-            return self.buf
+        pos = self.pos
+        assert pos >= 0
+        return self.buf[pos:]
 
     write      = PassThrough("write",     flush_buffers=True)
     truncate   = PassThrough("truncate",  flush_buffers=True)
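
The streamio rewrite above replaces the old list-of-lines buffering with a single raw string plus a read position: everything at or beyond self.pos in self.buf is still unread, and a read either slices the buffer or refills it. The toy class below is not streamio itself, only a self-contained model of that invariant; ToyBufferingReader and its do_read callable are invented for the example.

    class ToyBufferingReader(object):
        # minimal model of the new scheme: self.buf holds raw data and
        # self.pos marks how much of it has already been consumed
        def __init__(self, do_read, bufsize=8):
            self.do_read = do_read       # callable returning the next raw chunk
            self.bufsize = bufsize
            self.buf = ""
            self.pos = 0

        def read(self, n):
            currentsize = len(self.buf) - self.pos
            if n <= currentsize:                 # satisfied from the buffer
                start = self.pos
                self.pos += n
                return self.buf[start:self.pos]
            chunks = [self.buf[self.pos:]]       # take what is buffered ...
            need = n - currentsize
            self.buf = ""
            self.pos = 0
            while need > 0:                      # ... and refill until done
                data = self.do_read(self.bufsize)
                if not data:
                    break
                if len(data) > need:
                    self.buf = data              # keep the unread tail buffered
                    self.pos = need
                    chunks.append(data[:need])
                    need = 0
                else:
                    chunks.append(data)
                    need -= len(data)
            return "".join(chunks)

    # tiny usage example over an in-memory source
    source = ["abcdefgh", "ijklmnop", ""]
    r = ToyBufferingReader(lambda size: source.pop(0))
    assert r.read(3) == "abc"
    assert r.read(10) == "defghijklm"
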
diff --git a/pypy/rlib/test/test_rstacklet.py b/pypy/rlib/test/test_rstacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/test/test_rstacklet.py
@@ -0,0 +1,272 @@
+import gc
+import py
+from pypy.rpython.tool.rffi_platform import CompilationError
+try:
+    from pypy.rlib import rstacklet
+except CompilationError, e:
+    py.test.skip("cannot import rstacklet: %s" % e)
+
+from pypy.rlib import rrandom
+from pypy.rlib.rarithmetic import intmask
+from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.translator.c.test.test_standalone import StandaloneTests
+
+
+
+class Runner:
+    STATUSMAX = 5000
+    config = None
+
+    def init(self, seed):
+        self.sthread = rstacklet.StackletThread(self.config)
+        self.random = rrandom.Random(seed)
+
+    def done(self):
+        self.sthread = None
+        gc.collect(); gc.collect(); gc.collect()
+
+    TESTS = []
+    def here_is_a_test(fn, TESTS=TESTS):
+        TESTS.append((fn.__name__, fn))
+        return fn
+
+    @here_is_a_test
+    def test_new(self):
+        print 'start'
+        h = self.sthread.new(empty_callback, rffi.cast(llmemory.Address, 123))
+        print 'end', h
+        assert self.sthread.is_empty_handle(h)
+
+    def nextstatus(self, nextvalue):
+        print 'expected nextvalue to be %d, got %d' % (nextvalue,
+                                                       self.status + 1)
+        assert self.status + 1 == nextvalue
+        self.status = nextvalue
+
+    @here_is_a_test
+    def test_simple_switch(self):
+        self.status = 0
+        h = self.sthread.new(switchbackonce_callback,
+                             rffi.cast(llmemory.Address, 321))
+        assert not self.sthread.is_empty_handle(h)
+        self.nextstatus(2)
+        h = self.sthread.switch(h)
+        self.nextstatus(4)
+        print 'end', h
+        assert self.sthread.is_empty_handle(h)
+
+    @here_is_a_test
+    def test_various_depths(self):
+        self.tasks = [Task(i) for i in range(10)]
+        self.nextstep = -1
+        self.comefrom = -1
+        self.status = 0
+        while self.status < self.STATUSMAX or self.any_alive():
+            self.tasks[0].withdepth(self.random.genrand32() % 50)
+            assert len(self.tasks[0].lst) == 0
+
+    def any_alive(self):
+        for task in self.tasks:
+            if task.h:
+                return True
+        return False
+
+
+class FooObj:
+    def __init__(self, n, d, next=None):
+        self.n = n
+        self.d = d
+        self.next = next
+
+
+class Task:
+    def __init__(self, n):
+        self.n = n
+        self.h = runner.sthread.get_null_handle()
+        self.lst = []
+
+    def withdepth(self, d):
+        if d > 0:
+            foo = FooObj(self.n, d)
+            foo2 = FooObj(self.n + 100, d, foo)
+            self.lst.append(foo)
+            res = self.withdepth(d-1)
+            foo = self.lst.pop()
+            assert foo2.n == self.n + 100
+            assert foo2.d == d
+            assert foo2.next is foo
+            assert foo.n == self.n
+            assert foo.d == d
+            assert foo.next is None
+        else:
+            res = 0
+            n = intmask(runner.random.genrand32() % 10)
+            if n == self.n or (runner.status >= runner.STATUSMAX and
+                               not runner.tasks[n].h):
+                return 1
+
+            print "status == %d, self.n = %d" % (runner.status, self.n)
+            assert not self.h
+            assert runner.nextstep == -1
+            runner.status += 1
+            runner.nextstep = runner.status
+            runner.comefrom = self.n
+            runner.gointo = n
+            task = runner.tasks[n]
+            if not task.h:
+                # start a new stacklet
+                print "NEW", n
+                h = runner.sthread.new(variousstackdepths_callback,
+                                       rffi.cast(llmemory.Address, n))
+            else:
+                # switch to this stacklet
+                print "switch to", n
+                h = task.h
+                task.h = runner.sthread.get_null_handle()
+                h = runner.sthread.switch(h)
+
+            print "back in self.n = %d, coming from %d" % (self.n,
+                                                           runner.comefrom)
+            assert runner.nextstep == runner.status
+            runner.nextstep = -1
+            assert runner.gointo == self.n
+            assert runner.comefrom != self.n
+            assert not self.h
+            if runner.comefrom != -42:
+                assert 0 <= runner.comefrom < 10
+                task = runner.tasks[runner.comefrom]
+                assert not task.h
+                task.h = h
+            else:
+                assert runner.sthread.is_empty_handle(h)
+            runner.comefrom = -1
+            runner.gointo = -1
+        assert (res & (res-1)) == 0   # to prevent a tail-call to withdepth()
+        return res
+
+
+runner = Runner()
+
+
+def empty_callback(h, arg):
+    print 'in empty_callback:', h, arg
+    assert rffi.cast(lltype.Signed, arg) == 123
+    return h
+
+def switchbackonce_callback(h, arg):
+    print 'in switchbackonce_callback:', h, arg
+    assert rffi.cast(lltype.Signed, arg) == 321
+    runner.nextstatus(1)
+    assert not runner.sthread.is_empty_handle(h)
+    h = runner.sthread.switch(h)
+    runner.nextstatus(3)
+    assert not runner.sthread.is_empty_handle(h)
+    return h
+
+def variousstackdepths_callback(h, arg):
+    assert runner.nextstep == runner.status
+    runner.nextstep = -1
+    arg = rffi.cast(lltype.Signed, arg)
+    assert arg == runner.gointo
+    self = runner.tasks[arg]
+    assert self.n == runner.gointo
+    assert not self.h
+    assert 0 <= runner.comefrom < 10
+    task = runner.tasks[runner.comefrom]
+    assert not task.h
+    assert bool(h) and not runner.sthread.is_empty_handle(h)
+    task.h = h
+    runner.comefrom = -1
+    runner.gointo = -1
+
+    while self.withdepth(runner.random.genrand32() % 20) == 0:
+        assert len(self.lst) == 0
+
+    assert len(self.lst) == 0
+    assert not self.h
+    while 1:
+        n = intmask(runner.random.genrand32() % 10)
+        h = runner.tasks[n].h
+        if h:
+            break
+
+    assert not runner.sthread.is_empty_handle(h)
+    runner.tasks[n].h = runner.sthread.get_null_handle()
+    runner.comefrom = -42
+    runner.gointo = n
+    assert runner.nextstep == -1
+    runner.status += 1
+    runner.nextstep = runner.status
+    print "LEAVING %d to go to %d" % (self.n, n)
+    return h
+
+
+def entry_point(argv):
+    seed = 0
+    if len(argv) > 1:
+        seed = int(argv[1])
+    runner.init(seed)
+    for name, meth in Runner.TESTS:
+        print '-----', name, '-----'
+        meth(runner)
+    print '----- all done -----'
+    runner.done()
+    return 0
+
+
+class BaseTestStacklet(StandaloneTests):
+
+    def setup_class(cls):
+        from pypy.config.pypyoption import get_pypy_config
+        config = get_pypy_config(translating=True)
+        config.translation.gc = cls.gc
+        if cls.gcrootfinder is not None:
+            config.translation.continuation = True
+            config.translation.gcrootfinder = cls.gcrootfinder
+            GCROOTFINDER = cls.gcrootfinder
+        cls.config = config
+        cls.old_values = Runner.config, Runner.STATUSMAX
+        Runner.config = config
+        Runner.STATUSMAX = 25000
+
+    def teardown_class(cls):
+        Runner.config, Runner.STATUSMAX = cls.old_values
+
+    def test_demo1(self):
+        t, cbuilder = self.compile(entry_point)
+
+        for i in range(15):
+            if (i & 1) == 0:
+                env = {}
+            else:
+                env = {'PYPY_GC_NURSERY': '2k'}
+            print 'running %s/%s with arg=%d and env=%r' % (
+                self.gc, self.gcrootfinder, i, env)
+            data = cbuilder.cmdexec('%d' % i, env=env)
+            assert data.endswith("----- all done -----\n")
+            for name, meth in Runner.TESTS:
+                assert ('----- %s -----\n' % name) in data
+
+
+class DONTTestStackletBoehm(BaseTestStacklet):
+    # Boehm does not work well with stacklets, probably because the
+    # moved-away copies of the stack are parsed using a different
+    # selection logic than the real stack
+    gc = 'boehm'
+    gcrootfinder = None
+
+class TestStackletAsmGcc(BaseTestStacklet):
+    gc = 'minimark'
+    gcrootfinder = 'asmgcc'
+
+class TestStackletShadowStack(BaseTestStacklet):
+    gc = 'minimark'
+    gcrootfinder = 'shadowstack'
+
+
+def target(*args):
+    return entry_point, None
+
+if __name__ == '__main__':
+    import sys
+    sys.exit(entry_point(sys.argv))
diff --git a/pypy/rpython/extfuncregistry.py b/pypy/rpython/extfuncregistry.py
--- a/pypy/rpython/extfuncregistry.py
+++ b/pypy/rpython/extfuncregistry.py
@@ -44,32 +44,28 @@
        ('log10', [float], float),
        ('sin', [float], float),
        ('cos', [float], float),
+       ('atan2', [float, float], float),
+       ('hypot', [float, float], float),
+       ('frexp', [float], (float, int)),
+       ('ldexp', [float, int], float),
+       ('modf', [float], (float, float)),
+       ('fmod', [float, float], float),
+       ('pow', [float, float], float),
     ]),
 ]
 for module, methods in _register:
     for name, arg_types, return_type in methods:
         method_name = 'll_math_%s' % name
+        oofake = None
+        # Things with a tuple return type have a fake impl for RPython, check
+        # to see if the method has one.
+        if hasattr(oo_math, method_name):
+            oofake = getattr(oo_math, method_name)
         register_external(getattr(module, name), arg_types, return_type,
                           export_name='ll_math.%s' % method_name,
                           sandboxsafe=True,
-                          llimpl=getattr(ll_math, method_name))
-
-
-complex_math_functions = [
-    ('frexp', [float],        (float, int)),
-    ('ldexp', [float, int],   float),
-    ('modf',  [float],        (float, float)),
-    ] + [(name, [float, float], float)
-         for name in 'atan2', 'fmod', 'hypot', 'pow']
-
-for name, args, res in complex_math_functions:
-    func = getattr(math, name)
-    llimpl = getattr(ll_math, 'll_math_%s' % name, None)
-    oofake = getattr(oo_math, 'll_math_%s' % name, None)
-    register_external(func, args, res, 'll_math.ll_math_%s' % name,
-                      llimpl=llimpl, oofakeimpl=oofake,
-                      sandboxsafe=True)
-
+                          llimpl=getattr(ll_math, method_name),
+                          oofakeimpl=oofake)
 
 # ___________________________
 # os.path functions
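
The consolidated loop above registers every math entry the same way, attaching an oofakeimpl only when oo_math defines one. A rough self-contained sketch of the pattern follows; register_external here is a dummy recorder, and ll_math_impls/fake_impls are hypothetical tables standing in for the real ll_math and oo_math modules.

    import math

    ll_math_impls = {'ll_math_hypot': lambda x, y: math.hypot(x, y)}
    fake_impls = {}     # only tuple-returning functions would get a fake impl

    def register_external(func, args, result, export_name,
                          sandboxsafe=False, llimpl=None, oofakeimpl=None):
        # dummy stand-in that just records what would be registered
        print 'register %s -> %s (oofake? %r)' % (func.__name__, export_name,
                                                  oofakeimpl is not None)

    _register = [(math, [('hypot', [float, float], float)])]
    for module, methods in _register:
        for name, arg_types, return_type in methods:
            method_name = 'll_math_%s' % name
            oofake = fake_impls.get(method_name)
            register_external(getattr(module, name), arg_types, return_type,
                              export_name='ll_math.%s' % method_name,
                              sandboxsafe=True,
                              llimpl=ll_math_impls.get(method_name),
                              oofakeimpl=oofake)
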
diff --git a/pypy/rpython/llinterp.py b/pypy/rpython/llinterp.py
--- a/pypy/rpython/llinterp.py
+++ b/pypy/rpython/llinterp.py
@@ -675,21 +675,6 @@
             #log.warn("op_indirect_call with graphs=None:", f)
         return self.op_direct_call(f, *args)
 
-    def op_adr_call(self, TGT, f, *inargs):
-        checkadr(f)
-        obj = self.llinterpreter.typer.type_system.deref(f.ref())
-        assert hasattr(obj, 'graph') # don't want to think about that
-        graph = obj.graph
-        args = []
-        for inarg, arg in zip(inargs, obj.graph.startblock.inputargs):
-            args.append(lltype._cast_whatever(arg.concretetype, inarg))
-        frame = self.newsubframe(graph, args)
-        result = frame.eval()
-        from pypy.translator.stackless.frame import storage_type
-        assert storage_type(lltype.typeOf(result)) == TGT
-        return lltype._cast_whatever(TGT, result)
-    op_adr_call.need_result_type = True
-
     def op_malloc(self, obj, flags):
         flavor = flags['flavor']
         zero = flags.get('zero', False)
@@ -840,10 +825,11 @@
 
     def op_gc_adr_of_nursery_top(self):
         raise NotImplementedError
-
     def op_gc_adr_of_nursery_free(self):
         raise NotImplementedError
 
+    def op_gc_adr_of_root_stack_base(self):
+        raise NotImplementedError
     def op_gc_adr_of_root_stack_top(self):
         raise NotImplementedError
 
@@ -894,6 +880,21 @@
     def op_gc_stack_bottom(self):
         pass       # marker for trackgcroot.py
 
+    def op_gc_shadowstackref_new(self):   # stacklet+shadowstack
+        raise NotImplementedError("gc_shadowstackref_new")
+    def op_gc_shadowstackref_context(self):
+        raise NotImplementedError("gc_shadowstackref_context")
+    def op_gc_shadowstackref_destroy(self):
+        raise NotImplementedError("gc_shadowstackref_destroy")
+    def op_gc_save_current_state_away(self):
+        raise NotImplementedError("gc_save_current_state_away")
+    def op_gc_forget_current_state(self):
+        raise NotImplementedError("gc_forget_current_state")
+    def op_gc_restore_state_from(self):
+        raise NotImplementedError("gc_restore_state_from")
+    def op_gc_start_fresh_new_state(self):
+        raise NotImplementedError("gc_start_fresh_new_state")
+
     def op_gc_get_type_info_group(self):
         raise NotImplementedError("gc_get_type_info_group")
 
@@ -930,27 +931,6 @@
     def op_get_write_barrier_from_array_failing_case(self):
         raise NotImplementedError("get_write_barrier_from_array_failing_case")
 
-    def op_yield_current_frame_to_caller(self):
-        raise NotImplementedError("yield_current_frame_to_caller")
-
-    def op_stack_frames_depth(self):
-        return len(self.llinterpreter.frame_stack)
-
-    def op_stack_switch(self, frametop):
-        raise NotImplementedError("stack_switch")
-
-    def op_stack_unwind(self):
-        raise NotImplementedError("stack_unwind")
-
-    def op_stack_capture(self):
-        raise NotImplementedError("stack_capture")
-
-    def op_get_stack_depth_limit(self):
-        raise NotImplementedError("get_stack_depth_limit")
-
-    def op_set_stack_depth_limit(self):
-        raise NotImplementedError("set_stack_depth_limit")
-
     def op_stack_current(self):
         return 0
 
@@ -1131,16 +1111,6 @@
         assert isinstance(x, (int, Symbolic))
         return bool(x)
 
-    # read frame var support
-
-    def op_get_frame_base(self):
-        self._obj0 = self        # hack
-        return llmemory.fakeaddress(self)
-
-    def op_frame_info(self, *vars):
-        pass
-    op_frame_info.specialform = True
-
     # hack for jit.codegen.llgraph
 
     def op_check_and_clear_exc(self):
diff --git a/pypy/rpython/lltypesystem/ll2ctypes.py b/pypy/rpython/lltypesystem/ll2ctypes.py
--- a/pypy/rpython/lltypesystem/ll2ctypes.py
+++ b/pypy/rpython/lltypesystem/ll2ctypes.py
@@ -20,7 +20,7 @@
 from pypy.rpython.extfunc import ExtRegistryEntry
 from pypy.rlib.objectmodel import Symbolic, ComputedIntSymbolic
 from pypy.tool.uid import fixid
-from pypy.rlib.rarithmetic import r_uint, r_singlefloat, r_longfloat, intmask
+from pypy.rlib.rarithmetic import r_uint, r_singlefloat, r_longfloat, base_int, intmask
 from pypy.annotation import model as annmodel
 from pypy.rpython.llinterp import LLInterpreter, LLException
 from pypy.rpython.lltypesystem.rclass import OBJECT, OBJECT_VTABLE
@@ -113,7 +113,7 @@
         rffi.LONGLONG:   ctypes.c_longlong,
         rffi.ULONGLONG:  ctypes.c_ulonglong,
         rffi.SIZE_T:     ctypes.c_size_t,
-        lltype.Bool:     ctypes.c_long, # XXX
+        lltype.Bool:     ctypes.c_bool,
         llmemory.Address:  ctypes.c_void_p,
         llmemory.GCREF:    ctypes.c_void_p,
         llmemory.WeakRef:  ctypes.c_void_p, # XXX
@@ -1098,6 +1098,8 @@
     for i in range(len(FUNCTYPE.ARGS)):
         if FUNCTYPE.ARGS[i] is lltype.Void:
             void_arguments.append(i)
+    def callme(cargs):   # an extra indirection: workaround for rlib.rstacklet
+        return cfunc(*cargs)
     def invoke_via_ctypes(*argvalues):
         global _callback_exc_info
         cargs = []
@@ -1109,7 +1111,7 @@
                 cargs.append(cvalue)
         _callback_exc_info = None
         _restore_c_errno()
-        cres = cfunc(*cargs)
+        cres = callme(cargs)
         _save_c_errno()
         if _callback_exc_info:
             etype, evalue, etb = _callback_exc_info
@@ -1140,6 +1142,8 @@
             cvalue = 0
     elif isinstance(cvalue, (str, unicode)):
         cvalue = ord(cvalue)     # character -> integer
+    elif hasattr(RESTYPE, "_type") and issubclass(RESTYPE._type, base_int):
+        cvalue = int(cvalue)
 
     if not isinstance(cvalue, (int, long, float)):
         raise NotImplementedError("casting %r to %r" % (TYPE1, RESTYPE))
diff --git a/pypy/rpython/lltypesystem/lloperation.py b/pypy/rpython/lltypesystem/lloperation.py
--- a/pypy/rpython/lltypesystem/lloperation.py
+++ b/pypy/rpython/lltypesystem/lloperation.py
@@ -9,7 +9,7 @@
 class LLOp(object):
 
     def __init__(self, sideeffects=True, canfold=False, canraise=(),
-                 pyobj=False, canunwindgc=False, canrun=False, oo=False,
+                 pyobj=False, canmallocgc=False, canrun=False, oo=False,
                  tryfold=False):
         # self.opname = ... (set afterwards)
 
@@ -36,12 +36,12 @@
         # The operation manipulates PyObjects
         self.pyobj = pyobj
 
-        # The operation can unwind the stack in a stackless gc build
-        self.canunwindgc = canunwindgc
-        if canunwindgc:
-            if (StackException not in self.canraise and
+        # The operation can do a GC malloc
+        self.canmallocgc = canmallocgc
+        if canmallocgc:
+            if (MemoryError not in self.canraise and
                 Exception not in self.canraise):
-                self.canraise += (StackException,)
+                self.canraise += (MemoryError,)
 
         # The operation can be run directly with __call__
         self.canrun = canrun or canfold
@@ -175,10 +175,6 @@
         return hop.genop(op.opname, args_v, resulttype=hop.r_result.lowleveltype)
 
 
-class StackException(Exception):
-    """Base for internal exceptions possibly used by the stackless
-    implementation."""
-
 # ____________________________________________________________
 #
 # This list corresponds to the operations implemented by the LLInterpreter.
@@ -356,10 +352,10 @@
 
     # __________ pointer operations __________
 
-    'malloc':               LLOp(canraise=(MemoryError,), canunwindgc=True),
-    'malloc_varsize':       LLOp(canraise=(MemoryError,), canunwindgc=True),
-    'malloc_nonmovable':    LLOp(canraise=(MemoryError,), canunwindgc=True),
-    'malloc_nonmovable_varsize':LLOp(canraise=(MemoryError,),canunwindgc=True),
+    'malloc':               LLOp(canmallocgc=True),
+    'malloc_varsize':       LLOp(canmallocgc=True),
+    'malloc_nonmovable':    LLOp(canmallocgc=True),
+    'malloc_nonmovable_varsize':LLOp(canmallocgc=True),
     'shrink_array':         LLOp(canrun=True),
     'zero_gc_pointers_inside': LLOp(),
     'free':                 LLOp(),
@@ -414,7 +410,6 @@
     'adr_ne':               LLOp(canfold=True),
     'adr_gt':               LLOp(canfold=True),
     'adr_ge':               LLOp(canfold=True),
-    'adr_call':             LLOp(canraise=(Exception,)),
     'cast_ptr_to_adr':      LLOp(sideeffects=False),
     'cast_adr_to_ptr':      LLOp(canfold=True),
     'cast_adr_to_int':      LLOp(sideeffects=False),
@@ -436,8 +431,8 @@
     'jit_force_quasi_immutable': LLOp(canrun=True),
     'get_exception_addr':   LLOp(),
     'get_exc_value_addr':   LLOp(),
-    'do_malloc_fixedsize_clear':LLOp(canraise=(MemoryError,),canunwindgc=True),
-    'do_malloc_varsize_clear':  LLOp(canraise=(MemoryError,),canunwindgc=True),
+    'do_malloc_fixedsize_clear':LLOp(canmallocgc=True),
+    'do_malloc_varsize_clear':  LLOp(canmallocgc=True),
     'get_write_barrier_failing_case': LLOp(sideeffects=False),
     'get_write_barrier_from_array_failing_case': LLOp(sideeffects=False),
     'gc_get_type_info_group': LLOp(sideeffects=False),
@@ -445,7 +440,7 @@
 
     # __________ GC operations __________
 
-    'gc__collect':          LLOp(canunwindgc=True),
+    'gc__collect':          LLOp(canmallocgc=True),
     'gc_free':              LLOp(),
     'gc_fetch_exception':   LLOp(),
     'gc_restore_exception': LLOp(),
@@ -455,17 +450,12 @@
     'gc_pop_alive_pyobj':   LLOp(),
     'gc_reload_possibly_moved': LLOp(),
     # see rlib/objectmodel for gc_identityhash and gc_id
-    'gc_identityhash':      LLOp(canraise=(MemoryError,), sideeffects=False,
-                                 canunwindgc=True),
-    'gc_id':                LLOp(canraise=(MemoryError,), sideeffects=False),
-                                 # ^^^ but canunwindgc=False, as it is
-                                 # allocating non-GC structures only
+    'gc_identityhash':      LLOp(sideeffects=False, canmallocgc=True),
+    'gc_id':                LLOp(sideeffects=False, canmallocgc=True),
     'gc_obtain_free_space': LLOp(),
     'gc_set_max_heap_size': LLOp(),
     'gc_can_move'         : LLOp(sideeffects=False),
-    'gc_thread_prepare'   : LLOp(canraise=(MemoryError,)),
-                                 # ^^^ but canunwindgc=False, as it is
-                                 # allocating non-GC structures only
+    'gc_thread_prepare'   : LLOp(canmallocgc=True),
     'gc_thread_run'       : LLOp(),
     'gc_thread_start'     : LLOp(),

