[pypy-commit] pypy unroll-if-const: merged upstream.

Tue Sep 13 01:55:51 CEST 2011

Author: Alex Gaynor <alex.gaynor at gmail.com>
Branch: unroll-if-const
Changeset: r47234:d886ce5dd420
Date: 2011-09-12 19:52 -0400
http://bitbucket.org/pypy/pypy/changeset/d886ce5dd420/

Log:	merged upstream.

diff too long, truncating to 10000 out of 49037 lines

diff --git a/.hgtags b/.hgtags
--- a/.hgtags
+++ b/.hgtags
@@ -1,1 +1,2 @@
 b590cf6de4190623aad9aa698694c22e614d67b9 release-1.5
+b48df0bf4e75b81d98f19ce89d4a7dc3e1dab5e5 benchmarked
diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -37,22 +37,22 @@
     Armin Rigo
     Maciej Fijalkowski
     Carl Friedrich Bolz
+    Antonio Cuni
     Amaury Forgeot d'Arc
-    Antonio Cuni
     Samuele Pedroni
     Michael Hudson
     Holger Krekel
+    Benjamin Peterson
     Christian Tismer
-    Benjamin Peterson
+    Hakan Ardo
+    Alex Gaynor
     Eric van Riet Paap
-    Anders Chrigstr&#246;m
-    H&#229;kan Ard&#246;
+    Anders Chrigstrom
+    David Schneider
     Richard Emslie
     Dan Villiom Podlaski Christiansen
     Alexander Schremmer
-    Alex Gaynor
-    David Schneider
-    Aureli&#233;n Campeas
+    Aurelien Campeas
     Anders Lehmann
     Camillo Bruni
     Niklaus Haldimann
@@ -63,16 +63,17 @@
     Bartosz Skowron
     Jakub Gustak
     Guido Wesdorp
+    Daniel Roberts
     Adrien Di Mascio
     Laura Creighton
     Ludovic Aubry
     Niko Matsakis
-    Daniel Roberts
     Jason Creighton
-    Jacob Hall&#233;n
+    Jacob Hallen
     Alex Martelli
     Anders Hammarquist
     Jan de Mooij
+    Wim Lavrijsen
     Stephan Diehl
     Michael Foord
     Stefan Schwarzer
@@ -83,9 +84,13 @@
     Alexandre Fayolle
     Marius Gedminas
     Simon Burton
+    Justin Peel
     Jean-Paul Calderone
     John Witulski
+    Lukas Diekmann
+    holger krekel
     Wim Lavrijsen
+    Dario Bertini
     Andreas St&#252;hrk
     Jean-Philippe St. Pierre
     Guido van Rossum
@@ -97,15 +102,16 @@
     Georg Brandl
     Gerald Klix
     Wanja Saatkamp
+    Ronny Pfannschmidt
     Boris Feigin
     Oscar Nierstrasz
-    Dario Bertini
     David Malcolm
     Eugene Oden
     Henry Mason
+    Sven Hager
     Lukas Renggli
+    Ilya Osadchiy
     Guenter Jantzen
-    Ronny Pfannschmidt
     Bert Freudenberg
     Amit Regmi
     Ben Young
@@ -122,8 +128,8 @@
     Jared Grubb
     Karl Bartel
     Gabriel Lavoie
+    Victor Stinner
     Brian Dorsey
-    Victor Stinner
     Stuart Williams
     Toby Watson
     Antoine Pitrou
@@ -134,19 +140,23 @@
     Jonathan David Riehl
     Elmo M&#228;ntynen
     Anders Qvist
-    Beatrice D&#252;ring
+    Beatrice During
     Alexander Sedov
+    Timo Paulssen
+    Corbin Simpson
     Vincent Legoll
+    Romain Guillebert
     Alan McIntyre
-    Romain Guillebert
     Alex Perry
     Jens-Uwe Mager
+    Simon Cross
     Dan Stromberg
-    Lukas Diekmann
+    Guillebert Romain
     Carl Meyer
     Pieter Zieschang
     Alejandro J. Cura
     Sylvain Thenault
+    Christoph Gerum
     Travis Francis Athougies
     Henrik Vendelbo
     Lutz Paelike
@@ -157,6 +167,7 @@
     Miguel de Val Borro
     Ignas Mikalajunas
     Artur Lisiecki
+    Philip Jenvey
     Joshua Gilbert
     Godefroid Chappelle
     Yusei Tahara
@@ -165,27 +176,31 @@
     Gustavo Niemeyer
     William Leslie
     Akira Li
-    Kristj&#225;n Valur J&#243;nsson
+    Kristjan Valur Jonsson
     Bobby Impollonia
+    Michael Hudson-Doyle
     Andrew Thompson
     Anders Sigfridsson
+    Floris Bruynooghe
     Jacek Generowicz
     Dan Colish
-    Sven Hager
     Zooko Wilcox-O Hearn
+    Dan Villiom Podlaski Christiansen
     Anders Hammarquist
+    Chris Lambacher
     Dinu Gherman
     Dan Colish
+    Brett Cannon
     Daniel Neuh&#228;user
     Michael Chermside
     Konrad Delong
     Anna Ravencroft
     Greg Price
     Armin Ronacher
+    Christian Muirhead
     Jim Baker
-    Philip Jenvey
     Rodrigo Ara&#250;jo
-    Brett Cannon
+    Romain Guillebert
 
     Heinrich-Heine University, Germany 
     Open End AB (formerly AB Strakt), Sweden
diff --git a/ctypes_configure/configure.py b/ctypes_configure/configure.py
--- a/ctypes_configure/configure.py
+++ b/ctypes_configure/configure.py
@@ -559,7 +559,9 @@
 C_HEADER = """
 #include <stdio.h>
 #include <stddef.h>   /* for offsetof() */
-#include <stdint.h>   /* FreeBSD: for uint64_t */
+#ifndef _WIN32
+#  include <stdint.h>   /* FreeBSD: for uint64_t */
+#endif
 
 void dump(char* key, int value) {
     printf("%s: %d\\n", key, value);
diff --git a/ctypes_configure/stdoutcapture.py b/ctypes_configure/stdoutcapture.py
--- a/ctypes_configure/stdoutcapture.py
+++ b/ctypes_configure/stdoutcapture.py
@@ -15,6 +15,15 @@
             not hasattr(os, 'fdopen')):
             self.dummy = 1
         else:
+            try:
+                self.tmpout = os.tmpfile()
+                if mixed_out_err:
+                    self.tmperr = self.tmpout
+                else:
+                    self.tmperr = os.tmpfile()
+            except OSError:     # bah?  on at least one Windows box
+                self.dummy = 1
+                return
             self.dummy = 0
             # make new stdout/stderr files if needed
             self.localoutfd = os.dup(1)
@@ -29,11 +38,6 @@
                 sys.stderr = os.fdopen(self.localerrfd, 'w', 0)
             else:
                 self.saved_stderr = None
-            self.tmpout = os.tmpfile()
-            if mixed_out_err:
-                self.tmperr = self.tmpout
-            else:
-                self.tmperr = os.tmpfile()
             os.dup2(self.tmpout.fileno(), 1)
             os.dup2(self.tmperr.fileno(), 2)
 
diff --git a/lib-python/conftest.py b/lib-python/conftest.py
--- a/lib-python/conftest.py
+++ b/lib-python/conftest.py
@@ -154,18 +154,18 @@
     RegrTest('test_cmd.py'),
     RegrTest('test_cmd_line_script.py'),
     RegrTest('test_codeccallbacks.py', core=True),
-    RegrTest('test_codecencodings_cn.py'),
-    RegrTest('test_codecencodings_hk.py'),
-    RegrTest('test_codecencodings_jp.py'),
-    RegrTest('test_codecencodings_kr.py'),
-    RegrTest('test_codecencodings_tw.py'),
+    RegrTest('test_codecencodings_cn.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_hk.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_jp.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_kr.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_tw.py', usemodules='_multibytecodec'),
 
-    RegrTest('test_codecmaps_cn.py'),
-    RegrTest('test_codecmaps_hk.py'),
-    RegrTest('test_codecmaps_jp.py'),
-    RegrTest('test_codecmaps_kr.py'),
-    RegrTest('test_codecmaps_tw.py'),
-    RegrTest('test_codecs.py', core=True),
+    RegrTest('test_codecmaps_cn.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_hk.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_jp.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_kr.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_tw.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecs.py', core=True, usemodules='_multibytecodec'),
     RegrTest('test_codeop.py', core=True),
     RegrTest('test_coercion.py', core=True),
     RegrTest('test_collections.py'),
@@ -314,7 +314,7 @@
     RegrTest('test_mmap.py'),
     RegrTest('test_module.py', core=True),
     RegrTest('test_modulefinder.py'),
-    RegrTest('test_multibytecodec.py'),
+    RegrTest('test_multibytecodec.py', usemodules='_multibytecodec'),
     RegrTest('test_multibytecodec_support.py', skip="not a test"),
     RegrTest('test_multifile.py'),
     RegrTest('test_multiprocessing.py', skip='FIXME leaves subprocesses'),
@@ -359,7 +359,7 @@
     RegrTest('test_property.py', core=True),
     RegrTest('test_pstats.py'),
     RegrTest('test_pty.py', skip="unsupported extension module"),
-    RegrTest('test_pwd.py', skip=skip_win32),
+    RegrTest('test_pwd.py', usemodules="pwd", skip=skip_win32),
     RegrTest('test_py3kwarn.py'),
     RegrTest('test_pyclbr.py'),
     RegrTest('test_pydoc.py'),
diff --git a/lib-python/modified-2.7/ctypes/__init__.py b/lib-python/modified-2.7/ctypes/__init__.py
--- a/lib-python/modified-2.7/ctypes/__init__.py
+++ b/lib-python/modified-2.7/ctypes/__init__.py
@@ -489,9 +489,12 @@
         _flags_ = _FUNCFLAG_CDECL | _FUNCFLAG_PYTHONAPI
     return CFunctionType
 
-_cast = PYFUNCTYPE(py_object, c_void_p, py_object, py_object)(_cast_addr)
 def cast(obj, typ):
-    return _cast(obj, obj, typ)
+    try:
+        c_void_p.from_param(obj)
+    except TypeError, e:
+        raise ArgumentError(str(e))
+    return _cast_addr(obj, obj, typ)
 
 _string_at = PYFUNCTYPE(py_object, c_void_p, c_int)(_string_at_addr)
 def string_at(ptr, size=-1):
diff --git a/lib-python/modified-2.7/ctypes/util.py b/lib-python/modified-2.7/ctypes/util.py
--- a/lib-python/modified-2.7/ctypes/util.py
+++ b/lib-python/modified-2.7/ctypes/util.py
@@ -72,8 +72,8 @@
         return name
 
 if os.name == "posix" and sys.platform == "darwin":
-    from ctypes.macholib.dyld import dyld_find as _dyld_find
     def find_library(name):
+        from ctypes.macholib.dyld import dyld_find as _dyld_find
         possible = ['lib%s.dylib' % name,
                     '%s.dylib' % name,
                     '%s.framework/%s' % (name, name)]
diff --git a/lib-python/modified-2.7/distutils/unixccompiler.py b/lib-python/modified-2.7/distutils/unixccompiler.py
--- a/lib-python/modified-2.7/distutils/unixccompiler.py
+++ b/lib-python/modified-2.7/distutils/unixccompiler.py
@@ -324,7 +324,7 @@
             # On OSX users can specify an alternate SDK using
             # '-isysroot', calculate the SDK root if it is specified
             # (and use it further on)
-            cflags = sysconfig.get_config_var('CFLAGS')
+            cflags = sysconfig.get_config_var('CFLAGS') or ''
             m = re.search(r'-isysroot\s+(\S+)', cflags)
             if m is None:
                 sysroot = '/'
diff --git a/lib-python/modified-2.7/gzip.py b/lib-python/modified-2.7/gzip.py
new file mode 100644
--- /dev/null
+++ b/lib-python/modified-2.7/gzip.py
@@ -0,0 +1,514 @@
+"""Functions that read and write gzipped files.
+
+The user of the file doesn't have to worry about the compression,
+but random access is not allowed."""
+
+# based on Andrew Kuchling's minigzip.py distributed with the zlib module
+
+import struct, sys, time, os
+import zlib
+import io
+import __builtin__
+
+__all__ = ["GzipFile","open"]
+
+FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
+
+READ, WRITE = 1, 2
+
+def write32u(output, value):
+    # The L format writes the bit pattern correctly whether signed
+    # or unsigned.
+    output.write(struct.pack("<L", value))
+
+def read32(input):
+    return struct.unpack("<I", input.read(4))[0]
+
+def open(filename, mode="rb", compresslevel=9):
+    """Shorthand for GzipFile(filename, mode, compresslevel).
+
+    The filename argument is required; mode defaults to 'rb'
+    and compresslevel defaults to 9.
+
+    """
+    return GzipFile(filename, mode, compresslevel)
+
+class GzipFile(io.BufferedIOBase):
+    """The GzipFile class simulates most of the methods of a file object with
+    the exception of the readinto() and truncate() methods.
+
+    """
+
+    myfileobj = None
+    max_read_chunk = 10 * 1024 * 1024   # 10Mb
+
+    def __init__(self, filename=None, mode=None,
+                 compresslevel=9, fileobj=None, mtime=None):
+        """Constructor for the GzipFile class.
+
+        At least one of fileobj and filename must be given a
+        non-trivial value.
+
+        The new class instance is based on fileobj, which can be a regular
+        file, a StringIO object, or any other object which simulates a file.
+        It defaults to None, in which case filename is opened to provide
+        a file object.
+
+        When fileobj is not None, the filename argument is only used to be
+        included in the gzip file header, which may includes the original
+        filename of the uncompressed file.  It defaults to the filename of
+        fileobj, if discernible; otherwise, it defaults to the empty string,
+        and in this case the original filename is not included in the header.
+
+        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
+        depending on whether the file will be read or written.  The default
+        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
+        Be aware that only the 'rb', 'ab', and 'wb' values should be used
+        for cross-platform portability.
+
+        The compresslevel argument is an integer from 1 to 9 controlling the
+        level of compression; 1 is fastest and produces the least compression,
+        and 9 is slowest and produces the most compression.  The default is 9.
+
+        The mtime argument is an optional numeric timestamp to be written
+        to the stream when compressing.  All gzip compressed streams
+        are required to contain a timestamp.  If omitted or None, the
+        current time is used.  This module ignores the timestamp when
+        decompressing; however, some programs, such as gunzip, make use
+        of it.  The format of the timestamp is the same as that of the
+        return value of time.time() and of the st_mtime member of the
+        object returned by os.stat().
+
+        """
+
+        # guarantee the file is opened in binary mode on platforms
+        # that care about that sort of thing
+        if mode and 'b' not in mode:
+            mode += 'b'
+        if fileobj is None:
+            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
+        if filename is None:
+            if hasattr(fileobj, 'name'): filename = fileobj.name
+            else: filename = ''
+        if mode is None:
+            if hasattr(fileobj, 'mode'): mode = fileobj.mode
+            else: mode = 'rb'
+
+        if mode[0:1] == 'r':
+            self.mode = READ
+            # Set flag indicating start of a new member
+            self._new_member = True
+            # Buffer data read from gzip file. extrastart is offset in
+            # stream where buffer starts. extrasize is number of
+            # bytes remaining in buffer from current stream position.
+            self.extrabuf = ""
+            self.extrasize = 0
+            self.extrastart = 0
+            self.name = filename
+            # Starts small, scales exponentially
+            self.min_readsize = 100
+
+        elif mode[0:1] == 'w' or mode[0:1] == 'a':
+            self.mode = WRITE
+            self._init_write(filename)
+            self.compress = zlib.compressobj(compresslevel,
+                                             zlib.DEFLATED,
+                                             -zlib.MAX_WBITS,
+                                             zlib.DEF_MEM_LEVEL,
+                                             0)
+        else:
+            raise IOError, "Mode " + mode + " not supported"
+
+        self.fileobj = fileobj
+        self.offset = 0
+        self.mtime = mtime
+
+        if self.mode == WRITE:
+            self._write_gzip_header()
+
+    @property
+    def filename(self):
+        import warnings
+        warnings.warn("use the name attribute", DeprecationWarning, 2)
+        if self.mode == WRITE and self.name[-3:] != ".gz":
+            return self.name + ".gz"
+        return self.name
+
+    def __repr__(self):
+        s = repr(self.fileobj)
+        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
+
+    def _check_closed(self):
+        """Raises a ValueError if the underlying file object has been closed.
+
+        """
+        if self.closed:
+            raise ValueError('I/O operation on closed file.')
+
+    def _init_write(self, filename):
+        self.name = filename
+        self.crc = zlib.crc32("") & 0xffffffffL
+        self.size = 0
+        self.writebuf = []
+        self.bufsize = 0
+
+    def _write_gzip_header(self):
+        self.fileobj.write('\037\213')             # magic header
+        self.fileobj.write('\010')                 # compression method
+        fname = os.path.basename(self.name)
+        if fname.endswith(".gz"):
+            fname = fname[:-3]
+        flags = 0
+        if fname:
+            flags = FNAME
+        self.fileobj.write(chr(flags))
+        mtime = self.mtime
+        if mtime is None:
+            mtime = time.time()
+        write32u(self.fileobj, long(mtime))
+        self.fileobj.write('\002')
+        self.fileobj.write('\377')
+        if fname:
+            self.fileobj.write(fname + '\000')
+
+    def _init_read(self):
+        self.crc = zlib.crc32("") & 0xffffffffL
+        self.size = 0
+
+    def _read_gzip_header(self):
+        magic = self.fileobj.read(2)
+        if magic != '\037\213':
+            raise IOError, 'Not a gzipped file'
+        method = ord( self.fileobj.read(1) )
+        if method != 8:
+            raise IOError, 'Unknown compression method'
+        flag = ord( self.fileobj.read(1) )
+        self.mtime = read32(self.fileobj)
+        # extraflag = self.fileobj.read(1)
+        # os = self.fileobj.read(1)
+        self.fileobj.read(2)
+
+        if flag & FEXTRA:
+            # Read & discard the extra field, if present
+            xlen = ord(self.fileobj.read(1))
+            xlen = xlen + 256*ord(self.fileobj.read(1))
+            self.fileobj.read(xlen)
+        if flag & FNAME:
+            # Read and discard a null-terminated string containing the filename
+            while True:
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
+        if flag & FCOMMENT:
+            # Read and discard a null-terminated string containing a comment
+            while True:
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
+        if flag & FHCRC:
+            self.fileobj.read(2)     # Read & discard the 16-bit header CRC
+
+    def write(self,data):
+        self._check_closed()
+        if self.mode != WRITE:
+            import errno
+            raise IOError(errno.EBADF, "write() on read-only GzipFile object")
+
+        if self.fileobj is None:
+            raise ValueError, "write() on closed GzipFile object"
+
+        # Convert data type if called by io.BufferedWriter.
+        if isinstance(data, memoryview):
+            data = data.tobytes()
+
+        if len(data) > 0:
+            self.size = self.size + len(data)
+            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
+            self.fileobj.write( self.compress.compress(data) )
+            self.offset += len(data)
+
+        return len(data)
+
+    def read(self, size=-1):
+        self._check_closed()
+        if self.mode != READ:
+            import errno
+            raise IOError(errno.EBADF, "read() on write-only GzipFile object")
+
+        if self.extrasize <= 0 and self.fileobj is None:
+            return ''
+
+        readsize = 1024
+        if size < 0:        # get the whole thing
+            try:
+                while True:
+                    self._read(readsize)
+                    readsize = min(self.max_read_chunk, readsize * 2)
+            except EOFError:
+                size = self.extrasize
+        elif size == 0:
+            return ""
+        else:               # just get some more of it
+            try:
+                while size > self.extrasize:
+                    self._read(readsize)
+                    readsize = min(self.max_read_chunk, readsize * 2)
+            except EOFError:
+                if size > self.extrasize:
+                    size = self.extrasize
+
+        offset = self.offset - self.extrastart
+        chunk = self.extrabuf[offset: offset + size]
+        self.extrasize = self.extrasize - size
+
+        self.offset += size
+        return chunk
+
+    def _unread(self, buf):
+        self.extrasize = len(buf) + self.extrasize
+        self.offset -= len(buf)
+
+    def _read(self, size=1024):
+        if self.fileobj is None:
+            raise EOFError, "Reached EOF"
+
+        if self._new_member:
+            # If the _new_member flag is set, we have to
+            # jump to the next member, if there is one.
+            #
+            # First, check if we're at the end of the file;
+            # if so, it's time to stop; no more members to read.
+            pos = self.fileobj.tell()   # Save current position
+            self.fileobj.seek(0, 2)     # Seek to end of file
+            if pos == self.fileobj.tell():
+                raise EOFError, "Reached EOF"
+            else:
+                self.fileobj.seek( pos ) # Return to original position
+
+            self._init_read()
+            self._read_gzip_header()
+            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
+            self._new_member = False
+
+        # Read a chunk of data from the file
+        buf = self.fileobj.read(size)
+
+        # If the EOF has been reached, flush the decompression object
+        # and mark this object as finished.
+
+        if buf == "":
+            uncompress = self.decompress.flush()
+            self._read_eof()
+            self._add_read_data( uncompress )
+            raise EOFError, 'Reached EOF'
+
+        uncompress = self.decompress.decompress(buf)
+        self._add_read_data( uncompress )
+
+        if self.decompress.unused_data != "":
+            # Ending case: we've come to the end of a member in the file,
+            # so seek back to the start of the unused data, finish up
+            # this member, and read a new gzip header.
+            # (The number of bytes to seek back is the length of the unused
+            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
+            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
+
+            # Check the CRC and file size, and set the flag so we read
+            # a new member on the next call
+            self._read_eof()
+            self._new_member = True
+
+    def _add_read_data(self, data):
+        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
+        offset = self.offset - self.extrastart
+        self.extrabuf = self.extrabuf[offset:] + data
+        self.extrasize = self.extrasize + len(data)
+        self.extrastart = self.offset
+        self.size = self.size + len(data)
+
+    def _read_eof(self):
+        # We've read to the end of the file, so we have to rewind in order
+        # to reread the 8 bytes containing the CRC and the file size.
+        # We check the that the computed CRC and size of the
+        # uncompressed data matches the stored values.  Note that the size
+        # stored is the true file size mod 2**32.
+        self.fileobj.seek(-8, 1)
+        crc32 = read32(self.fileobj)
+        isize = read32(self.fileobj)  # may exceed 2GB
+        if crc32 != self.crc:
+            raise IOError("CRC check failed %s != %s" % (hex(crc32),
+                                                         hex(self.crc)))
+        elif isize != (self.size & 0xffffffffL):
+            raise IOError, "Incorrect length of data produced"
+
+        # Gzip files can be padded with zeroes and still have archives.
+        # Consume all zero bytes and set the file position to the first
+        # non-zero byte. See http://www.gzip.org/#faq8
+        c = "\x00"
+        while c == "\x00":
+            c = self.fileobj.read(1)
+        if c:
+            self.fileobj.seek(-1, 1)
+
+    @property
+    def closed(self):
+        return self.fileobj is None
+
+    def close(self):
+        if self.fileobj is None:
+            return
+        if self.mode == WRITE:
+            self.fileobj.write(self.compress.flush())
+            write32u(self.fileobj, self.crc)
+            # self.size may exceed 2GB, or even 4GB
+            write32u(self.fileobj, self.size & 0xffffffffL)
+            self.fileobj = None
+        elif self.mode == READ:
+            self.fileobj = None
+        if self.myfileobj:
+            self.myfileobj.close()
+            self.myfileobj = None
+
+    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
+        self._check_closed()
+        if self.mode == WRITE:
+            # Ensure the compressor's buffer is flushed
+            self.fileobj.write(self.compress.flush(zlib_mode))
+            self.fileobj.flush()
+
+    def fileno(self):
+        """Invoke the underlying file object's fileno() method.
+
+        This will raise AttributeError if the underlying file object
+        doesn't support fileno().
+        """
+        return self.fileobj.fileno()
+
+    def rewind(self):
+        '''Return the uncompressed stream file position indicator to the
+        beginning of the file'''
+        if self.mode != READ:
+            raise IOError("Can't rewind in write mode")
+        self.fileobj.seek(0)
+        self._new_member = True
+        self.extrabuf = ""
+        self.extrasize = 0
+        self.extrastart = 0
+        self.offset = 0
+
+    def readable(self):
+        return self.mode == READ
+
+    def writable(self):
+        return self.mode == WRITE
+
+    def seekable(self):
+        return True
+
+    def seek(self, offset, whence=0):
+        if whence:
+            if whence == 1:
+                offset = self.offset + offset
+            else:
+                raise ValueError('Seek from end not supported')
+        if self.mode == WRITE:
+            if offset < self.offset:
+                raise IOError('Negative seek in write mode')
+            count = offset - self.offset
+            for i in range(count // 1024):
+                self.write(1024 * '\0')
+            self.write((count % 1024) * '\0')
+        elif self.mode == READ:
+            if offset == self.offset:
+                self.read(0) # to make sure that this file is open
+                return self.offset
+            if offset < self.offset:
+                # for negative seek, rewind and do positive seek
+                self.rewind()
+            count = offset - self.offset
+            for i in range(count // 1024):
+                self.read(1024)
+            self.read(count % 1024)
+
+        return self.offset
+
+    def readline(self, size=-1):
+        if size < 0:
+            # Shortcut common case - newline found in buffer.
+            offset = self.offset - self.extrastart
+            i = self.extrabuf.find('\n', offset) + 1
+            if i > 0:
+                self.extrasize -= i - offset
+                self.offset += i - offset
+                return self.extrabuf[offset: i]
+
+            size = sys.maxint
+            readsize = self.min_readsize
+        else:
+            readsize = size
+        bufs = []
+        while size != 0:
+            c = self.read(readsize)
+            i = c.find('\n')
+
+            # We set i=size to break out of the loop under two
+            # conditions: 1) there's no newline, and the chunk is
+            # larger than size, or 2) there is a newline, but the
+            # resulting line would be longer than 'size'.
+            if (size <= i) or (i == -1 and len(c) > size):
+                i = size - 1
+
+            if i >= 0 or c == '':
+                bufs.append(c[:i + 1])    # Add portion of last chunk
+                self._unread(c[i + 1:])   # Push back rest of chunk
+                break
+
+            # Append chunk to list, decrease 'size',
+            bufs.append(c)
+            size = size - len(c)
+            readsize = min(size, readsize * 2)
+        if readsize > self.min_readsize:
+            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
+        return ''.join(bufs) # Return resulting line
+
+
+def _test():
+    # Act like gzip; with -d, act like gunzip.
+    # The input file is not deleted, however, nor are any other gzip
+    # options or features supported.
+    args = sys.argv[1:]
+    decompress = args and args[0] == "-d"
+    if decompress:
+        args = args[1:]
+    if not args:
+        args = ["-"]
+    for arg in args:
+        if decompress:
+            if arg == "-":
+                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
+                g = sys.stdout
+            else:
+                if arg[-3:] != ".gz":
+                    print "filename doesn't end in .gz:", repr(arg)
+                    continue
+                f = open(arg, "rb")
+                g = __builtin__.open(arg[:-3], "wb")
+        else:
+            if arg == "-":
+                f = sys.stdin
+                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
+            else:
+                f = __builtin__.open(arg, "rb")
+                g = open(arg + ".gz", "wb")
+        while True:
+            chunk = f.read(1024)
+            if not chunk:
+                break
+            g.write(chunk)
+        if g is not sys.stdout:
+            g.close()
+        if f is not sys.stdin:
+            f.close()
+
+if __name__ == '__main__':
+    _test()
diff --git a/lib-python/modified-2.7/sqlite3/test/regression.py b/lib-python/modified-2.7/sqlite3/test/regression.py
--- a/lib-python/modified-2.7/sqlite3/test/regression.py
+++ b/lib-python/modified-2.7/sqlite3/test/regression.py
@@ -274,6 +274,18 @@
         cur.execute("UPDATE foo SET id = 3 WHERE id = 1")
         self.assertEqual(cur.description, None)
 
+    def CheckStatementCache(self):
+        cur = self.con.cursor()
+        cur.execute("CREATE TABLE foo (id INTEGER)")
+        values = [(i,) for i in xrange(5)]
+        cur.executemany("INSERT INTO foo (id) VALUES (?)", values)
+
+        cur.execute("SELECT id FROM foo")
+        self.assertEqual(list(cur), values)
+        self.con.commit()
+        cur.execute("SELECT id FROM foo")
+        self.assertEqual(list(cur), values)
+
 def suite():
     regression_suite = unittest.makeSuite(RegressionTests, "Check")
     return unittest.TestSuite((regression_suite,))
diff --git a/lib-python/modified-2.7/tarfile.py b/lib-python/modified-2.7/tarfile.py
--- a/lib-python/modified-2.7/tarfile.py
+++ b/lib-python/modified-2.7/tarfile.py
@@ -252,8 +252,8 @@
        the high bit set. So we calculate two checksums, unsigned and
        signed.
     """
-    unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
-    signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
+    unsigned_chksum = 256 + sum(struct.unpack("148B8x356B", buf[:512]))
+    signed_chksum = 256 + sum(struct.unpack("148b8x356b", buf[:512]))
     return unsigned_chksum, signed_chksum
 
 def copyfileobj(src, dst, length=None):
@@ -265,7 +265,6 @@
     if length is None:
         shutil.copyfileobj(src, dst)
         return
-
     BUFSIZE = 16 * 1024
     blocks, remainder = divmod(length, BUFSIZE)
     for b in xrange(blocks):
@@ -802,19 +801,19 @@
         if self.closed:
             raise ValueError("I/O operation on closed file")
 
-        buf = ""
         if self.buffer:
             if size is None:
-                buf = self.buffer
+                buf = self.buffer + self.fileobj.read()
                 self.buffer = ""
             else:
                 buf = self.buffer[:size]
                 self.buffer = self.buffer[size:]
-
-        if size is None:
-            buf += self.fileobj.read()
+                buf += self.fileobj.read(size - len(buf))
         else:
-            buf += self.fileobj.read(size - len(buf))
+            if size is None:
+                buf = self.fileobj.read()
+            else:
+                buf = self.fileobj.read(size)
 
         self.position += len(buf)
         return buf
diff --git a/lib-python/modified-2.7/test/regrtest.py b/lib-python/modified-2.7/test/regrtest.py
--- a/lib-python/modified-2.7/test/regrtest.py
+++ b/lib-python/modified-2.7/test/regrtest.py
@@ -1403,7 +1403,26 @@
         test_zipimport
         test_zlib
         """,
-    'openbsd3':
+    'openbsd4':
+        """
+        test_ascii_formatd
+        test_bsddb
+        test_bsddb3
+        test_ctypes
+        test_dl
+        test_epoll
+        test_gdbm
+        test_locale
+        test_normalization
+        test_ossaudiodev
+        test_pep277
+        test_tcl
+        test_tk
+        test_ttk_guionly
+        test_ttk_textonly
+        test_multiprocessing
+        """,
+    'openbsd5':
         """
         test_ascii_formatd
         test_bsddb
diff --git a/lib-python/modified-2.7/test/test_bz2.py b/lib-python/modified-2.7/test/test_bz2.py
--- a/lib-python/modified-2.7/test/test_bz2.py
+++ b/lib-python/modified-2.7/test/test_bz2.py
@@ -50,6 +50,7 @@
         self.filename = TESTFN
 
     def tearDown(self):
+        test_support.gc_collect()
         if os.path.isfile(self.filename):
             os.unlink(self.filename)
 
diff --git a/lib-python/modified-2.7/test/test_fcntl.py b/lib-python/modified-2.7/test/test_fcntl.py
new file mode 100644
--- /dev/null
+++ b/lib-python/modified-2.7/test/test_fcntl.py
@@ -0,0 +1,108 @@
+"""Test program for the fcntl C module.
+
+OS/2+EMX doesn't support the file locking operations.
+
+"""
+import os
+import struct
+import sys
+import unittest
+from test.test_support import (verbose, TESTFN, unlink, run_unittest,
+    import_module)
+
+# Skip test if no fnctl module.
+fcntl = import_module('fcntl')
+
+
+# TODO - Write tests for flock() and lockf().
+
+def get_lockdata():
+    if sys.platform.startswith('atheos'):
+        start_len = "qq"
+    else:
+        try:
+            os.O_LARGEFILE
+        except AttributeError:
+            start_len = "ll"
+        else:
+            start_len = "qq"
+
+    if sys.platform in ('netbsd1', 'netbsd2', 'netbsd3',
+                        'Darwin1.2', 'darwin',
+                        'freebsd2', 'freebsd3', 'freebsd4', 'freebsd5',
+                        'freebsd6', 'freebsd7', 'freebsd8',
+                        'bsdos2', 'bsdos3', 'bsdos4',
+                        'openbsd', 'openbsd2', 'openbsd3', 'openbsd4', 'openbsd5'):
+        if struct.calcsize('l') == 8:
+            off_t = 'l'
+            pid_t = 'i'
+        else:
+            off_t = 'lxxxx'
+            pid_t = 'l'
+        lockdata = struct.pack(off_t + off_t + pid_t + 'hh', 0, 0, 0,
+                               fcntl.F_WRLCK, 0)
+    elif sys.platform in ['aix3', 'aix4', 'hp-uxB', 'unixware7']:
+        lockdata = struct.pack('hhlllii', fcntl.F_WRLCK, 0, 0, 0, 0, 0, 0)
+    elif sys.platform in ['os2emx']:
+        lockdata = None
+    else:
+        lockdata = struct.pack('hh'+start_len+'hh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+    if lockdata:
+        if verbose:
+            print 'struct.pack: ', repr(lockdata)
+    return lockdata
+
+lockdata = get_lockdata()
+
+
+class TestFcntl(unittest.TestCase):
+
+    def setUp(self):
+        self.f = None
+
+    def tearDown(self):
+        if self.f and not self.f.closed:
+            self.f.close()
+        unlink(TESTFN)
+
+    def test_fcntl_fileno(self):
+        # the example from the library docs
+        self.f = open(TESTFN, 'w')
+        rv = fcntl.fcntl(self.f.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
+        if verbose:
+            print 'Status from fcntl with O_NONBLOCK: ', rv
+        if sys.platform not in ['os2emx']:
+            rv = fcntl.fcntl(self.f.fileno(), fcntl.F_SETLKW, lockdata)
+            if verbose:
+                print 'String from fcntl with F_SETLKW: ', repr(rv)
+        self.f.close()
+
+    def test_fcntl_file_descriptor(self):
+        # again, but pass the file rather than numeric descriptor
+        self.f = open(TESTFN, 'w')
+        rv = fcntl.fcntl(self.f, fcntl.F_SETFL, os.O_NONBLOCK)
+        if sys.platform not in ['os2emx']:
+            rv = fcntl.fcntl(self.f, fcntl.F_SETLKW, lockdata)
+        self.f.close()
+
+    def test_fcntl_64_bit(self):
+        # Issue #1309352: fcntl shouldn't fail when the third arg fits in a
+        # C 'long' but not in a C 'int'.
+        try:
+            cmd = fcntl.F_NOTIFY
+            # This flag is larger than 2**31 in 64-bit builds
+            flags = fcntl.DN_MULTISHOT
+        except AttributeError:
+            self.skipTest("F_NOTIFY or DN_MULTISHOT unavailable")
+        fd = os.open(os.path.dirname(os.path.abspath(TESTFN)), os.O_RDONLY)
+        try:
+            fcntl.fcntl(fd, cmd, flags)
+        finally:
+            os.close(fd)
+
+
+def test_main():
+    run_unittest(TestFcntl)
+
+if __name__ == '__main__':
+    test_main()
diff --git a/lib-python/modified-2.7/test/test_multibytecodec.py b/lib-python/modified-2.7/test/test_multibytecodec.py
--- a/lib-python/modified-2.7/test/test_multibytecodec.py
+++ b/lib-python/modified-2.7/test/test_multibytecodec.py
@@ -148,7 +148,8 @@
 class Test_StreamReader(unittest.TestCase):
     def test_bug1728403(self):
         try:
-            open(TESTFN, 'w').write('\xa1')
+            with open(TESTFN, 'w') as f:
+                f.write('\xa1')
             f = codecs.open(TESTFN, encoding='cp949')
             self.assertRaises(UnicodeDecodeError, f.read, 2)
         finally:
diff --git a/lib-python/modified-2.7/test/test_tempfile.py b/lib-python/modified-2.7/test/test_tempfile.py
--- a/lib-python/modified-2.7/test/test_tempfile.py
+++ b/lib-python/modified-2.7/test/test_tempfile.py
@@ -23,8 +23,8 @@
 
 # TEST_FILES may need to be tweaked for systems depending on the maximum
 # number of files that can be opened at one time (see ulimit -n)
-if sys.platform in ('openbsd3', 'openbsd4'):
-    TEST_FILES = 48
+if sys.platform.startswith("openbsd"):
+    TEST_FILES = 64 # ulimit -n defaults to 128 for normal users
 else:
     TEST_FILES = 100
 
diff --git a/lib_pypy/_ctypes/basics.py b/lib_pypy/_ctypes/basics.py
--- a/lib_pypy/_ctypes/basics.py
+++ b/lib_pypy/_ctypes/basics.py
@@ -54,7 +54,8 @@
     def get_ffi_argtype(self):
         if self._ffiargtype:
             return self._ffiargtype
-        return _shape_to_ffi_type(self._ffiargshape)
+        self._ffiargtype = _shape_to_ffi_type(self._ffiargshape)
+        return self._ffiargtype
 
     def _CData_output(self, resbuffer, base=None, index=-1):
         #assert isinstance(resbuffer, _rawffi.ArrayInstance)
@@ -166,7 +167,8 @@
     return tp._alignmentofinstances()
 
 def byref(cdata):
-    from ctypes import pointer
+    # "pointer" is imported at the end of this module to avoid circular
+    # imports
     return pointer(cdata)
 
 def cdata_from_address(self, address):
@@ -224,5 +226,9 @@
     'Z' : _ffi.types.void_p,
     'X' : _ffi.types.void_p,
     'v' : _ffi.types.sshort,
+    '?' : _ffi.types.ubyte,
     }
 
+
+# used by "byref"
+from _ctypes.pointer import pointer
diff --git a/lib_pypy/_ctypes/function.py b/lib_pypy/_ctypes/function.py
--- a/lib_pypy/_ctypes/function.py
+++ b/lib_pypy/_ctypes/function.py
@@ -78,8 +78,6 @@
     _com_iid = None
     _is_fastpath = False
 
-    __restype_set = False
-
     def _getargtypes(self):
         return self._argtypes_
 
@@ -93,13 +91,15 @@
                     raise TypeError(
                         "item %d in _argtypes_ has no from_param method" % (
                             i + 1,))
-            #
-            if all([hasattr(argtype, '_ffiargshape') for argtype in argtypes]):
-                fastpath_cls = make_fastpath_subclass(self.__class__)
-                fastpath_cls.enable_fastpath_maybe(self)
             self._argtypes_ = list(argtypes)
+            self._check_argtypes_for_fastpath()
     argtypes = property(_getargtypes, _setargtypes)
 
+    def _check_argtypes_for_fastpath(self):
+        if all([hasattr(argtype, '_ffiargshape') for argtype in self._argtypes_]):
+            fastpath_cls = make_fastpath_subclass(self.__class__)
+            fastpath_cls.enable_fastpath_maybe(self)
+
     def _getparamflags(self):
         return self._paramflags
 
@@ -149,7 +149,6 @@
         return self._restype_
 
     def _setrestype(self, restype):
-        self.__restype_set = True
         self._ptr = None
         if restype is int:
             from ctypes import c_int
@@ -219,6 +218,7 @@
                 import ctypes
                 restype = ctypes.c_int
             self._ptr = self._getfuncptr_fromaddress(self._argtypes_, restype)
+            self._check_argtypes_for_fastpath()
             return
 
         
@@ -296,13 +296,12 @@
                     "This function takes %d argument%s (%s given)"
                     % (len(self._argtypes_), plural, len(args)))
 
-            # check that arguments are convertible
-            ## XXX Not as long as ctypes.cast is a callback function with
-            ## py_object arguments...
-            ## self._convert_args(self._argtypes_, args, {})
-
             try:
-                res = self.callable(*args)
+                newargs = self._convert_args_for_callback(argtypes, args)
+            except (UnicodeError, TypeError, ValueError), e:
+                raise ArgumentError(str(e))
+            try:
+                res = self.callable(*newargs)
             except:
                 exc_info = sys.exc_info()
                 traceback.print_tb(exc_info[2], file=sys.stderr)
@@ -316,10 +315,6 @@
             warnings.warn('C function without declared arguments called',
                           RuntimeWarning, stacklevel=2)
             argtypes = []
-            
-        if not self.__restype_set:
-            warnings.warn('C function without declared return type called',
-                          RuntimeWarning, stacklevel=2)
 
         if self._com_index:
             from ctypes import cast, c_void_p, POINTER
@@ -366,7 +361,10 @@
             if self._flags_ & _rawffi.FUNCFLAG_USE_LASTERROR:
                 set_last_error(_rawffi.get_last_error())
         #
-        return self._build_result(self._restype_, result, newargs)
+        try:
+            return self._build_result(self._restype_, result, newargs)
+        finally:
+            funcptr.free_temp_buffers()
 
     def _do_errcheck(self, result, args):
         # The 'errcheck' protocol
@@ -466,6 +464,19 @@
 
         return cobj, cobj._to_ffi_param(), type(cobj)
 
+    def _convert_args_for_callback(self, argtypes, args):
+        assert len(argtypes) == len(args)
+        newargs = []
+        for argtype, arg in zip(argtypes, args):
+            param = argtype.from_param(arg)
+            _type_ = getattr(argtype, '_type_', None)
+            if _type_ == 'P': # special-case for c_void_p
+                param = param._get_buffer_value()
+            elif self._is_primitive(argtype):
+                param = param.value
+            newargs.append(param)
+        return newargs
+
     def _convert_args(self, argtypes, args, kwargs, marker=object()):
         newargs = []
         outargs = []
@@ -556,6 +567,9 @@
                 newargtypes.append(newargtype)
         return keepalives, newargs, newargtypes, outargs
 
+    @staticmethod
+    def _is_primitive(argtype):
+        return argtype.__bases__[0] is _SimpleCData
     
     def _wrap_result(self, restype, result):
         """
@@ -564,7 +578,7 @@
         """
         # hack for performance: if restype is a "simple" primitive type, don't
         # allocate the buffer because it's going to be thrown away immediately
-        if restype.__bases__[0] is _SimpleCData and not restype._is_pointer_like():
+        if self._is_primitive(restype) and not restype._is_pointer_like():
             return result
         #
         shape = restype._ffishape
@@ -680,7 +694,7 @@
             try:
                 result = self._call_funcptr(funcptr, *args)
                 result = self._do_errcheck(result, args)
-            except (TypeError, ArgumentError): # XXX, should be FFITypeError
+            except (TypeError, ArgumentError, UnicodeDecodeError):
                 assert self._slowpath_allowed
                 return CFuncPtr.__call__(self, *args)
             return result
diff --git a/lib_pypy/_ctypes/primitive.py b/lib_pypy/_ctypes/primitive.py
--- a/lib_pypy/_ctypes/primitive.py
+++ b/lib_pypy/_ctypes/primitive.py
@@ -10,6 +10,8 @@
 from _ctypes.builtin import ConvMode
 from _ctypes.array import Array
 from _ctypes.pointer import _Pointer, as_ffi_pointer
+#from _ctypes.function import CFuncPtr # this import is moved at the bottom
+                                       # because else it's circular
 
 class NULL(object):
     pass
@@ -86,7 +88,7 @@
         return res
     if isinstance(value, Array):
         return value
-    if isinstance(value, _Pointer):
+    if isinstance(value, (_Pointer, CFuncPtr)):
         return cls.from_address(value._buffer.buffer)
     if isinstance(value, (int, long)):
         return cls(value)
@@ -338,3 +340,5 @@
 
     def __nonzero__(self):
         return self._buffer[0] not in (0, '\x00')
+
+from _ctypes.function import CFuncPtr
diff --git a/lib_pypy/_ctypes/structure.py b/lib_pypy/_ctypes/structure.py
--- a/lib_pypy/_ctypes/structure.py
+++ b/lib_pypy/_ctypes/structure.py
@@ -14,6 +14,15 @@
             raise TypeError("Expected CData subclass, got %s" % (tp,))
         if isinstance(tp, StructOrUnionMeta):
             tp._make_final()
+        if len(f) == 3:
+            if (not hasattr(tp, '_type_')
+                or not isinstance(tp._type_, str)
+                or tp._type_ not in "iIhHbBlL"):
+                #XXX: are those all types?
+                #     we just dont get the type name
+                #     in the interp levle thrown TypeError
+                #     from rawffi if there are more
+                raise TypeError('bit fields not allowed for type ' + tp.__name__)
 
     all_fields = []
     for cls in reversed(inspect.getmro(superclass)):
@@ -34,34 +43,37 @@
     for i, field in enumerate(all_fields):
         name = field[0]
         value = field[1]
+        is_bitfield = (len(field) == 3)
         fields[name] = Field(name,
                              self._ffistruct.fieldoffset(name),
                              self._ffistruct.fieldsize(name),
-                             value, i)
+                             value, i, is_bitfield)
 
     if anonymous_fields:
         resnames = []
         for i, field in enumerate(all_fields):
             name = field[0]
             value = field[1]
+            is_bitfield = (len(field) == 3)
             startpos = self._ffistruct.fieldoffset(name)
             if name in anonymous_fields:
                 for subname in value._names:
                     resnames.append(subname)
-                    relpos = startpos + value._fieldtypes[subname].offset
-                    subvalue = value._fieldtypes[subname].ctype
+                    subfield = getattr(value, subname)
+                    relpos = startpos + subfield.offset
+                    subvalue = subfield.ctype
                     fields[subname] = Field(subname,
                                             relpos, subvalue._sizeofinstances(),
-                                            subvalue, i)
+                                            subvalue, i, is_bitfield)
             else:
                 resnames.append(name)
         names = resnames
     self._names = names
-    self._fieldtypes = fields
+    self.__dict__.update(fields)
 
 class Field(object):
-    def __init__(self, name, offset, size, ctype, num):
-        for k in ('name', 'offset', 'size', 'ctype', 'num'):
+    def __init__(self, name, offset, size, ctype, num, is_bitfield):
+        for k in ('name', 'offset', 'size', 'ctype', 'num', 'is_bitfield'):
             self.__dict__[k] = locals()[k]
 
     def __setattr__(self, name, value):
@@ -71,6 +83,35 @@
         return "<Field '%s' offset=%d size=%d>" % (self.name, self.offset,
                                                    self.size)
 
+    def __get__(self, obj, cls=None):
+        if obj is None:
+            return self
+        if self.is_bitfield:
+            # bitfield member, use direct access
+            return obj._buffer.__getattr__(self.name)
+        else:
+            fieldtype = self.ctype
+            offset = self.num
+            suba = obj._subarray(fieldtype, self.name)
+            return fieldtype._CData_output(suba, obj, offset)
+
+
+    def __set__(self, obj, value):
+        fieldtype = self.ctype
+        cobj = fieldtype.from_param(value)
+        if ensure_objects(cobj) is not None:
+            key = keepalive_key(self.num)
+            store_reference(obj, key, cobj._objects)
+        arg = cobj._get_buffer_value()
+        if fieldtype._fficompositesize is not None:
+            from ctypes import memmove
+            dest = obj._buffer.fieldaddress(self.name)
+            memmove(dest, arg, fieldtype._fficompositesize)
+        else:
+            obj._buffer.__setattr__(self.name, arg)
+
+
+
 # ________________________________________________________________
 
 def _set_shape(tp, rawfields, is_union=False):
@@ -79,17 +120,12 @@
     tp._ffiargshape = tp._ffishape = (tp._ffistruct, 1)
     tp._fficompositesize = tp._ffistruct.size
 
-def struct_getattr(self, name):
-    if name not in ('_fields_', '_fieldtypes'):
-        if hasattr(self, '_fieldtypes') and name in self._fieldtypes:
-            return self._fieldtypes[name]
-    return _CDataMeta.__getattribute__(self, name)
 
 def struct_setattr(self, name, value):
     if name == '_fields_':
         if self.__dict__.get('_fields_', None) is not None:
             raise AttributeError("_fields_ is final")
-        if self in [v for k, v in value]:
+        if self in [f[1] for f in value]:
             raise AttributeError("Structure or union cannot contain itself")
         names_and_fields(
             self,
@@ -127,14 +163,14 @@
         if '_fields_' not in self.__dict__:
             self._fields_ = []
             self._names = []
-            self._fieldtypes = {}
             _set_shape(self, [], self._is_union)
 
-    __getattr__ = struct_getattr
     __setattr__ = struct_setattr
 
     def from_address(self, address):
         instance = StructOrUnion.__new__(self)
+        if isinstance(address, _rawffi.StructureInstance):
+            address = address.buffer
         instance.__dict__['_buffer'] = self._ffistruct.fromaddress(address)
         return instance
 
@@ -200,40 +236,6 @@
         A = _rawffi.Array(fieldtype._ffishape)
         return A.fromaddress(address, 1)
 
-    def __setattr__(self, name, value):
-        try:
-            field = self._fieldtypes[name]
-        except KeyError:
-            return _CData.__setattr__(self, name, value)
-        fieldtype = field.ctype
-        cobj = fieldtype.from_param(value)
-        if ensure_objects(cobj) is not None:
-            key = keepalive_key(field.num)
-            store_reference(self, key, cobj._objects)
-        arg = cobj._get_buffer_value()
-        if fieldtype._fficompositesize is not None:
-            from ctypes import memmove
-            dest = self._buffer.fieldaddress(name)
-            memmove(dest, arg, fieldtype._fficompositesize)
-        else:
-            self._buffer.__setattr__(name, arg)
-
-    def __getattribute__(self, name):
-        if name == '_fieldtypes':
-            return _CData.__getattribute__(self, '_fieldtypes')
-        try:
-            field = self._fieldtypes[name]
-        except KeyError:
-            return _CData.__getattribute__(self, name)
-        if field.size >> 16:
-            # bitfield member, use direct access
-            return self._buffer.__getattr__(name)
-        else:
-            fieldtype = field.ctype
-            offset = field.num
-            suba = self._subarray(fieldtype, name)
-            return fieldtype._CData_output(suba, self, offset)
-
     def _get_buffer_for_param(self):
         return self
 
diff --git a/lib_pypy/_elementtree.py b/lib_pypy/_elementtree.py
new file mode 100644
--- /dev/null
+++ b/lib_pypy/_elementtree.py
@@ -0,0 +1,6 @@
+# Just use ElementTree.
+
+from xml.etree import ElementTree
+
+globals().update(ElementTree.__dict__)
+del __all__
diff --git a/lib_pypy/_pypy_interact.py b/lib_pypy/_pypy_interact.py
--- a/lib_pypy/_pypy_interact.py
+++ b/lib_pypy/_pypy_interact.py
@@ -56,6 +56,10 @@
                 prompt = getattr(sys, 'ps1', '>>> ')
             try:
                 line = raw_input(prompt)
+                # Can be None if sys.stdin was redefined
+                encoding = getattr(sys.stdin, 'encoding', None)
+                if encoding and not isinstance(line, unicode):
+                    line = line.decode(encoding)
             except EOFError:
                 console.write("\n")
                 break
diff --git a/lib_pypy/_sqlite3.py b/lib_pypy/_sqlite3.py
--- a/lib_pypy/_sqlite3.py
+++ b/lib_pypy/_sqlite3.py
@@ -24,6 +24,7 @@
 from ctypes import c_void_p, c_int, c_double, c_int64, c_char_p, cdll
 from ctypes import POINTER, byref, string_at, CFUNCTYPE, cast
 from ctypes import sizeof, c_ssize_t
+from collections import OrderedDict
 import datetime
 import sys
 import time
@@ -274,6 +275,28 @@
 def unicode_text_factory(x):
     return unicode(x, 'utf-8')
 
+
+class StatementCache(object):
+    def __init__(self, connection, maxcount):
+        self.connection = connection
+        self.maxcount = maxcount
+        self.cache = OrderedDict()
+
+    def get(self, sql, cursor, row_factory):
+        try:
+            stat = self.cache[sql]
+        except KeyError:
+            stat = Statement(self.connection, sql)
+            self.cache[sql] = stat
+            if len(self.cache) > self.maxcount:
+                self.cache.popitem(0)
+        #
+        if stat.in_use:
+            stat = Statement(self.connection, sql)
+        stat.set_row_factory(row_factory)
+        return stat
+
+
 class Connection(object):
     def __init__(self, database, timeout=5.0, detect_types=0, isolation_level="",
                  check_same_thread=True, factory=None, cached_statements=100):
@@ -291,6 +314,7 @@
         self.row_factory = None
         self._isolation_level = isolation_level
         self.detect_types = detect_types
+        self.statement_cache = StatementCache(self, cached_statements)
 
         self.cursors = []
 
@@ -399,7 +423,7 @@
         cur = Cursor(self)
         if not isinstance(sql, (str, unicode)):
             raise Warning("SQL is of wrong type. Must be string or unicode.")
-        statement = Statement(cur, sql, self.row_factory)
+        statement = self.statement_cache.get(sql, cur, self.row_factory)
         return statement
 
     def _get_isolation_level(self):
@@ -681,6 +705,8 @@
         from sqlite3.dump import _iterdump
         return _iterdump(self)
 
+DML, DQL, DDL = range(3)
+
 class Cursor(object):
     def __init__(self, con):
         if not isinstance(con, Connection):
@@ -708,12 +734,12 @@
         if type(sql) is unicode:
             sql = sql.encode("utf-8")
         self._check_closed()
-        self.statement = Statement(self, sql, self.row_factory)
+        self.statement = self.connection.statement_cache.get(sql, self, self.row_factory)
 
         if self.connection._isolation_level is not None:
-            if self.statement.kind == "DDL":
+            if self.statement.kind == DDL:
                 self.connection.commit()
-            elif self.statement.kind == "DML":
+            elif self.statement.kind == DML:
                 self.connection._begin()
 
         self.statement.set_params(params)
@@ -724,19 +750,18 @@
             self.statement.reset()
             raise self.connection._get_exception(ret)
 
-        if self.statement.kind == "DQL":
-            if ret == SQLITE_ROW:
-                self.statement._build_row_cast_map()
-                self.statement._readahead()
-            else:
-                self.statement.item = None
-                self.statement.exhausted = True
+        if self.statement.kind == DQL and ret == SQLITE_ROW:
+            self.statement._build_row_cast_map()
+            self.statement._readahead(self)
+        else:
+            self.statement.item = None
+            self.statement.exhausted = True
 
-        if self.statement.kind in ("DML", "DDL"):
+        if self.statement.kind == DML or self.statement.kind == DDL:
             self.statement.reset()
 
         self.rowcount = -1
-        if self.statement.kind == "DML":
+        if self.statement.kind == DML:
             self.rowcount = sqlite.sqlite3_changes(self.connection.db)
 
         return self
@@ -747,8 +772,9 @@
         if type(sql) is unicode:
             sql = sql.encode("utf-8")
         self._check_closed()
-        self.statement = Statement(self, sql, self.row_factory)
-        if self.statement.kind == "DML":
+        self.statement = self.connection.statement_cache.get(sql, self, self.row_factory)
+
+        if self.statement.kind == DML:
             self.connection._begin()
         else:
             raise ProgrammingError, "executemany is only for DML statements"
@@ -800,7 +826,7 @@
         return self
 
     def __iter__(self):
-        return self.statement
+        return iter(self.fetchone, None)
 
     def _check_reset(self):
         if self.reset:
@@ -817,7 +843,7 @@
             return None
 
         try:
-            return self.statement.next()
+            return self.statement.next(self)
         except StopIteration:
             return None
 
@@ -831,7 +857,7 @@
         if size is None:
             size = self.arraysize
         lst = []
-        for row in self.statement:
+        for row in self:
             lst.append(row)
             if len(lst) == size:
                 break
@@ -842,7 +868,7 @@
         self._check_reset()
         if self.statement is None:
             return []
-        return list(self.statement)
+        return list(self)
 
     def _getdescription(self):
         if self._description is None:
@@ -872,39 +898,47 @@
     lastrowid = property(_getlastrowid)
 
 class Statement(object):
-    def __init__(self, cur, sql, row_factory):
+    def __init__(self, connection, sql):
         self.statement = None
         if not isinstance(sql, str):
             raise ValueError, "sql must be a string"
-        self.con = cur.connection
-        self.cur = weakref.ref(cur)
+        self.con = connection
         self.sql = sql # DEBUG ONLY
-        self.row_factory = row_factory
         first_word = self._statement_kind = sql.lstrip().split(" ")[0].upper()
         if first_word in ("INSERT", "UPDATE", "DELETE", "REPLACE"):
-            self.kind = "DML"
+            self.kind = DML
         elif first_word in ("SELECT", "PRAGMA"):
-            self.kind = "DQL"
+            self.kind = DQL
         else:
-            self.kind = "DDL"
+            self.kind = DDL
         self.exhausted = False
+        self.in_use = False
+        #
+        # set by set_row_factory
+        self.row_factory = None
 
         self.statement = c_void_p()
         next_char = c_char_p()
-        ret = sqlite.sqlite3_prepare_v2(self.con.db, sql, -1, byref(self.statement), byref(next_char))
+        sql_char = c_char_p(sql)
+        ret = sqlite.sqlite3_prepare_v2(self.con.db, sql_char, -1, byref(self.statement), byref(next_char))
         if ret == SQLITE_OK and self.statement.value is None:
             # an empty statement, we work around that, as it's the least trouble
             ret = sqlite.sqlite3_prepare_v2(self.con.db, "select 42", -1, byref(self.statement), byref(next_char))
-            self.kind = "DQL"
+            self.kind = DQL
 
         if ret != SQLITE_OK:
             raise self.con._get_exception(ret)
         self.con._remember_statement(self)
         if _check_remaining_sql(next_char.value):
-            raise Warning, "One and only one statement required"
+            raise Warning, "One and only one statement required: %r" % (
+                next_char.value,)
+        # sql_char should remain alive until here
 
         self._build_row_cast_map()
 
+    def set_row_factory(self, row_factory):
+        self.row_factory = row_factory
+
     def _build_row_cast_map(self):
         self.row_cast_map = []
         for i in xrange(sqlite.sqlite3_column_count(self.statement)):
@@ -974,6 +1008,7 @@
         ret = sqlite.sqlite3_reset(self.statement)
         if ret != SQLITE_OK:
             raise self.con._get_exception(ret)
+        self.mark_dirty()
 
         if params is None:
             if sqlite.sqlite3_bind_parameter_count(self.statement) != 0:
@@ -1004,10 +1039,7 @@
                     raise ProgrammingError("missing parameter '%s'" %param)
                 self.set_param(idx, param)
 
-    def __iter__(self):
-        return self
-
-    def next(self):
+    def next(self, cursor):
         self.con._check_closed()
         self.con._check_thread()
         if self.exhausted:
@@ -1023,10 +1055,10 @@
             sqlite.sqlite3_reset(self.statement)
             raise exc
 
-        self._readahead()
+        self._readahead(cursor)
         return item
 
-    def _readahead(self):
+    def _readahead(self, cursor):
         self.column_count = sqlite.sqlite3_column_count(self.statement)
         row = []
         for i in xrange(self.column_count):
@@ -1061,23 +1093,30 @@
 
         row = tuple(row)
         if self.row_factory is not None:
-            row = self.row_factory(self.cur(), row)
+            row = self.row_factory(cursor, row)
         self.item = row
 
     def reset(self):
         self.row_cast_map = None
-        return sqlite.sqlite3_reset(self.statement)
+        ret = sqlite.sqlite3_reset(self.statement)
+        self.in_use = False
+        self.exhausted = False
+        return ret
 
     def finalize(self):
         sqlite.sqlite3_finalize(self.statement)
         self.statement = None
+        self.in_use = False
+
+    def mark_dirty(self):
+        self.in_use = True
 
     def __del__(self):
         sqlite.sqlite3_finalize(self.statement)
         self.statement = None
 
     def _get_description(self):
-        if self.kind == "DML":
+        if self.kind == DML:
             return None
         desc = []
         for i in xrange(sqlite.sqlite3_column_count(self.statement)):
diff --git a/lib_pypy/_subprocess.py b/lib_pypy/_subprocess.py
--- a/lib_pypy/_subprocess.py
+++ b/lib_pypy/_subprocess.py
@@ -35,7 +35,7 @@
 _DuplicateHandle.restype = ctypes.c_int
     
 _WaitForSingleObject = _kernel32.WaitForSingleObject
-_WaitForSingleObject.argtypes = [ctypes.c_int, ctypes.c_int]
+_WaitForSingleObject.argtypes = [ctypes.c_int, ctypes.c_uint]
 _WaitForSingleObject.restype = ctypes.c_int
 
 _GetExitCodeProcess = _kernel32.GetExitCodeProcess
diff --git a/lib_pypy/distributed/test/test_distributed.py b/lib_pypy/distributed/test/test_distributed.py
--- a/lib_pypy/distributed/test/test_distributed.py
+++ b/lib_pypy/distributed/test/test_distributed.py
@@ -9,7 +9,7 @@
 class AppTestDistributed(object):
     def setup_class(cls):
         cls.space = gettestobjspace(**{"objspace.std.withtproxy": True,
-            "usemodules":("_stackless",)})
+            "usemodules":("_continuation",)})
 
     def test_init(self):
         import distributed
@@ -91,10 +91,8 @@
 
 class AppTestDistributedTasklets(object):
     spaceconfig = {"objspace.std.withtproxy": True,
-                   "objspace.usemodules._stackless": True}
+                   "objspace.usemodules._continuation": True}
     def setup_class(cls):
-        #cls.space = gettestobjspace(**{"objspace.std.withtproxy": True,
-        #    "usemodules":("_stackless",)})
         cls.w_test_env = cls.space.appexec([], """():
         from distributed import test_env
         return test_env
diff --git a/lib_pypy/distributed/test/test_greensock.py b/lib_pypy/distributed/test/test_greensock.py
--- a/lib_pypy/distributed/test/test_greensock.py
+++ b/lib_pypy/distributed/test/test_greensock.py
@@ -10,7 +10,7 @@
         if not option.runappdirect:
             py.test.skip("Cannot run this on top of py.py because of PopenGateway")
         cls.space = gettestobjspace(**{"objspace.std.withtproxy": True,
-                                       "usemodules":("_stackless",)})
+                                       "usemodules":("_continuation",)})
         cls.w_remote_side_code = cls.space.appexec([], """():
         import sys
         sys.path.insert(0, '%s')
diff --git a/lib_pypy/distributed/test/test_socklayer.py b/lib_pypy/distributed/test/test_socklayer.py
--- a/lib_pypy/distributed/test/test_socklayer.py
+++ b/lib_pypy/distributed/test/test_socklayer.py
@@ -9,7 +9,8 @@
 class AppTestSocklayer:
     def setup_class(cls):
         cls.space = gettestobjspace(**{"objspace.std.withtproxy": True,
-                                       "usemodules":("_stackless","_socket", "select")})
+                                       "usemodules":("_continuation",
+                                                     "_socket", "select")})
     
     def test_socklayer(self):
         class X(object):
diff --git a/lib_pypy/greenlet.py b/lib_pypy/greenlet.py
--- a/lib_pypy/greenlet.py
+++ b/lib_pypy/greenlet.py
@@ -1,1 +1,151 @@
-from _stackless import greenlet
+import _continuation, sys
+
+
+# ____________________________________________________________
+# Exceptions
+
+class GreenletExit(Exception):
+    """This special exception does not propagate to the parent greenlet; it
+can be used to kill a single greenlet."""
+
+error = _continuation.error
+
+# ____________________________________________________________
+# Helper function
+
+def getcurrent():
+    "Returns the current greenlet (i.e. the one which called this function)."
+    try:
+        return _tls.current
+    except AttributeError:
+        # first call in this thread: current == main
+        _green_create_main()
+        return _tls.current
+
+# ____________________________________________________________
+# The 'greenlet' class
+
+_continulet = _continuation.continulet
+
+class greenlet(_continulet):
+    getcurrent = staticmethod(getcurrent)
+    error = error
+    GreenletExit = GreenletExit
+    __main = False
+    __started = False
+
+    def __new__(cls, *args, **kwds):
+        self = _continulet.__new__(cls)
+        self.parent = getcurrent()
+        return self
+
+    def __init__(self, run=None, parent=None):
+        if run is not None:
+            self.run = run
+        if parent is not None:
+            self.parent = parent
+
+    def switch(self, *args):
+        "Switch execution to this greenlet, optionally passing the values "
+        "given as argument(s).  Returns the value passed when switching back."
+        return self.__switch(_continulet.switch, args)
+
+    def throw(self, typ=GreenletExit, val=None, tb=None):
+        "raise exception in greenlet, return value passed when switching back"
+        return self.__switch(_continulet.throw, typ, val, tb)
+
+    def __switch(target, unbound_method, *args):
+        current = getcurrent()
+        #
+        while not target:
+            if not target.__started:
+                if unbound_method != _continulet.throw:
+                    greenlet_func = _greenlet_start
+                else:
+                    greenlet_func = _greenlet_throw
+                _continulet.__init__(target, greenlet_func, *args)
+                unbound_method = _continulet.switch
+                args = ()
+                target.__started = True
+                break
+            # already done, go to the parent instead
+            # (NB. infinite loop possible, but unlikely, unless you mess
+            # up the 'parent' explicitly.  Good enough, because a Ctrl-C
+            # will show that the program is caught in this loop here.)
+            target = target.parent
+        #
+        try:
+            if current.__main:
+                if target.__main:
+                    # switch from main to main
+                    if unbound_method == _continulet.throw:
+                        raise args[0], args[1], args[2]
+                    (args,) = args
+                else:
+                    # enter from main to target
+                    args = unbound_method(target, *args)
+            else:
+                if target.__main:
+                    # leave to go to target=main
+                    args = unbound_method(current, *args)
+                else:
+                    # switch from non-main to non-main
+                    args = unbound_method(current, *args, to=target)
+        except GreenletExit, e:
+            args = (e,)
+        finally:
+            _tls.current = current
+        #
+        if len(args) == 1:
+            return args[0]
+        else:
+            return args
+
+    def __nonzero__(self):
+        return self.__main or _continulet.is_pending(self)
+
+    @property
+    def dead(self):
+        return self.__started and not self
+
+    @property
+    def gr_frame(self):
+        raise NotImplementedError("attribute 'gr_frame' of greenlet objects")
+
+# ____________________________________________________________
+# Internal stuff
+
+try:
+    from thread import _local
+except ImportError:
+    class _local(object):    # assume no threads
+        pass
+
+_tls = _local()
+
+def _green_create_main():
+    # create the main greenlet for this thread
+    _tls.current = None
+    gmain = greenlet.__new__(greenlet)
+    gmain._greenlet__main = True
+    gmain._greenlet__started = True
+    assert gmain.parent is None
+    _tls.main = gmain
+    _tls.current = gmain
+
+def _greenlet_start(greenlet, args):
+    _tls.current = greenlet
+    try:
+        res = greenlet.run(*args)
+    finally:
+        if greenlet.parent is not _tls.main:
+            _continuation.permute(greenlet, greenlet.parent)
+    return (res,)
+
+def _greenlet_throw(greenlet, exc, value, tb):
+    _tls.current = greenlet
+    try:
+        raise exc, value, tb
+    finally:
+        if greenlet.parent is not _tls.main:
+            _continuation.permute(greenlet, greenlet.parent)
diff --git a/lib_pypy/pypy_test/test_coroutine.py b/lib_pypy/pypy_test/test_coroutine.py
--- a/lib_pypy/pypy_test/test_coroutine.py
+++ b/lib_pypy/pypy_test/test_coroutine.py
@@ -2,7 +2,7 @@
 from py.test import skip, raises
 
 try:
-    from lib_pypy.stackless import coroutine, CoroutineExit
+    from stackless import coroutine, CoroutineExit
 except ImportError, e:
     skip('cannot import stackless: %s' % (e,))
 
@@ -20,10 +20,6 @@
         assert not co.is_zombie
 
     def test_is_zombie_del_without_frame(self):
-        try:
-            import _stackless # are we on pypy with a stackless build?
-        except ImportError:
-            skip("only works on pypy-c-stackless")
         import gc
         res = []
         class MyCoroutine(coroutine):
@@ -45,10 +41,6 @@
         assert res[0], "is_zombie was False in __del__"
 
     def test_is_zombie_del_with_frame(self):
-        try:
-            import _stackless # are we on pypy with a stackless build?
-        except ImportError:
-            skip("only works on pypy-c-stackless")
         import gc
         res = []
         class MyCoroutine(coroutine):
diff --git a/lib_pypy/pyrepl/reader.py b/lib_pypy/pyrepl/reader.py
--- a/lib_pypy/pyrepl/reader.py
+++ b/lib_pypy/pyrepl/reader.py
@@ -401,13 +401,19 @@
             return "(arg: %s) "%self.arg
         if "\n" in self.buffer:
             if lineno == 0:
-                return self._ps2
+                res = self.ps2
             elif lineno == self.buffer.count("\n"):
-                return self._ps4
+                res = self.ps4
             else:
-                return self._ps3
+                res = self.ps3
         else:
-            return self._ps1
+            res = self.ps1
+        # Lazily call str() on self.psN, and cache the results using as key
+        # the object on which str() was called.  This ensures that even if the
+        # same object is used e.g. for ps1 and ps2, str() is called only once.
+        if res not in self._pscache:
+            self._pscache[res] = str(res)
+        return self._pscache[res]
 
     def push_input_trans(self, itrans):
         self.input_trans_stack.append(self.input_trans)
@@ -473,8 +479,7 @@
             self.pos = 0
             self.dirty = 1
             self.last_command = None
-            self._ps1, self._ps2, self._ps3, self._ps4 = \
-                           map(str, [self.ps1, self.ps2, self.ps3, self.ps4])
+            self._pscache = {}
         except:
             self.restore()
             raise
@@ -571,7 +576,7 @@
         self.console.push_char(char)
         self.handle1(0)
     
-    def readline(self):
+    def readline(self, returns_unicode=False):
         """Read a line.  The implementation of this method also shows
         how to drive Reader if you want more control over the event
         loop."""
@@ -580,6 +585,8 @@
             self.refresh()
             while not self.finished:
                 self.handle1()
+            if returns_unicode:
+                return self.get_unicode()
             return self.get_buffer()
         finally:
             self.restore()
diff --git a/lib_pypy/pyrepl/readline.py b/lib_pypy/pyrepl/readline.py
--- a/lib_pypy/pyrepl/readline.py
+++ b/lib_pypy/pyrepl/readline.py
@@ -33,7 +33,7 @@
 from pyrepl.unix_console import UnixConsole, _error
 
 
-ENCODING = 'latin1'     # XXX hard-coded
+ENCODING = sys.getfilesystemencoding() or 'latin1'     # XXX review
 
 __all__ = ['add_history',
            'clear_history',
@@ -198,7 +198,7 @@
         reader.ps1 = prompt
         return reader.readline()
 
-    def multiline_input(self, more_lines, ps1, ps2):
+    def multiline_input(self, more_lines, ps1, ps2, returns_unicode=False):
         """Read an input on possibly multiple lines, asking for more
         lines as long as 'more_lines(unicodetext)' returns an object whose
         boolean value is true.
@@ -209,7 +209,7 @@
             reader.more_lines = more_lines
             reader.ps1 = reader.ps2 = ps1
             reader.ps3 = reader.ps4 = ps2
-            return reader.readline()
+            return reader.readline(returns_unicode=returns_unicode)
         finally:
             reader.more_lines = saved
 
diff --git a/lib_pypy/pyrepl/simple_interact.py b/lib_pypy/pyrepl/simple_interact.py
--- a/lib_pypy/pyrepl/simple_interact.py
+++ b/lib_pypy/pyrepl/simple_interact.py
@@ -54,7 +54,8 @@
             ps1 = getattr(sys, 'ps1', '>>> ')
             ps2 = getattr(sys, 'ps2', '... ')
             try:
-                statement = multiline_input(more_lines, ps1, ps2)
+                statement = multiline_input(more_lines, ps1, ps2,
+                                            returns_unicode=True)
             except EOFError:
                 break
             more = console.push(statement)
diff --git a/lib_pypy/stackless.py b/lib_pypy/stackless.py
--- a/lib_pypy/stackless.py
+++ b/lib_pypy/stackless.py
@@ -4,121 +4,124 @@
 Please refer to their documentation.
 """
 
-DEBUG = True
-
-def dprint(*args):
-    for arg in args:
-        print arg,
-    print
 
 import traceback
-import sys
+import _continuation
+from functools import partial
+
+class TaskletExit(Exception):
+    pass
+
+CoroutineExit = TaskletExit
+
+class GWrap(_continuation.continulet):
+    """This is just a wrapper around continulet to allow
+       to stick additional attributes to a continulet.
+       To be more concrete, we need a backreference to
+       the coroutine object"""
+
+
+class coroutine(object):
+    "we can't have continulet as a base, because continulets can't be rebound"
+
+    def __init__(self):
+        self._frame = None
+        self.is_zombie = False
+
+    def __getattr__(self, attr):
+        return getattr(self._frame, attr)
+
+    def __del__(self):
+        self.is_zombie = True
+        del self._frame
+        self._frame = None
+
+    def bind(self, func, *argl, **argd):
+        """coro.bind(f, *argl, **argd) -> None.
+           binds function f to coro. f will be called with
+           arguments *argl, **argd
+        """
+        if self._frame is None or not self._frame.is_pending():
+
+            def _func(c, *args, **kwargs):
+                return func(*args, **kwargs)
+            
+            run = partial(_func, *argl, **argd)
+            self._frame = frame = GWrap(run)
+        else:
+            raise ValueError("cannot bind a bound coroutine")
+
+    def switch(self):
+        """coro.switch() -> returnvalue
+           switches to coroutine coro. If the bound function
+           f finishes, the returnvalue is that of f, otherwise
+           None is returned
+        """
+        current = _getcurrent()
+        current._jump_to(self)
+
+    def _jump_to(self, coroutine):
+        _tls.current_coroutine = coroutine
+        self._frame.switch(to=coroutine._frame)
+
+    def kill(self):
+        """coro.kill() : kill coroutine coro"""
+        _tls.current_coroutine = self
+        self._frame.throw(CoroutineExit)
+
+    def _is_alive(self):
+        if self._frame is None:
+            return False
+        return not self._frame.is_pending()
+    is_alive = property(_is_alive)
+    del _is_alive
+
+    def getcurrent():
+        """coroutine.getcurrent() -> the currently running coroutine"""
+        try:
+            return _getcurrent()
+        except AttributeError:
+            return _maincoro
+    getcurrent = staticmethod(getcurrent)
+
+    def __reduce__(self):
+        raise TypeError, 'pickling is not possible based upon continulets'
+
+
+def _getcurrent():
+    "Returns the current coroutine (i.e. the one which called this function)."
+    try:
+        return _tls.current_coroutine
+    except AttributeError:
+        # first call in this thread: current == main
+        _coroutine_create_main()
+        return _tls.current_coroutine
+
 try:
-    # If _stackless can be imported then TaskletExit and CoroutineExit are 
-    # automatically added to the builtins.
-    from _stackless import coroutine, greenlet
-except ImportError: # we are running from CPython
-    from greenlet import greenlet, GreenletExit
-    TaskletExit = CoroutineExit = GreenletExit
-    del GreenletExit
-    try:
-        from functools import partial
-    except ImportError: # we are not running python 2.5
-        class partial(object):
-            # just enough of 'partial' to be usefull
-            def __init__(self, func, *argl, **argd):
-                self.func = func
-                self.argl = argl
-                self.argd = argd
+    from thread import _local
+except ImportError:
+    class _local(object):    # assume no threads
+        pass
 
-            def __call__(self):
-                return self.func(*self.argl, **self.argd)
+_tls = _local()
 
-    class GWrap(greenlet):
-        """This is just a wrapper around greenlets to allow
-           to stick additional attributes to a greenlet.
-           To be more concrete, we need a backreference to
-           the coroutine object"""
+def _coroutine_create_main():
+    # create the main coroutine for this thread
+    _tls.current_coroutine = None
+    main_coroutine = coroutine()
+    main_coroutine.bind(lambda x:x)
+    _tls.main_coroutine = main_coroutine
+    _tls.current_coroutine = main_coroutine
+    return main_coroutine
 
-    class MWrap(object):
-        def __init__(self,something):
-            self.something = something
 
-        def __getattr__(self, attr):
-            return getattr(self.something, attr)
+_maincoro = _coroutine_create_main()
 
-    class coroutine(object):
-        "we can't have greenlet as a base, because greenlets can't be rebound"
-
-        def __init__(self):
-            self._frame = None
-            self.is_zombie = False
-
-        def __getattr__(self, attr):
-            return getattr(self._frame, attr)
-
-        def __del__(self):
-            self.is_zombie = True
-            del self._frame
-            self._frame = None
-
-        def bind(self, func, *argl, **argd):
-            """coro.bind(f, *argl, **argd) -> None.
-               binds function f to coro. f will be called with
-               arguments *argl, **argd
-            """
-            if self._frame is None or self._frame.dead:
-                self._frame = frame = GWrap()
-                frame.coro = self
-            if hasattr(self._frame, 'run') and self._frame.run:
-                raise ValueError("cannot bind a bound coroutine")
-            self._frame.run = partial(func, *argl, **argd)
-
-        def switch(self):
-            """coro.switch() -> returnvalue
-               switches to coroutine coro. If the bound function
-               f finishes, the returnvalue is that of f, otherwise
-               None is returned
-            """
-            try:
-                return greenlet.switch(self._frame)
-            except TypeError, exp: # self._frame is the main coroutine
-                return greenlet.switch(self._frame.something)
-
-        def kill(self):
-            """coro.kill() : kill coroutine coro"""
-            self._frame.throw()
-
-        def _is_alive(self):
-            if self._frame is None:
-                return False
-            return not self._frame.dead
-        is_alive = property(_is_alive)
-        del _is_alive
-
-        def getcurrent():
-            """coroutine.getcurrent() -> the currently running coroutine"""
-            try:
-                return greenlet.getcurrent().coro
-            except AttributeError:
-                return _maincoro
-        getcurrent = staticmethod(getcurrent)
-
-        def __reduce__(self):
-            raise TypeError, 'pickling is not possible based upon greenlets'
-
-    _maincoro = coroutine()
-    maingreenlet = greenlet.getcurrent()
-    _maincoro._frame = frame = MWrap(maingreenlet)
-    frame.coro = _maincoro
-    del frame
-    del maingreenlet
 
 from collections import deque
 
 import operator
-__all__ = 'run getcurrent getmain schedule tasklet channel coroutine \
-                greenlet'.split()
+__all__ = 'run getcurrent getmain schedule tasklet channel coroutine'.split()
 
 _global_task_id = 0
 _squeue = None
@@ -131,7 +134,8 @@
 def _scheduler_remove(value):
     try:
         del _squeue[operator.indexOf(_squeue, value)]
-    except ValueError:pass
+    except ValueError:
+        pass
 
 def _scheduler_append(value, normal=True):
     if normal:
diff --git a/pypy/annotation/builtin.py b/pypy/annotation/builtin.py
--- a/pypy/annotation/builtin.py
+++ b/pypy/annotation/builtin.py
@@ -308,9 +308,6 @@
             clsdef = clsdef.commonbase(cdef)
     return SomeInstance(clsdef)
 
-def robjmodel_we_are_translated():
-    return immutablevalue(True)
-
 def robjmodel_r_dict(s_eqfn, s_hashfn, s_force_non_null=None):
     if s_force_non_null is None:
         force_non_null = False
@@ -376,8 +373,6 @@
 
 BUILTIN_ANALYZERS[pypy.rlib.rarithmetic.intmask] = rarith_intmask
 BUILTIN_ANALYZERS[pypy.rlib.objectmodel.instantiate] = robjmodel_instantiate
-BUILTIN_ANALYZERS[pypy.rlib.objectmodel.we_are_translated] = (
-    robjmodel_we_are_translated)
 BUILTIN_ANALYZERS[pypy.rlib.objectmodel.r_dict] = robjmodel_r_dict
 BUILTIN_ANALYZERS[pypy.rlib.objectmodel.hlinvoke] = robjmodel_hlinvoke
 BUILTIN_ANALYZERS[pypy.rlib.objectmodel.keepalive_until_here] = robjmodel_keepalive_until_here
@@ -416,7 +411,8 @@
 from pypy.annotation.model import SomePtr
 from pypy.rpython.lltypesystem import lltype
 
-def malloc(s_T, s_n=None, s_flavor=None, s_zero=None, s_track_allocation=None):
+def malloc(s_T, s_n=None, s_flavor=None, s_zero=None, s_track_allocation=None,
+           s_add_memory_pressure=None):
     assert (s_n is None or s_n.knowntype == int
             or issubclass(s_n.knowntype, pypy.rlib.rarithmetic.base_int))
     assert s_T.is_constant()
@@ -432,6 +428,8 @@
     else:
         assert s_flavor.is_constant()
         assert s_track_allocation is None or s_track_allocation.is_constant()
+        assert (s_add_memory_pressure is None or
+                s_add_memory_pressure.is_constant())
         # not sure how to call malloc() for the example 'p' in the
         # presence of s_extraargs
         r = SomePtr(lltype.Ptr(s_T.const))
diff --git a/pypy/annotation/description.py b/pypy/annotation/description.py
--- a/pypy/annotation/description.py
+++ b/pypy/annotation/description.py
@@ -399,9 +399,7 @@
                 if b1 is object:
                     continue
                 if b1.__dict__.get('_mixin_', False):
-                    assert b1.__bases__ == () or b1.__bases__ == (object,), (
-                        "mixin class %r should have no base" % (b1,))
-                    self.add_sources_for_class(b1, mixin=True)
+                    self.add_mixin(b1)
                 else:
                     assert base is object, ("multiple inheritance only supported "
                                             "with _mixin_: %r" % (cls,))
@@ -469,6 +467,15 @@
                 return
         self.classdict[name] = Constant(value)
 
+    def add_mixin(self, base):
+        for subbase in base.__bases__:
+            if subbase is object:
+                continue
+            assert subbase.__dict__.get("_mixin_", False), ("Mixin class %r has non"
+                "mixin base class %r" % (base, subbase))
+            self.add_mixin(subbase)
+        self.add_sources_for_class(base, mixin=True)
+
     def add_sources_for_class(self, cls, mixin=False):
         for name, value in cls.__dict__.items():
             self.add_source_attribute(name, value, mixin)
diff --git a/pypy/config/makerestdoc.py b/pypy/config/makerestdoc.py
--- a/pypy/config/makerestdoc.py
+++ b/pypy/config/makerestdoc.py
@@ -134,7 +134,7 @@
         for child in self._children:
             subpath = fullpath + "." + child._name
             toctree.append(subpath)
-        content.add(Directive("toctree", *toctree, maxdepth=4))
+        content.add(Directive("toctree", *toctree, **{'maxdepth': 4}))
         content.join(
             ListItem(Strong("name:"), self._name),
             ListItem(Strong("description:"), self.doc))
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -27,13 +27,14 @@
 # --allworkingmodules
 working_modules = default_modules.copy()
 working_modules.update(dict.fromkeys(
-    ["_socket", "unicodedata", "mmap", "fcntl", "_locale",
+    ["_socket", "unicodedata", "mmap", "fcntl", "_locale", "pwd",
      "rctime" , "select", "zipimport", "_lsprof",
      "crypt", "signal", "_rawffi", "termios", "zlib", "bz2",
      "struct", "_hashlib", "_md5", "_sha", "_minimal_curses", "cStringIO",
      "thread", "itertools", "pyexpat", "_ssl", "cpyext", "array",
      "_bisect", "binascii", "_multiprocessing", '_warnings',
-     "_collections", "_multibytecodec", "micronumpy", "_ffi"]
+     "_collections", "_multibytecodec", "micronumpy", "_ffi",
+     "_continuation"]
 ))
 
 translation_modules = default_modules.copy()
@@ -57,6 +58,7 @@
     # unix only modules
     del working_modules["crypt"]
     del working_modules["fcntl"]
+    del working_modules["pwd"]
     del working_modules["termios"]
     del working_modules["_minimal_curses"]
 
@@ -99,6 +101,7 @@
     "_ssl"      : ["pypy.module._ssl.interp_ssl"],
     "_hashlib"  : ["pypy.module._ssl.interp_ssl"],
     "_minimal_curses": ["pypy.module._minimal_curses.fficurses"],
+    "_continuation": ["pypy.rlib.rstacklet"],
     }
 
 def get_module_validator(modname):
diff --git a/pypy/config/test/test_config.py b/pypy/config/test/test_config.py
--- a/pypy/config/test/test_config.py
+++ b/pypy/config/test/test_config.py
@@ -1,5 +1,5 @@
 from pypy.config.config import *
-import py
+import py, sys
 
 def make_description():
     gcoption = ChoiceOption('name', 'GC name', ['ref', 'framework'], 'ref')
@@ -69,13 +69,15 @@
     attrs = dir(config)
     assert '__repr__' in attrs        # from the type
     assert '_cfgimpl_values' in attrs # from self
-    assert 'gc' in attrs              # custom attribute
-    assert 'objspace' in attrs        # custom attribute
+    if sys.version_info >= (2, 6):
+        assert 'gc' in attrs              # custom attribute
+        assert 'objspace' in attrs        # custom attribute
     #
     attrs = dir(config.gc)
-    assert 'name' in attrs
-    assert 'dummy' in attrs
-    assert 'float' in attrs
+    if sys.version_info >= (2, 6):
+        assert 'name' in attrs
+        assert 'dummy' in attrs
+        assert 'float' in attrs
 
 def test_arbitrary_option():
     descr = OptionDescription("top", "", [
@@ -279,11 +281,11 @@
 
 def test_underscore_in_option_name():
     descr = OptionDescription("opt", "", [
-        BoolOption("_stackless", "", default=False),
+        BoolOption("_foobar", "", default=False),
     ])
     config = Config(descr)
     parser = to_optparse(config)
-    assert parser.has_option("--_stackless")
+    assert parser.has_option("--_foobar")
 
 def test_none():
     dummy1 = BoolOption('dummy1', 'doc dummy', default=False, cmdline=None)
diff --git a/pypy/config/translationoption.py b/pypy/config/translationoption.py
--- a/pypy/config/translationoption.py
+++ b/pypy/config/translationoption.py
@@ -13,6 +13,10 @@
 DEFL_LOW_INLINE_THRESHOLD = DEFL_INLINE_THRESHOLD / 2.0
 
 DEFL_GC = "minimark"
+if sys.platform.startswith("linux"):
+    DEFL_ROOTFINDER_WITHJIT = "asmgcc"
+else:
+    DEFL_ROOTFINDER_WITHJIT = "shadowstack"
 
 IS_64_BITS = sys.maxint > 2147483647
 
@@ -24,10 +28,9 @@
 
 translation_optiondescription = OptionDescription(
         "translation", "Translation Options", [
-    BoolOption("stackless", "enable stackless features during compilation",
-               default=False, cmdline="--stackless",
-               requires=[("translation.type_system", "lltype"),
-                         ("translation.gcremovetypeptr", False)]),  # XXX?
+    BoolOption("continuation", "enable single-shot continuations",
+               default=False, cmdline="--continuation",
+               requires=[("translation.type_system", "lltype")]),
     ChoiceOption("type_system", "Type system to use when RTyping",
                  ["lltype", "ootype"], cmdline=None, default="lltype",
                  requires={
@@ -66,7 +69,8 @@
                      "statistics": [("translation.gctransformer", "framework")],
                      "generation": [("translation.gctransformer", "framework")],
                      "hybrid": [("translation.gctransformer", "framework")],
-                     "boehm": [("translation.gctransformer", "boehm")],
+                     "boehm": [("translation.gctransformer", "boehm"),
+                               ("translation.continuation", False)],  # breaks
                      "markcompact": [("translation.gctransformer", "framework")],
                      "minimark": [("translation.gctransformer", "framework")],
                      },
@@ -109,7 +113,7 @@
     BoolOption("jit", "generate a JIT",
                default=False,
                suggests=[("translation.gc", DEFL_GC),
-                         ("translation.gcrootfinder", "asmgcc"),
+                         ("translation.gcrootfinder", DEFL_ROOTFINDER_WITHJIT),
                          ("translation.list_comprehension_operations", True)]),
     ChoiceOption("jit_backend", "choose the backend for the JIT",
                  ["auto", "x86", "x86-without-sse2", "llvm"],
@@ -385,8 +389,6 @@
             config.translation.suggest(withsmallfuncsets=5)
         elif word == 'jit':
             config.translation.suggest(jit=True)
-            if config.translation.stackless:
-                raise NotImplementedError("JIT conflicts with stackless for now")
         elif word == 'removetypeptr':
             config.translation.suggest(gcremovetypeptr=True)
         else:
diff --git a/pypy/doc/_ref.txt b/pypy/doc/_ref.txt
--- a/pypy/doc/_ref.txt
+++ b/pypy/doc/_ref.txt
@@ -1,11 +1,10 @@
 .. _`ctypes_configure/doc/sample.py`: https://bitbucket.org/pypy/pypy/src/default/ctypes_configure/doc/sample.py
 .. _`demo/`: https://bitbucket.org/pypy/pypy/src/default/demo/
-.. _`demo/pickle_coroutine.py`: https://bitbucket.org/pypy/pypy/src/default/demo/pickle_coroutine.py
 .. _`lib-python/`: https://bitbucket.org/pypy/pypy/src/default/lib-python/
 .. _`lib-python/2.7/dis.py`: https://bitbucket.org/pypy/pypy/src/default/lib-python/2.7/dis.py
 .. _`lib_pypy/`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/
+.. _`lib_pypy/greenlet.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/greenlet.py
 .. _`lib_pypy/pypy_test/`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/pypy_test/
-.. _`lib_pypy/stackless.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/stackless.py
 .. _`lib_pypy/tputil.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/tputil.py
 .. _`pypy/annotation`:
 .. _`pypy/annotation/`: https://bitbucket.org/pypy/pypy/src/default/pypy/annotation/
@@ -55,7 +54,6 @@
 .. _`pypy/module`:
 .. _`pypy/module/`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/
 .. _`pypy/module/__builtin__/__init__.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/__builtin__/__init__.py
-.. _`pypy/module/_stackless/test/test_composable_coroutine.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/_stackless/test/test_composable_coroutine.py
 .. _`pypy/objspace`:
 .. _`pypy/objspace/`: https://bitbucket.org/pypy/pypy/src/default/pypy/objspace/
 .. _`pypy/objspace/dump.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/objspace/dump.py
@@ -117,6 +115,7 @@
 .. _`pypy/translator/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/
 .. _`pypy/translator/backendopt/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/backendopt/
 .. _`pypy/translator/c/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/c/
+.. _`pypy/translator/c/src/stacklet/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/c/src/stacklet/
 .. _`pypy/translator/cli/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/cli/
 .. _`pypy/translator/goal/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/goal/
 .. _`pypy/translator/jvm/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/jvm/
diff --git a/pypy/doc/architecture.rst b/pypy/doc/architecture.rst
--- a/pypy/doc/architecture.rst
+++ b/pypy/doc/architecture.rst
@@ -153,7 +153,7 @@
 
 * Optionally, `various transformations`_ can then be applied which, for
   example, perform optimizations such as inlining, add capabilities
-  such as stackless_-style concurrency, or insert code for the
+  such as stackless-style concurrency (deprecated), or insert code for the
   `garbage collector`_.
 
 * Then, the graphs are converted to source code for the target platform
@@ -255,7 +255,6 @@
 
 .. _Python: http://docs.python.org/reference/
 .. _Psyco: http://psyco.sourceforge.net
-.. _stackless: stackless.html
 .. _`generate Just-In-Time Compilers`: jit/index.html
 .. _`JIT Generation in PyPy`: jit/index.html
 .. _`implement your own interpreter`: http://morepypy.blogspot.com/2011/04/tutorial-writing-interpreter-with-pypy.html
diff --git a/pypy/doc/conf.py b/pypy/doc/conf.py
--- a/pypy/doc/conf.py
+++ b/pypy/doc/conf.py
@@ -45,9 +45,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '1.5'
+version = '1.6'
 # The full version, including alpha/beta/rc tags.
-release = '1.5'
+release = '1.6'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/pypy/doc/config/objspace.usemodules._stackless.txt b/pypy/doc/config/objspace.usemodules._continuation.txt
rename from pypy/doc/config/objspace.usemodules._stackless.txt
rename to pypy/doc/config/objspace.usemodules._continuation.txt
--- a/pypy/doc/config/objspace.usemodules._stackless.txt
+++ b/pypy/doc/config/objspace.usemodules._continuation.txt
@@ -1,6 +1,4 @@
-Use the '_stackless' module. 
+Use the '_continuation' module. 
 
-Exposes the `stackless` primitives, and also implies a stackless build. 
-See also :config:`translation.stackless`.
-
-.. _`stackless`: ../stackless.html
+Exposes the `continulet` app-level primitives.
+See also :config:`translation.continuation`.
diff --git a/pypy/doc/config/objspace.usemodules.pwd.txt b/pypy/doc/config/objspace.usemodules.pwd.txt
new file mode 100644
--- /dev/null
+++ b/pypy/doc/config/objspace.usemodules.pwd.txt
@@ -0,0 +1,2 @@
+Use the 'pwd' module. 
+This module is expected to be fully working.
diff --git a/pypy/doc/config/translation.stackless.txt b/pypy/doc/config/translation.continuation.txt
rename from pypy/doc/config/translation.stackless.txt
rename to pypy/doc/config/translation.continuation.txt
--- a/pypy/doc/config/translation.stackless.txt
+++ b/pypy/doc/config/translation.continuation.txt
@@ -1,5 +1,2 @@
-Run the `stackless transform`_ on each generated graph, which enables the use
-of coroutines at RPython level and the "stackless" module when translating
-PyPy.
-
-.. _`stackless transform`: ../stackless.html
+Enable the use of a stackless-like primitive called "stacklet".
+In PyPy, this is exposed at app-level by the "_continuation" module.
diff --git a/pypy/doc/contributor.rst b/pypy/doc/contributor.rst
--- a/pypy/doc/contributor.rst
+++ b/pypy/doc/contributor.rst
@@ -9,22 +9,22 @@
     Armin Rigo
     Maciej Fijalkowski
     Carl Friedrich Bolz
+    Antonio Cuni
     Amaury Forgeot d'Arc
-    Antonio Cuni
     Samuele Pedroni
     Michael Hudson
     Holger Krekel
+    Benjamin Peterson
     Christian Tismer
-    Benjamin Peterson
+    Hakan Ardo
+    Alex Gaynor
     Eric van Riet Paap
-    Anders Chrigstr&#246;m
-    H&#229;kan Ard&#246;
+    Anders Chrigstrom
+    David Schneider
     Richard Emslie
     Dan Villiom Podlaski Christiansen
     Alexander Schremmer
-    Alex Gaynor
-    David Schneider
-    Aureli&#233;n Campeas
+    Aurelien Campeas
     Anders Lehmann
     Camillo Bruni
     Niklaus Haldimann
@@ -35,16 +35,17 @@
     Bartosz Skowron
     Jakub Gustak
     Guido Wesdorp
+    Daniel Roberts
     Adrien Di Mascio
     Laura Creighton
     Ludovic Aubry
     Niko Matsakis
-    Daniel Roberts
     Jason Creighton
-    Jacob Hall&#233;n
+    Jacob Hallen
     Alex Martelli
     Anders Hammarquist
     Jan de Mooij
+    Wim Lavrijsen
     Stephan Diehl
     Michael Foord
     Stefan Schwarzer
@@ -55,9 +56,13 @@
     Alexandre Fayolle
     Marius Gedminas
     Simon Burton
+    Justin Peel
     Jean-Paul Calderone
     John Witulski
+    Lukas Diekmann
+    holger krekel
     Wim Lavrijsen
+    Dario Bertini
     Andreas St&#252;hrk
     Jean-Philippe St. Pierre
     Guido van Rossum
@@ -69,15 +74,16 @@
     Georg Brandl
     Gerald Klix
     Wanja Saatkamp
+    Ronny Pfannschmidt
     Boris Feigin
     Oscar Nierstrasz
-    Dario Bertini
     David Malcolm
     Eugene Oden
     Henry Mason
+    Sven Hager
     Lukas Renggli
+    Ilya Osadchiy
     Guenter Jantzen
-    Ronny Pfannschmidt
     Bert Freudenberg
     Amit Regmi
     Ben Young
@@ -94,8 +100,8 @@
     Jared Grubb
     Karl Bartel
     Gabriel Lavoie
+    Victor Stinner
     Brian Dorsey
-    Victor Stinner
     Stuart Williams
     Toby Watson
     Antoine Pitrou
@@ -106,19 +112,23 @@
     Jonathan David Riehl
     Elmo M&#228;ntynen
     Anders Qvist
-    Beatrice D&#252;ring
+    Beatrice During
     Alexander Sedov
+    Timo Paulssen
+    Corbin Simpson
     Vincent Legoll
+    Romain Guillebert
     Alan McIntyre
-    Romain Guillebert
     Alex Perry
     Jens-Uwe Mager
+    Simon Cross
     Dan Stromberg
-    Lukas Diekmann
+    Guillebert Romain
     Carl Meyer
     Pieter Zieschang
     Alejandro J. Cura
     Sylvain Thenault
+    Christoph Gerum
     Travis Francis Athougies
     Henrik Vendelbo
     Lutz Paelike
@@ -129,6 +139,7 @@
     Miguel de Val Borro
     Ignas Mikalajunas
     Artur Lisiecki
+    Philip Jenvey
     Joshua Gilbert
     Godefroid Chappelle
     Yusei Tahara
@@ -137,24 +148,29 @@
     Gustavo Niemeyer
     William Leslie
     Akira Li
-    Kristj&#225;n Valur J&#243;nsson
+    Kristjan Valur Jonsson
     Bobby Impollonia
+    Michael Hudson-Doyle
     Andrew Thompson
     Anders Sigfridsson
+    Floris Bruynooghe
     Jacek Generowicz
     Dan Colish
-    Sven Hager
     Zooko Wilcox-O Hearn
+    Dan Villiom Podlaski Christiansen
     Anders Hammarquist
+    Chris Lambacher
     Dinu Gherman
     Dan Colish
+    Brett Cannon
     Daniel Neuh&#228;user
     Michael Chermside
     Konrad Delong
     Anna Ravencroft
     Greg Price
     Armin Ronacher
+    Christian Muirhead
     Jim Baker
-    Philip Jenvey
     Rodrigo Ara&#250;jo
+    Romain Guillebert
 
diff --git a/pypy/doc/cpython_differences.rst b/pypy/doc/cpython_differences.rst
--- a/pypy/doc/cpython_differences.rst
+++ b/pypy/doc/cpython_differences.rst
@@ -24,6 +24,7 @@
     _bisect
     _codecs
     _collections
+    `_continuation`_
     `_ffi`_
     _hashlib
     _io
@@ -84,9 +85,12 @@
 
     _winreg
 
-  Extra module with Stackless_ only:
-
-    _stackless
+  Note that only some of these modules are built-in in a typical
+  CPython installation, and the rest is from non built-in extension
+  modules.  This means that e.g. ``import parser`` will, on CPython,
+  find a local file ``parser.py``, while ``import sys`` will not find a
+  local file ``sys.py``.  In PyPy the difference does not exist: all
+  these modules are built-in.
 
 * Supported by being rewritten in pure Python (possibly using ``ctypes``):
   see the `lib_pypy/`_ directory.  Examples of modules that we
@@ -101,11 +105,11 @@
 
 .. the nonstandard modules are listed below...
 .. _`__pypy__`: __pypy__-module.html
+.. _`_continuation`: stackless.html
 .. _`_ffi`: ctypes-implementation.html
 .. _`_rawffi`: ctypes-implementation.html
 .. _`_minimal_curses`: config/objspace.usemodules._minimal_curses.html
 .. _`cpyext`: http://morepypy.blogspot.com/2010/04/using-cpython-extension-modules-with.html
-.. _Stackless: stackless.html
 
 
 Differences related to garbage collection strategies
@@ -280,7 +284,14 @@
   never a dictionary as it sometimes is in CPython. Assigning to
   ``__builtins__`` has no effect.
 
-* object identity of immutable keys in dictionaries is not necessarily preserved.
-  Never compare immutable objects with ``is``.
+* Do not compare immutable objects with ``is``.  For example on CPython
+  it is true that ``x is 0`` works, i.e. does the same as ``type(x) is
+  int and x == 0``, but it is so by accident.  If you do instead
+  ``x is 1000``, then it stops working, because 1000 is too large and
+  doesn't come from the internal cache.  In PyPy it fails to work in
+  both cases, because we have no need for a cache at all.
+
+* Also, object identity of immutable keys in dictionaries is not necessarily
+  preserved.
 
 .. include:: _ref.txt
diff --git a/pypy/doc/faq.rst b/pypy/doc/faq.rst
--- a/pypy/doc/faq.rst
+++ b/pypy/doc/faq.rst
@@ -315,6 +315,28 @@
 
 .. _`Andrew Brown's tutorial`: http://morepypy.blogspot.com/2011/04/tutorial-writing-interpreter-with-pypy.html
 
+---------------------------------------------------------
+Can RPython modules for PyPy be translated independently?
+---------------------------------------------------------
+
+No, you have to rebuild the entire interpreter.  This means two things:
+
+* It is imperative to use test-driven development.  You have to test
+  exhaustively your module in pure Python, before even attempting to
+  translate it.  Once you translate it, you should have only a few typing
+  issues left to fix, but otherwise the result should work out of the box.
+
+* Second, and perhaps most important: do you have a really good reason
+  for writing the module in RPython in the first place?  Nowadays you
+  should really look at alternatives, like writing it in pure Python,
+  using ctypes if it needs to call C code.  Other alternatives are being
+  developed too (as of summer 2011), like a Cython binding.
+
+In this context it is not that important to be able to translate
+RPython modules independently of translating the complete interpreter.
+(It could be done given enough efforts, but it's a really serious
+undertaking.  Consider it as quite unlikely for now.)
+
 ----------------------------------------------------------
 Why does PyPy draw a Mandelbrot fractal while translating?
 ----------------------------------------------------------
diff --git a/pypy/doc/garbage_collection.rst b/pypy/doc/garbage_collection.rst
--- a/pypy/doc/garbage_collection.rst
+++ b/pypy/doc/garbage_collection.rst
@@ -147,7 +147,7 @@
 You can read more about them at the start of
 `pypy/rpython/memory/gc/minimark.py`_.
 
-In more details:
+In more detail:
 
 - The small newly malloced objects are allocated in the nursery (case 1).
   All objects living in the nursery are "young".
diff --git a/pypy/doc/getting-started-python.rst b/pypy/doc/getting-started-python.rst
--- a/pypy/doc/getting-started-python.rst
+++ b/pypy/doc/getting-started-python.rst
@@ -32,7 +32,10 @@
 .. _`windows document`: windows.html
 
 You can translate the whole of PyPy's Python interpreter to low level C code,
-or `CLI code`_.
+or `CLI code`_.  If you intend to build using gcc, check to make sure that
+the version you have is not 4.2 or you will run into `this bug`_.
+
+.. _`this bug`: https://bugs.launchpad.net/ubuntu/+source/gcc-4.2/+bug/187391
 
 1. First `download a pre-built PyPy`_ for your architecture which you will
    use to translate your Python interpreter.  It is, of course, possible to
@@ -64,7 +67,6 @@
    * ``libssl-dev`` (for the optional ``_ssl`` module)
    * ``libgc-dev`` (for the Boehm garbage collector: only needed when translating with `--opt=0, 1` or `size`)
    * ``python-sphinx`` (for the optional documentation build.  You need version 1.0.7 or later)
-   * ``python-greenlet`` (for the optional stackless support in interpreted mode/testing)
 
 
 3. Translation is time-consuming -- 45 minutes on a very fast machine --
@@ -102,7 +104,7 @@
 
     $ ./pypy-c
     Python 2.7.0 (61ef2a11b56a, Mar 02 2011, 03:00:11)
-    [PyPy 1.5.0-alpha0 with GCC 4.4.3] on linux2
+    [PyPy 1.6.0 with GCC 4.4.3] on linux2
     Type "help", "copyright", "credits" or "license" for more information.
     And now for something completely different: ``this sentence is false''
     >>>> 46 - 4
@@ -117,19 +119,8 @@
 Installation_ below.
 
 The ``translate.py`` script takes a very large number of options controlling
-what to translate and how.  See ``translate.py -h``. Some of the more
-interesting options (but for now incompatible with the JIT) are:
-
-   * ``--stackless``: this produces a pypy-c that includes features
-     inspired by `Stackless Python <http://www.stackless.com>`__.
-
-   * ``--gc=boehm|ref|marknsweep|semispace|generation|hybrid|minimark``:
-     choose between using
-     the `Boehm-Demers-Weiser garbage collector`_, our reference
-     counting implementation or one of own collector implementations
-     (the default depends on the optimization level but is usually
-     ``minimark``).
-
+what to translate and how.  See ``translate.py -h``. The default options
+should be suitable for mostly everybody by now.
 Find a more detailed description of the various options in our `configuration
 sections`_.
 
@@ -162,7 +153,7 @@
 
     $ ./pypy-cli
     Python 2.7.0 (61ef2a11b56a, Mar 02 2011, 03:00:11)
-    [PyPy 1.5.0-alpha0] on linux2
+    [PyPy 1.6.0] on linux2
     Type "help", "copyright", "credits" or "license" for more information.
     And now for something completely different: ``distopian and utopian chairs''
     >>>> 
@@ -199,7 +190,7 @@
 
         $ ./pypy-jvm 
         Python 2.7.0 (61ef2a11b56a, Mar 02 2011, 03:00:11)
-        [PyPy 1.5.0-alpha0] on linux2
+        [PyPy 1.6.0] on linux2
         Type "help", "copyright", "credits" or "license" for more information.
         And now for something completely different: ``# assert did not crash''
         >>>> 
@@ -238,7 +229,7 @@
 the ``bin/pypy`` executable.
 
 To install PyPy system wide on unix-like systems, it is recommended to put the
-whole hierarchy alone (e.g. in ``/opt/pypy1.5``) and put a symlink to the
+whole hierarchy alone (e.g. in ``/opt/pypy1.6``) and put a symlink to the
 ``pypy`` executable into ``/usr/bin`` or ``/usr/local/bin``
 
 If the executable fails to find suitable libraries, it will report
diff --git a/pypy/doc/getting-started.rst b/pypy/doc/getting-started.rst
--- a/pypy/doc/getting-started.rst
+++ b/pypy/doc/getting-started.rst
@@ -53,11 +53,11 @@
 PyPy is ready to be executed as soon as you unpack the tarball or the zip
 file, with no need to install it in any specific location::
 
-    $ tar xf pypy-1.5-linux.tar.bz2
+    $ tar xf pypy-1.6-linux.tar.bz2
 
-    $ ./pypy-1.5-linux/bin/pypy
+    $ ./pypy-1.6/bin/pypy
     Python 2.7.1 (?, Apr 27 2011, 12:44:21)
-    [PyPy 1.5.0-alpha0 with GCC 4.4.3] on linux2
+    [PyPy 1.6.0 with GCC 4.4.3] on linux2
     Type "help", "copyright", "credits" or "license" for more information.
     And now for something completely different: ``implementing LOGO in LOGO:
     "turtles all the way down"''
@@ -73,16 +73,16 @@
 
     $ curl -O http://python-distribute.org/distribute_setup.py
 
-    $ curl -O https://github.com/pypa/pip/raw/master/contrib/get-pip.py
+    $ curl -O https://raw.github.com/pypa/pip/master/contrib/get-pip.py
 
-    $ ./pypy-1.5-linux/bin/pypy distribute_setup.py
+    $ ./pypy-1.6/bin/pypy distribute_setup.py
 
-    $ ./pypy-1.5-linux/bin/pypy get-pip.py
+    $ ./pypy-1.6/bin/pypy get-pip.py
 
-    $ ./pypy-1.5-linux/bin/pip install pygments  # for example
+    $ ./pypy-1.6/bin/pip install pygments  # for example
 
-3rd party libraries will be installed in ``pypy-1.5-linux/site-packages``, and
-the scripts in ``pypy-1.5-linux/bin``.
+3rd party libraries will be installed in ``pypy-1.6/site-packages``, and
+the scripts in ``pypy-1.6/bin``.
 
 Installing using virtualenv
 ---------------------------
diff --git a/pypy/doc/how-to-release.rst b/pypy/doc/how-to-release.rst
--- a/pypy/doc/how-to-release.rst
+++ b/pypy/doc/how-to-release.rst
@@ -21,8 +21,8 @@
 Release Steps
 ----------------
 
-* at code freeze make a release branch under
-  http://codepeak.net/svn/pypy/release/x.y(.z). IMPORTANT: bump the
+* at code freeze make a release branch using release-x.x.x in mercurial
+  IMPORTANT: bump the
   pypy version number in module/sys/version.py and in
   module/cpyext/include/patchlevel.h, notice that the branch
   will capture the revision number of this change for the release;
@@ -42,18 +42,11 @@
     JIT: windows, linux, os/x
     no JIT: windows, linux, os/x
     sandbox: linux, os/x
-    stackless: windows, linux, os/x
 
 * write release announcement pypy/doc/release-x.y(.z).txt
   the release announcement should contain a direct link to the download page
 * update pypy.org (under extradoc/pypy.org), rebuild and commit
 
-* update http://codespeak.net/pypy/trunk:
-   code0> + chmod -R yourname:users /www/codespeak.net/htdocs/pypy/trunk
-   local> cd ..../pypy/doc && py.test
-   local> cd ..../pypy
-   local> rsync -az doc codespeak.net:/www/codespeak.net/htdocs/pypy/trunk/pypy/
-
 * post announcement on morepypy.blogspot.com
 * send announcements to pypy-dev, python-list,
   python-announce, python-dev ...
diff --git a/pypy/doc/index-of-release-notes.rst b/pypy/doc/index-of-release-notes.rst
--- a/pypy/doc/index-of-release-notes.rst
+++ b/pypy/doc/index-of-release-notes.rst
@@ -16,3 +16,4 @@
    release-1.4.0beta.rst
    release-1.4.1.rst
    release-1.5.0.rst
+   release-1.6.0.rst
diff --git a/pypy/doc/index.rst b/pypy/doc/index.rst
--- a/pypy/doc/index.rst
+++ b/pypy/doc/index.rst
@@ -15,7 +15,7 @@
 
 * `FAQ`_: some frequently asked questions.
 
-* `Release 1.5`_: the latest official release
+* `Release 1.6`_: the latest official release
 
 * `PyPy Blog`_: news and status info about PyPy 
 
@@ -35,7 +35,7 @@
 
   * `Differences between PyPy and CPython`_
   * `What PyPy can do for your objects`_
-  * `Stackless and coroutines`_
+  * `Continulets and greenlets`_
   * `JIT Generation in PyPy`_ 
   * `Sandboxing Python code`_
 
@@ -77,7 +77,7 @@
 .. _`Getting Started`: getting-started.html
 .. _`Papers`: extradoc.html
 .. _`Videos`: video-index.html
-.. _`Release 1.5`: http://pypy.org/download.html
+.. _`Release 1.6`: http://pypy.org/download.html
 .. _`speed.pypy.org`: http://speed.pypy.org
 .. _`RPython toolchain`: translation.html
 .. _`potential project ideas`: project-ideas.html
@@ -122,9 +122,9 @@
 Windows, on top of .NET, and on top of Java.
 To dig into PyPy it is recommended to try out the current
 Mercurial default branch, which is always working or mostly working,
-instead of the latest release, which is `1.5`__.
+instead of the latest release, which is `1.6`__.
 
-.. __: release-1.5.0.html
+.. __: release-1.6.0.html
 
 PyPy is mainly developed on Linux and Mac OS X.  Windows is supported,
 but platform-specific bugs tend to take longer before we notice and fix
@@ -292,8 +292,6 @@
 
 `pypy/translator/jvm/`_            the Java backend
 
-`pypy/translator/stackless/`_      the `Stackless Transform`_
-
 `pypy/translator/tool/`_           helper tools for translation, including the Pygame
                                    `graph viewer`_
 
@@ -318,7 +316,7 @@
 .. _`transparent proxies`: objspace-proxies.html#tproxy
 .. _`Differences between PyPy and CPython`: cpython_differences.html
 .. _`What PyPy can do for your objects`: objspace-proxies.html
-.. _`Stackless and coroutines`: stackless.html
+.. _`Continulets and greenlets`: stackless.html
 .. _StdObjSpace: objspace.html#the-standard-object-space 
 .. _`abstract interpretation`: http://en.wikipedia.org/wiki/Abstract_interpretation
 .. _`rpython`: coding-guide.html#rpython 
@@ -337,7 +335,6 @@
 .. _`low-level type system`: rtyper.html#low-level-type
 .. _`object-oriented type system`: rtyper.html#oo-type
 .. _`garbage collector`: garbage_collection.html
-.. _`Stackless Transform`: translation.html#the-stackless-transform
 .. _`main PyPy-translation scripts`: getting-started-python.html#translating-the-pypy-python-interpreter
 .. _`.NET`: http://www.microsoft.com/net/
 .. _Mono: http://www.mono-project.com/
diff --git a/pypy/doc/jit/pyjitpl5.rst b/pypy/doc/jit/pyjitpl5.rst
--- a/pypy/doc/jit/pyjitpl5.rst
+++ b/pypy/doc/jit/pyjitpl5.rst
@@ -103,7 +103,7 @@
 
 The meta-interpreter starts interpreting the JIT bytecode.  Each operation is
 executed and then recorded in a list of operations, called the trace.
-Operations can have a list of boxes that operate on, arguments.  Some operations
+Operations can have a list of boxes they operate on, arguments.  Some operations
 (like GETFIELD and GETARRAYITEM) also have special objects that describe how
 their arguments are laid out in memory.  All possible operations generated by
 tracing are listed in metainterp/resoperation.py.  When a (interpreter-level)
diff --git a/pypy/doc/project-ideas.rst b/pypy/doc/project-ideas.rst
--- a/pypy/doc/project-ideas.rst
+++ b/pypy/doc/project-ideas.rst
@@ -48,12 +48,6 @@
 
 .. image:: image/jitviewer.png
 
-We would like to add one level to this hierarchy, by showing the generated
-machine code for each jit operation.  The necessary information is already in
-the log file produced by the JIT, so it is "only" a matter of teaching the
-jitviewer to display it.  Ideally, the machine code should be hidden by
-default and viewable on request.
-
 The jitviewer is a web application based on flask and jinja2 (and jQuery on
 the client): if you have great web developing skills and want to help PyPy,
 this is an ideal task to get started, because it does not require any deep
diff --git a/pypy/doc/release-1.6.0.rst b/pypy/doc/release-1.6.0.rst
new file mode 100644
--- /dev/null
+++ b/pypy/doc/release-1.6.0.rst
@@ -0,0 +1,95 @@
+========================
+PyPy 1.6 - kickass panda
+========================
+
+We're pleased to announce the 1.6 release of PyPy. This release brings a lot
+of bugfixes and performance improvements over 1.5, and improves support for
+Windows 32bit and OS X 64bit. This version fully implements Python 2.7.1 and
+has beta level support for loading CPython C extensions.  You can download it
+here:
+
+    http://pypy.org/download.html
+
+What is PyPy?
+=============
+
+PyPy is a very compliant Python interpreter, almost a drop-in replacement for
+CPython 2.7.1. It's fast (`pypy 1.5 and cpython 2.6.2`_ performance comparison)
+due to its integrated tracing JIT compiler.
+
+This release supports x86 machines running Linux 32/64 or Mac OS X.  Windows 32
+is beta (it roughly works but a lot of small issues have not been fixed so
+far).  Windows 64 is not yet supported.
+
+The main topics of this release are speed and stability: on average on
+our benchmark suite, PyPy 1.6 is between **20% and 30%** faster than PyPy 1.5,
+which was already much faster than CPython on our set of benchmarks.
+
+The speed improvements have been made possible by optimizing many of the
+layers which compose PyPy.  In particular, we improved: the Garbage Collector,
+the JIT warmup time, the optimizations performed by the JIT, the quality of
+the generated machine code and the implementation of our Python interpreter.
+
+.. _`pypy 1.5 and cpython 2.6.2`: http://speed.pypy.org
+
+
+Highlights
+==========
+
+* Numerous performance improvements, overall giving considerable speedups:
+
+  - better GC behavior when dealing with very large objects and arrays
+
+  - **fast ctypes:** now calls to ctypes functions are seen and optimized
+    by the JIT, and they are up to 60 times faster than PyPy 1.5 and 10 times
+    faster than CPython
+
+  - improved generators(1): simple generators now are inlined into the caller
+    loop, making performance up to 3.5 times faster than PyPy 1.5.
+
+  - improved generators(2): thanks to other optimizations, even generators
+    that are not inlined are between 10% and 20% faster than PyPy 1.5.
+
+  - faster warmup time for the JIT
+
+  - JIT support for single floats (e.g., for ``array('f')``)
+
+  - optimized dictionaries: the internal representation of dictionaries is now
+    dynamically selected depending on the type of stored objects, resulting in
+    faster code and smaller memory footprint.  For example, dictionaries whose
+    keys are all strings, or all integers. Other dictionaries are also smaller
+    due to bugfixes.
+
+* JitViewer: this is the first official release which includes the JitViewer,
+  a web-based tool which helps you to see which parts of your Python code have
+  been compiled by the JIT, down until the assembler. The `jitviewer`_ 0.1 has
+  already been release and works well with PyPy 1.6.
+
+* The CPython extension module API has been improved and now supports many
+  more extensions. For information on which one are supported, please refer to
+  our `compatibility wiki`_.
+
+* Multibyte encoding support: this was of of the last areas in which we were
+  still behind CPython, but now we fully support them.
+
+* Preliminary support for NumPy: this release includes a preview of a very
+  fast NumPy module integrated with the PyPy JIT.  Unfortunately, this does
+  not mean that you can expect to take an existing NumPy program and run it on
+  PyPy, because the module is still unfinished and supports only some of the
+  numpy API. However, barring some details, what works should be
+  blazingly fast :-)
+
+* Bugfixes: since the 1.5 release we fixed 53 bugs in our `bug tracker`_, not
+  counting the numerous bugs that were found and reported through other
+  channels than the bug tracker.
+
+Cheers,
+
+Hakan Ardo, Carl Friedrich Bolz, Laura Creighton, Antonio Cuni,
+Maciej Fijalkowski, Amaury Forgeot d'Arc, Alex Gaynor,
+Armin Rigo and the PyPy team
+
+.. _`jitviewer`: http://morepypy.blogspot.com/2011/08/visualization-of-jitted-code.html
+.. _`bug tracker`: https://bugs.pypy.org
+.. _`compatibility wiki`: https://bitbucket.org/pypy/compatibility/wiki/Home
+
diff --git a/pypy/doc/rlib.rst b/pypy/doc/rlib.rst
--- a/pypy/doc/rlib.rst
+++ b/pypy/doc/rlib.rst
@@ -134,69 +134,6 @@
 a hierarchy of Address classes, in a typical static-OO-programming style.
 
 
-``rstack``
-==========
-
-The `pypy/rlib/rstack.py`_ module allows an RPython program to control its own execution stack.
-This is only useful if the program is translated using stackless. An old
-description of the exposed functions is below.
-
-We introduce an RPython type ``frame_stack_top`` and a built-in function
-``yield_current_frame_to_caller()`` that work as follows (see example below):
-
-* The built-in function ``yield_current_frame_to_caller()`` causes the current
-  function's state to be captured in a new ``frame_stack_top`` object that is
-  returned to the parent.  Only one frame, the current one, is captured this
-  way.  The current frame is suspended and the caller continues to run.  Note
-  that the caller is only resumed once: when
-  ``yield_current_frame_to_caller()`` is called.  See below.
-
-* A ``frame_stack_top`` object can be jumped to by calling its ``switch()``
-  method with no argument.
-
-* ``yield_current_frame_to_caller()`` and ``switch()`` themselves return a new
-  ``frame_stack_top`` object: the freshly captured state of the caller of the
-  source ``switch()`` that was just executed, or None in the case described
-  below.
-
-* the function that called ``yield_current_frame_to_caller()`` also has a
-  normal return statement, like all functions.  This statement must return
-  another ``frame_stack_top`` object.  The latter is *not* returned to the
-  original caller; there is no way to return several times to the caller.
-  Instead, it designates the place to which the execution must jump, as if by
-  a ``switch()``.  The place to which we jump this way will see a None as the
-  source frame stack top.
-
-* every frame stack top must be resumed once and only once.  Not resuming
-  it at all causes a leak.  Resuming it several times causes a crash.
-
-* a function that called ``yield_current_frame_to_caller()`` should not raise.
-  It would have no implicit parent frame to propagate the exception to.  That
-  would be a crashingly bad idea.
-
-The following example would print the numbers from 1 to 7 in order::
-
-    def g():
-        print 2
-        frametop_before_5 = yield_current_frame_to_caller()
-        print 4
-        frametop_before_7 = frametop_before_5.switch()
-        print 6
-        return frametop_before_7
-
-    def f():
-        print 1
-        frametop_before_4 = g()
-        print 3
-        frametop_before_6 = frametop_before_4.switch()
-        print 5
-        frametop_after_return = frametop_before_6.switch()
-        print 7
-        assert frametop_after_return is None
-
-    f()
-
-
 ``streamio``
 ============
 
diff --git a/pypy/doc/stackless.rst b/pypy/doc/stackless.rst
--- a/pypy/doc/stackless.rst
+++ b/pypy/doc/stackless.rst
@@ -8,446 +8,294 @@
 ================
 
 PyPy can expose to its user language features similar to the ones
-present in `Stackless Python`_: **no recursion depth limit**, and the
-ability to write code in a **massively concurrent style**.  It actually
-exposes three different paradigms to choose from:
+present in `Stackless Python`_: the ability to write code in a
+**massively concurrent style**.  (It does not (any more) offer the
+ability to run with no `recursion depth limit`_, but the same effect
+can be achieved indirectly.)
 
-* `Tasklets and channels`_;
+This feature is based on a custom primitive called a continulet_.
+Continulets can be directly used by application code, or it is possible
+to write (entirely at app-level) more user-friendly interfaces.
 
-* Greenlets_;
+Currently PyPy implements greenlets_ on top of continulets.  It would be
+easy to implement tasklets and channels as well, emulating the model
+of `Stackless Python`_.
 
-* Plain coroutines_.
+Continulets are extremely light-weight, which means that PyPy should be
+able to handle programs containing large amounts of them.  However, due
+to an implementation restriction, a PyPy compiled with
+``--gcrootfinder=shadowstack`` consumes at least one page of physical
+memory (4KB) per live continulet, and half a megabyte of virtual memory
+on 32-bit or a complete megabyte on 64-bit.  Moreover, the feature is
+only available (so far) on x86 and x86-64 CPUs; for other CPUs you need
+to add a short page of custom assembler to
+`pypy/translator/c/src/stacklet/`_.
 
-All of them are extremely light-weight, which means that PyPy should be
-able to handle programs containing large amounts of coroutines, tasklets
-and greenlets.
 
+Theory
+======
 
-Requirements
-++++++++++++++++
+The fundamental idea is that, at any point in time, the program happens
+to run one stack of frames (or one per thread, in case of
+multi-threading).  To see the stack, start at the top frame and follow
+the chain of ``f_back`` until you reach the bottom frame.  From the
+point of view of one of these frames, it has a ``f_back`` pointing to
+another frame (unless it is the bottom frame), and it is itself being
+pointed to by another frame (unless it is the top frame).
 
-If you are running py.py on top of CPython, then you need to enable
-the _stackless module by running it as follows::
+The theory behind continulets is to literally take the previous sentence
+as definition of "an O.K. situation".  The trick is that there are
+O.K. situations that are more complex than just one stack: you will
+always have one stack, but you can also have in addition one or more
+detached *cycles* of frames, such that by following the ``f_back`` chain
+you run in a circle.  But note that these cycles are indeed completely
+detached: the top frame (the currently running one) is always the one
+which is not the ``f_back`` of anybody else, and it is always the top of
+a stack that ends with the bottom frame, never a part of these extra
+cycles.
 
-    py.py --withmod-_stackless
+How do you create such cycles?  The fundamental operation to do so is to
+take two frames and *permute* their ``f_back`` --- i.e. exchange them.
+You can permute any two ``f_back`` without breaking the rule of "an O.K.
+situation".  Say for example that ``f`` is some frame halfway down the
+stack, and you permute its ``f_back`` with the ``f_back`` of the top
+frame.  Then you have removed from the normal stack all intermediate
+frames, and turned them into one stand-alone cycle.  By doing the same
+permutation again you restore the original situation.
 
-This is implemented internally using greenlets, so it only works on a
-platform where `greenlets`_ are supported.  A few features do
-not work this way, though, and really require a translated
-``pypy-c``.
+In practice, in PyPy, you cannot change the ``f_back`` of an abitrary
+frame, but only of frames stored in ``continulets``.
 
-To obtain a translated version of ``pypy-c`` that includes Stackless
-support, run translate.py as follows::
-
-    cd pypy/translator/goal
-    python translate.py --stackless
+Continulets are internally implemented using stacklets.  Stacklets are a
+bit more primitive (they are really one-shot continuations), but that
+idea only works in C, not in Python.  The basic idea of continulets is
+to have at any point in time a complete valid stack; this is important
+e.g. to correctly propagate exceptions (and it seems to give meaningful
+tracebacks too).
 
 
 Application level interface
 =============================
 
-A stackless PyPy contains a module called ``stackless``.  The interface
-exposed by this module have not been refined much, so it should be
-considered in-flux (as of 2007).
 
-So far, PyPy does not provide support for ``stackless`` in a threaded
-environment.  This limitation is not fundamental, as previous experience
-has shown, so supporting this would probably be reasonably easy.
+.. _continulet:
 
-An interesting point is that the same ``stackless`` module can provide
-a number of different concurrency paradigms at the same time.  From a
-theoretical point of view, none of above-mentioned existing three
-paradigms considered on its own is new: two of them are from previous
-Python work, and the third one is a variant of the classical coroutine.
-The new part is that the PyPy implementation manages to provide all of
-them and let the user implement more.  Moreover - and this might be an
-important theoretical contribution of this work - we manage to provide
-these concurrency concepts in a "composable" way.  In other words, it
-is possible to naturally mix in a single application multiple
-concurrency paradigms, and multiple unrelated usages of the same
-paradigm.  This is discussed in the Composability_ section below.
+Continulets
++++++++++++
 
+A translated PyPy contains by default a module called ``_continuation``
+exporting the type ``continulet``.  A ``continulet`` object from this
+module is a container that stores a "one-shot continuation".  It plays
+the role of an extra frame you can insert in the stack, and whose
+``f_back`` can be changed.
 
-Infinite recursion
-++++++++++++++++++
+To make a continulet object, call ``continulet()`` with a callable and
+optional extra arguments.
 
-Any stackless PyPy executable natively supports recursion that is only
-limited by the available memory.  As in normal Python, though, there is
-an initial recursion limit (which is 5000 in all pypy-c's, and 1000 in
-CPython).  It can be changed with ``sys.setrecursionlimit()``.  With a
-stackless PyPy, any value is acceptable - use ``sys.maxint`` for
-unlimited.
+Later, the first time you ``switch()`` to the continulet, the callable
+is invoked with the same continulet object as the extra first argument.
+At that point, the one-shot continuation stored in the continulet points
+to the caller of ``switch()``.  In other words you have a perfectly
+normal-looking stack of frames.  But when ``switch()`` is called again,
+this stored one-shot continuation is exchanged with the current one; it
+means that the caller of ``switch()`` is suspended with its continuation
+stored in the container, and the old continuation from the continulet
+object is resumed.
 
-In some cases, you can write Python code that causes interpreter-level
-infinite recursion -- i.e. infinite recursion without going via
-application-level function calls.  It is possible to limit that too,
-with ``_stackless.set_stack_depth_limit()``, or to unlimit it completely
-by setting it to ``sys.maxint``.
+The most primitive API is actually 'permute()', which just permutes the
+one-shot continuation stored in two (or more) continulets.
 
+In more details:
 
-Coroutines
-++++++++++
+* ``continulet(callable, *args, **kwds)``: make a new continulet.
+  Like a generator, this only creates it; the ``callable`` is only
+  actually called the first time it is switched to.  It will be
+  called as follows::
 
-A Coroutine is similar to a very small thread, with no preemptive scheduling.
-Within a family of coroutines, the flow of execution is explicitly
-transferred from one to another by the programmer.  When execution is
-transferred to a coroutine, it begins to execute some Python code.  When
-it transfers execution away from itself it is temporarily suspended, and
-when execution returns to it it resumes its execution from the
-point where it was suspended.  Conceptually, only one coroutine is
-actively running at any given time (but see Composability_ below).
+      callable(cont, *args, **kwds)
 
-The ``stackless.coroutine`` class is instantiated with no argument.
-It provides the following methods and attributes:
+  where ``cont`` is the same continulet object.
 
-* ``stackless.coroutine.getcurrent()``
+  Note that it is actually ``cont.__init__()`` that binds
+  the continulet.  It is also possible to create a not-bound-yet
+  continulet by calling explicitly ``continulet.__new__()``, and
+  only bind it later by calling explicitly ``cont.__init__()``.
 
-    Static method returning the currently running coroutine.  There is a
-    so-called "main" coroutine object that represents the "outer"
-    execution context, where your main program started and where it runs
-    as long as it does not switch to another coroutine.
+* ``cont.switch(value=None, to=None)``: start the continulet if
+  it was not started yet.  Otherwise, store the current continuation
+  in ``cont``, and activate the target continuation, which is the
+  one that was previously stored in ``cont``.  Note that the target
+  continuation was itself previously suspended by another call to
+  ``switch()``; this older ``switch()`` will now appear to return.
+  The ``value`` argument is any object that is carried to the target
+  and returned by the target's ``switch()``.
 
-* ``coro.bind(callable, *args, **kwds)``
+  If ``to`` is given, it must be another continulet object.  In
+  that case, performs a "double switch": it switches as described
+  above to ``cont``, and then immediately switches again to ``to``.
+  This is different from switching directly to ``to``: the current
+  continuation gets stored in ``cont``, the old continuation from
+  ``cont`` gets stored in ``to``, and only then we resume the
+  execution from the old continuation out of ``to``.
 
-    Bind the coroutine so that it will execute ``callable(*args,
-    **kwds)``.  The call is not performed immediately, but only the
-    first time we call the ``coro.switch()`` method.  A coroutine must
-    be bound before it is switched to.  When the coroutine finishes
-    (because the call to the callable returns), the coroutine exits and
-    implicitly switches back to another coroutine (its "parent"); after
-    this point, it is possible to bind it again and switch to it again.
-    (Which coroutine is the parent of which is not documented, as it is
-    likely to change when the interface is refined.)
+* ``cont.throw(type, value=None, tb=None, to=None)``: similar to
+  ``switch()``, except that immediately after the switch is done, raise
+  the given exception in the target.
 
-* ``coro.switch()``
+* ``cont.is_pending()``: return True if the continulet is pending.
+  This is False when it is not initialized (because we called
+  ``__new__`` and not ``__init__``) or when it is finished (because
+  the ``callable()`` returned).  When it is False, the continulet
+  object is empty and cannot be ``switch()``-ed to.
 
-    Suspend the current (caller) coroutine, and resume execution in the
-    target coroutine ``coro``.
+* ``permute(*continulets)``: a global function that permutes the
+  continuations stored in the given continulets arguments.  Mostly
+  theoretical.  In practice, using ``cont.switch()`` is easier and
+  more efficient than using ``permute()``; the latter does not on
+  its own change the currently running frame.
 
-* ``coro.kill()``
 
-    Kill ``coro`` by sending a CoroutineExit exception and switching
-    execution immediately to it. This exception can be caught in the 
-    coroutine itself and can be raised from any call to ``coro.switch()``. 
-    This exception isn't propagated to the parent coroutine.
+Genlets
++++++++
 
-* ``coro.throw(type, value)``
+The ``_continuation`` module also exposes the ``generator`` decorator::
 
-    Insert an exception in ``coro`` an resume switches execution
-    immediately to it. In the coroutine itself, this exception
-    will come from any call to ``coro.switch()`` and can be caught. If the
-    exception isn't caught, it will be propagated to the parent coroutine.
+    @generator
+    def f(cont, a, b):
+        cont.switch(a + b)
+        cont.switch(a + b + 1)
 
-When a coroutine is garbage-collected, it gets the ``.kill()`` method sent to
-it. This happens at the point the next ``.switch`` method is called, so the
-target coroutine of this call will be executed only after the ``.kill`` has
-finished.
+    for i in f(10, 20):
+        print i
 
-Example
-~~~~~~~
+This example prints 30 and 31.  The only advantage over using regular
+generators is that the generator itself is not limited to ``yield``
+statements that must all occur syntactically in the same function.
+Instead, we can pass around ``cont``, e.g. to nested sub-functions, and
+call ``cont.switch(x)`` from there.
 
-Here is a classical producer/consumer example: an algorithm computes a
-sequence of values, while another consumes them.  For our purposes we
-assume that the producer can generate several values at once, and the
-consumer can process up to 3 values in a batch - it can also process
-batches with fewer than 3 values without waiting for the producer (which
-would be messy to express with a classical Python generator). ::
+The ``generator`` decorator can also be applied to methods::
 
-    def producer(lst):
-        while True:
-            ...compute some more values...
-            lst.extend(new_values)
-            coro_consumer.switch()
-
-    def consumer(lst):
-        while True:
-            # First ask the producer for more values if needed
-            while len(lst) == 0:
-                coro_producer.switch()
-            # Process the available values in a batch, but at most 3
-            batch = lst[:3]
-            del lst[:3]
-            ...process batch...
-
-    # Initialize two coroutines with a shared list as argument
-    exchangelst = []
-    coro_producer = coroutine()
-    coro_producer.bind(producer, exchangelst)
-    coro_consumer = coroutine()
-    coro_consumer.bind(consumer, exchangelst)
-
-    # Start running the consumer coroutine
-    coro_consumer.switch()
-
-
-Tasklets and channels
-+++++++++++++++++++++
-
-The ``stackless`` module also provides an interface that is roughly
-compatible with the interface of the ``stackless`` module in `Stackless
-Python`_: it contains ``stackless.tasklet`` and ``stackless.channel``
-classes.  Tasklets are also similar to microthreads, but (like coroutines)
-they don't actually run in parallel with other microthreads; instead,
-they synchronize and exchange data with each other over Channels, and
-these exchanges determine which Tasklet runs next.
-
-For usage reference, see the documentation on the `Stackless Python`_
-website.
-
-Note that Tasklets and Channels are implemented at application-level in
-`lib_pypy/stackless.py`_ on top of coroutines_.  You can refer to this
-module for more details and API documentation.
-
-The stackless.py code tries to resemble the stackless C code as much
-as possible. This makes the code somewhat unpythonic.
-
-Bird's eye view of tasklets and channels
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Tasklets are a bit like threads: they encapsulate a function in such a way that
-they can be suspended/restarted any time. Unlike threads, they won't
-run concurrently, but must be cooperative. When using stackless
-features, it is vitally important that no action is performed that blocks
-everything else.  In particular, blocking input/output should be centralized
-to a single tasklet.
-
-Communication between tasklets is done via channels. 
-There are three ways for a tasklet to give up control:
-
-1. call ``stackless.schedule()``
-2. send something over a channel
-3. receive something from a channel
-
-A (live) tasklet can either be running, waiting to get scheduled, or be
-blocked by a channel.
-
-Scheduling is done in strictly round-robin manner. A blocked tasklet
-is removed from the scheduling queue and will be reinserted when it
-becomes unblocked.
-
-Example
-~~~~~~~
-
-Here is a many-producers many-consumers example, where any consumer can
-process the result of any producer.  For this situation we set up a
-single channel where all producer send, and on which all consumers
-wait::
-
-    def producer(chan):
-        while True:
-            chan.send(...next value...)
-
-    def consumer(chan):
-        while True:
-            x = chan.receive()
-            ...do something with x...
-
-    # Set up the N producer and M consumer tasklets
-    common_channel = stackless.channel()
-    for i in range(N):
-        stackless.tasklet(producer, common_channel)()
-    for i in range(M):
-        stackless.tasklet(consumer, common_channel)()
-
-    # Run it all
-    stackless.run()
-
-Each item sent over the channel is received by one of the waiting
-consumers; which one is not specified.  The producers block until their
-item is consumed: the channel is not a queue, but rather a meeting point
-which causes tasklets to block until both a consumer and a producer are
-ready.  In practice, the reason for having several consumers receiving
-on a single channel is that some of the consumers can be busy in other
-ways part of the time.  For example, each consumer might receive a
-database request, process it, and send the result to a further channel
-before it asks for the next request.  In this situation, further
-requests can still be received by other consumers.
+    class X:
+        @generator
+        def f(self, cont, a, b):
+            ...
 
 
 Greenlets
 +++++++++
 
-A Greenlet is a kind of primitive Tasklet with a lower-level interface
-and with exact control over the execution order.  Greenlets are similar
-to Coroutines, with a slightly different interface: greenlets put more
-emphasis on a tree structure.  The various greenlets of a program form a
-precise tree, which fully determines their order of execution.
+Greenlets are implemented on top of continulets in `lib_pypy/greenlet.py`_.
+See the official `documentation of the greenlets`_.
 
-For usage reference, see the `documentation of the greenlets`_.
-The PyPy interface is identical.  You should use ``greenlet.greenlet``
-instead of ``stackless.greenlet`` directly, because the greenlet library
-can give you the latter when you ask for the former on top of PyPy.
+Note that unlike the CPython greenlets, this version does not suffer
+from GC issues: if the program "forgets" an unfinished greenlet, it will
+always be collected at the next garbage collection.
 
-PyPy's greenlets do not suffer from the cyclic GC limitation that the
-CPython greenlets have: greenlets referencing each other via local
-variables tend to leak on top of CPython (where it is mostly impossible
-to do the right thing).  It works correctly on top of PyPy.
 
+Unimplemented features
+++++++++++++++++++++++
 
-Coroutine Pickling
-++++++++++++++++++
+The following features (present in some past Stackless version of PyPy)
+are for the time being not supported any more:
 
-Coroutines and tasklets can be pickled and unpickled, i.e. serialized to
-a string of bytes for the purpose of storage or transmission.  This
-allows "live" coroutines or tasklets to be made persistent, moved to
-other machines, or cloned in any way.  The standard ``pickle`` module
-works with coroutines and tasklets (at least in a translated ``pypy-c``;
-unpickling live coroutines or tasklets cannot be easily implemented on
-top of CPython).
+* Tasklets and channels (currently ``stackless.py`` seems to import,
+  but you have tasklets on top of coroutines on top of greenlets on
+  top of continulets on top of stacklets, and it's probably not too
+  hard to cut two of these levels by adapting ``stackless.py`` to
+  use directly continulets)
 
-To be able to achieve this result, we have to consider many objects that
-are not normally pickleable in CPython.  Here again, the `Stackless
-Python`_ implementation has paved the way, and we follow the same
-general design decisions: simple internal objects like bound method
-objects and various kinds of iterators are supported; frame objects can
-be fully pickled and unpickled
-(by serializing a reference to the bytecode they are
-running in addition to all the local variables).  References to globals
-and modules are pickled by name, similarly to references to functions
-and classes in the traditional CPython ``pickle``.
+* Coroutines (could be rewritten at app-level)
 
-The "magic" part of this process is the implementation of the unpickling
-of a chain of frames.  The Python interpreter of PyPy uses
-interpreter-level recursion to represent application-level calls.  The
-reason for this is that it tremendously simplifies the implementation of
-the interpreter itself.  Indeed, in Python, almost any operation can
-potentially result in a non-tail-recursive call to another Python
-function.  This makes writing a non-recursive interpreter extremely
-tedious; instead, we rely on lower-level transformations during the
-translation process to control this recursion.  This is the `Stackless
-Transform`_, which is at the heart of PyPy's support for stackless-style
-concurrency.
+* Pickling and unpickling continulets (*)
 
-At any point in time, a chain of Python-level frames corresponds to a
-chain of interpreter-level frames (e.g. C frames in pypy-c), where each
-single Python-level frame corresponds to one or a few interpreter-level
-frames - depending on the length of the interpreter-level call chain
-from one bytecode evaluation loop to the next (recursively invoked) one.
+* Continuing execution of a continulet in a different thread (*)
 
-This means that it is not sufficient to simply create a chain of Python
-frame objects in the heap of a process before we can resume execution of
-these newly built frames.  We must recreate a corresponding chain of
-interpreter-level frames.  To this end, we have inserted a few *named
-resume points* (see 3.2.4, in `D07.1 Massive Parallelism and Translation Aspects`_) in the Python interpreter of PyPy.  This is the
-motivation for implementing the interpreter-level primitives
-``resume_state_create()`` and ``resume_state_invoke()``, the powerful
-interface that allows an RPython program to artificially rebuild a chain
-of calls in a reflective way, completely from scratch, and jump to it.
+* Automatic unlimited stack (must be emulated__ so far)
 
-.. _`D07.1 Massive Parallelism and Translation Aspects`: http://codespeak.net/pypy/extradoc/eu-report/D07.1_Massive_Parallelism_and_Translation_Aspects-2007-02-28.pdf
+* Support for other CPUs than x86 and x86-64
 
-Example
-~~~~~~~
+.. __: `recursion depth limit`_
 
-(See `demo/pickle_coroutine.py`_ for the complete source of this demo.)
+(*) Pickling, as well as changing threads, could be implemented by using
+a "soft" stack switching mode again.  We would get either "hard" or
+"soft" switches, similarly to Stackless Python 3rd version: you get a
+"hard" switch (like now) when the C stack contains non-trivial C frames
+to save, and a "soft" switch (like previously) when it contains only
+simple calls from Python to Python.  Soft-switched continulets would
+also consume a bit less RAM, and the switch might be a bit faster too
+(unsure about that; what is the Stackless Python experience?).
 
-Consider a program which contains a part performing a long-running
-computation::
 
-    def ackermann(x, y):
-        if x == 0:
-            return y + 1
-        if y == 0:
-            return ackermann(x - 1, 1)
-        return ackermann(x - 1, ackermann(x, y - 1))
+Recursion depth limit
++++++++++++++++++++++
 
-By using pickling, we can save the state of the computation while it is
-running, for the purpose of restoring it later and continuing the
-computation at another time or on a different machine.  However,
-pickling does not produce a whole-program dump: it can only pickle
-individual coroutines.  This means that the computation should be
-started in its own coroutine::
+You can use continulets to emulate the infinite recursion depth present
+in Stackless Python and in stackless-enabled older versions of PyPy.
 
-    # Make a coroutine that will run 'ackermann(3, 8)'
-    coro = coroutine()
-    coro.bind(ackermann, 3, 8)
+The trick is to start a continulet "early", i.e. when the recursion
+depth is very low, and switch to it "later", i.e. when the recursion
+depth is high.  Example::
 
-    # Now start running the coroutine
-    result = coro.switch()
+    from _continuation import continulet
 
-The coroutine itself must switch back to the main program when it needs
-to be interrupted (we can only pickle suspended coroutines).  Due to
-current limitations this requires an explicit check in the
-``ackermann()`` function::
+    def invoke(_, callable, arg):
+        return callable(arg)
 
-    def ackermann(x, y):
-        if interrupt_flag:      # test a global flag
-            main.switch()       # and switch back to 'main' if it is set
-        if x == 0:
-            return y + 1
-        if y == 0:
-            return ackermann(x - 1, 1)
-        return ackermann(x - 1, ackermann(x, y - 1))
+    def bootstrap(c):
+        # this loop runs forever, at a very low recursion depth
+        callable, arg = c.switch()
+        while True:
+            # start a new continulet from here, and switch to
+            # it using an "exchange", i.e. a switch with to=.
+            to = continulet(invoke, callable, arg)
+            callable, arg = c.switch(to=to)
 
-The global ``interrupt_flag`` would be set for example by a timeout, or
-by a signal handler reacting to Ctrl-C, etc.  It causes the coroutine to
-transfer control back to the main program.  The execution comes back
-just after the line ``coro.switch()``, where we can pickle the coroutine
-if necessary::
+    c = continulet(bootstrap)
+    c.switch()
 
-    if not coro.is_alive:
-        print "finished; the result is:", result
-    else:
-        # save the state of the suspended coroutine
-        f = open('demo.pickle', 'w')
-        pickle.dump(coro, f)
-        f.close()
 
-The process can then stop.  At any later time, or on another machine,
-we can reload the file and restart the coroutine with::
+    def recursive(n):
+        if n == 0:
+            return ("ok", n)
+        if n % 200 == 0:
+            prev = c.switch((recursive, n - 1))
+        else:
+            prev = recursive(n - 1)
+        return (prev[0], prev[1] + 1)
 
-    f = open('demo.pickle', 'r')
-    coro = pickle.load(f)
-    f.close()
-    result = coro.switch()
+    print recursive(999999)     # prints ('ok', 999999)
 
-Limitations
-~~~~~~~~~~~
+Note that if you press Ctrl-C while running this example, the traceback
+will be built with *all* recursive() calls so far, even if this is more
+than the number that can possibly fit in the C stack.  These frames are
+"overlapping" each other in the sense of the C stack; more precisely,
+they are copied out of and into the C stack as needed.
 
-Coroutine pickling is subject to some limitations.  First of all, it is
-not a whole-program "memory dump".  It means that only the "local" state
-of a coroutine is saved.  The local state is defined to include the
-chain of calls and the local variables, but not for example the value of
-any global variable.
+(The example above also makes use of the following general "guideline"
+to help newcomers write continulets: in ``bootstrap(c)``, only call
+methods on ``c``, not on another continulet object.  That's why we wrote
+``c.switch(to=to)`` and not ``to.switch()``, which would mess up the
+state.  This is however just a guideline; in general we would recommend
+to use other interfaces like genlets and greenlets.)
 
-As in normal Python, the pickle will not include any function object's
-code, any class definition, etc., but only references to functions and
-classes.  Unlike normal Python, the pickle contains frames.  A pickled
-frame stores a bytecode index, representing the current execution
-position.  This means that the user program cannot be modified *at all*
-between pickling and unpickling!
 
-On the other hand, the pickled data is fairly independent from the
-platform and from the PyPy version.
-
-Pickling/unpickling fails if the coroutine is suspended in a state that
-involves Python frames which were *indirectly* called.  To define this
-more precisely, a Python function can issue a regular function or method
-call to invoke another Python function - this is a *direct* call and can
-be pickled and unpickled.  But there are many ways to invoke a Python
-function indirectly.  For example, most operators can invoke a special
-method ``__xyz__()`` on a class, various built-in functions can call
-back Python functions, signals can invoke signal handlers, and so on.
-These cases are not supported yet.
-
-
-Composability
-+++++++++++++
+Theory of composability
++++++++++++++++++++++++
 
 Although the concept of coroutines is far from new, they have not been
 generally integrated into mainstream languages, or only in limited form
 (like generators in Python and iterators in C#).  We can argue that a
 possible reason for that is that they do not scale well when a program's
 complexity increases: they look attractive in small examples, but the
-models that require explicit switching, by naming the target coroutine,
-do not compose naturally.  This means that a program that uses
-coroutines for two unrelated purposes may run into conflicts caused by
-unexpected interactions.
+models that require explicit switching, for example by naming the target
+coroutine, do not compose naturally.  This means that a program that
+uses coroutines for two unrelated purposes may run into conflicts caused
+by unexpected interactions.
 
 To illustrate the problem, consider the following example (simplified
-code; see the full source in
-`pypy/module/_stackless/test/test_composable_coroutine.py`_).  First, a
-simple usage of coroutine::
+code using a theorical ``coroutine`` class).  First, a simple usage of
+coroutine::
 
     main_coro = coroutine.getcurrent()    # the main (outer) coroutine
     data = []
@@ -530,74 +378,35 @@
 main coroutine, which confuses the ``generator_iterator.next()`` method
 (it gets resumed, but not as a result of a call to ``Yield()``).
 
-As part of trying to combine multiple different paradigms into a single
-application-level module, we have built a way to solve this problem.
-The idea is to avoid the notion of a single, global "main" coroutine (or
-a single main greenlet, or a single main tasklet).  Instead, each
-conceptually separated user of one of these concurrency interfaces can
-create its own "view" on what the main coroutine/greenlet/tasklet is,
-which other coroutine/greenlet/tasklets there are, and which of these is
-the currently running one.  Each "view" is orthogonal to the others.  In
-particular, each view has one (and exactly one) "current"
-coroutine/greenlet/tasklet at any point in time.  When the user switches
-to a coroutine/greenlet/tasklet, it implicitly means that he wants to
-switch away from the current coroutine/greenlet/tasklet *that belongs to
-the same view as the target*.
+Thus the notion of coroutine is *not composable*.  By opposition, the
+primitive notion of continulets is composable: if you build two
+different interfaces on top of it, or have a program that uses twice the
+same interface in two parts, then assuming that both parts independently
+work, the composition of the two parts still works.
 
-The precise application-level interface has not been fixed yet; so far,
-"views" in the above sense are objects of the type
-``stackless.usercostate``.  The above two examples can be rewritten in
-the following way::
+A full proof of that claim would require careful definitions, but let us
+just claim that this fact is true because of the following observation:
+the API of continulets is such that, when doing a ``switch()``, it
+requires the program to have some continulet to explicitly operate on.
+It shuffles the current continuation with the continuation stored in
+that continulet, but has no effect outside.  So if a part of a program
+has a continulet object, and does not expose it as a global, then the
+rest of the program cannot accidentally influence the continuation
+stored in that continulet object.
 
-    producer_view = stackless.usercostate()   # a local view
-    main_coro = producer_view.getcurrent()    # the main (outer) coroutine
-    ...
-    producer_coro = producer_view.newcoroutine()
-    ...
-
-and::
-
-    generators_view = stackless.usercostate()
-
-    def generator(f):
-        def wrappedfunc(*args, **kwds):
-            g = generators_view.newcoroutine(generator_iterator)
-            ...
-
-            ...generators_view.getcurrent()...
-
-Then the composition ``grab_values()`` works as expected, because the
-two views are independent.  The coroutine captured as ``self.caller`` in
-the ``generator_iterator.next()`` method is the main coroutine of the
-``generators_view``.  It is no longer the same object as the main
-coroutine of the ``producer_view``, so when ``data_producer()`` issues
-the following command::
-
-    main_coro.switch()
-
-the control flow cannot accidentally jump back to
-``generator_iterator.next()``.  In other words, from the point of view
-of ``producer_view``, the function ``grab_next_value()`` always runs in
-its main coroutine ``main_coro`` and the function ``data_producer`` in
-its coroutine ``producer_coro``.  This is the case independently of
-which ``generators_view``-based coroutine is the current one when
-``grab_next_value()`` is called.
-
-Only code that has explicit access to the ``producer_view`` or its
-coroutine objects can perform switches that are relevant for the
-generator code.  If the view object and the coroutine objects that share
-this view are all properly encapsulated inside the generator logic, no
-external code can accidentally temper with the expected control flow any
-longer.
-
-In conclusion: we will probably change the app-level interface of PyPy's
-stackless module in the future to not expose coroutines and greenlets at
-all, but only views.  They are not much more difficult to use, and they
-scale automatically to larger programs.
+In other words, if we regard the continulet object as being essentially
+a modifiable ``f_back``, then it is just a link between the frame of
+``callable()`` and the parent frame --- and it cannot be arbitrarily
+changed by unrelated code, as long as they don't explicitly manipulate
+the continulet object.  Typically, both the frame of ``callable()``
+(commonly a local function) and its parent frame (which is the frame
+that switched to it) belong to the same class or module; so from that
+point of view the continulet is a purely local link between two local
+frames.  It doesn't make sense to have a concept that allows this link
+to be manipulated from outside.
 
 
 .. _`Stackless Python`: http://www.stackless.com
 .. _`documentation of the greenlets`: http://packages.python.org/greenlet/
-.. _`Stackless Transform`: translation.html#the-stackless-transform
 
 .. include:: _ref.txt
diff --git a/pypy/doc/translation.rst b/pypy/doc/translation.rst
--- a/pypy/doc/translation.rst
+++ b/pypy/doc/translation.rst
@@ -552,14 +552,15 @@
 
 The stackless transform converts functions into a form that knows how
 to save the execution point and active variables into a heap structure
-and resume execution at that point.  This is used to implement
+and resume execution at that point.  This was used to implement
 coroutines as an RPython-level feature, which in turn are used to
-implement `coroutines, greenlets and tasklets`_ as an application
+implement coroutines, greenlets and tasklets as an application
 level feature for the Standard Interpreter.
 
-Enable the stackless transformation with :config:`translation.stackless`.
+The stackless transformation has been deprecated and is no longer
+available in trunk.  It has been replaced with continulets_.
 
-.. _`coroutines, greenlets and tasklets`: stackless.html
+.. _continulets: stackless.html
 
 .. _`preparing the graphs for source generation`:
 
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -8,13 +8,13 @@
 from pypy.interpreter.miscutils import ThreadLocals
 from pypy.tool.cache import Cache
 from pypy.tool.uid import HUGEVAL_BYTES
-from pypy.rlib.objectmodel import we_are_translated
+from pypy.rlib.objectmodel import we_are_translated, newlist
 from pypy.rlib.debug import make_sure_not_resized
 from pypy.rlib.timer import DummyTimer, Timer
 from pypy.rlib.rarithmetic import r_uint
 from pypy.rlib import jit
 from pypy.tool.sourcetools import func_with_new_name
-import os, sys, py
+import os, sys
 
 __all__ = ['ObjSpace', 'OperationError', 'Wrappable', 'W_Root']
 
@@ -111,6 +111,9 @@
     def setslotvalue(self, index, w_val):
         raise NotImplementedError
 
+    def delslotvalue(self, index):
+        raise NotImplementedError
+
     def descr_call_mismatch(self, space, opname, RequiredClass, args):
         if RequiredClass is None:
             classname = '?'
@@ -623,9 +626,9 @@
             self.default_compiler = compiler
             return compiler
 
-    def createframe(self, code, w_globals, closure=None):
+    def createframe(self, code, w_globals, outer_func=None):
         "Create an empty PyFrame suitable for this code object."
-        return self.FrameClass(self, code, w_globals, closure)
+        return self.FrameClass(self, code, w_globals, outer_func)
 
     def allocate_lock(self):
         """Return an interp-level Lock object if threads are enabled,
@@ -754,7 +757,18 @@
         w_iterator = self.iter(w_iterable)
         # If we know the expected length we can preallocate.
         if expected_length == -1:
-            items = []
+            try:
+                lgt_estimate = self.len_w(w_iterable)
+            except OperationError, o:
+                if (not o.match(self, self.w_AttributeError) and
+                    not o.match(self, self.w_TypeError)):
+                    raise
+                items = []
+            else:
+                try:
+                    items = newlist(lgt_estimate)
+                except MemoryError:
+                    items = [] # it might have lied
         else:
             items = [None] * expected_length
         idx = 0
diff --git a/pypy/interpreter/executioncontext.py b/pypy/interpreter/executioncontext.py
--- a/pypy/interpreter/executioncontext.py
+++ b/pypy/interpreter/executioncontext.py
@@ -81,58 +81,6 @@
 
     # ________________________________________________________________
 
-
-    class Subcontext(object):
-        # coroutine: subcontext support
-
-        def __init__(self):
-            self.topframe = None
-            self.w_tracefunc = None
-            self.profilefunc = None
-            self.w_profilefuncarg = None
-            self.is_tracing = 0
-
-        def enter(self, ec):
-            ec.topframeref = jit.non_virtual_ref(self.topframe)
-            ec.w_tracefunc = self.w_tracefunc
-            ec.profilefunc = self.profilefunc
-            ec.w_profilefuncarg = self.w_profilefuncarg
-            ec.is_tracing = self.is_tracing
-            ec.space.frame_trace_action.fire()
-
-        def leave(self, ec):
-            self.topframe = ec.gettopframe()
-            self.w_tracefunc = ec.w_tracefunc
-            self.profilefunc = ec.profilefunc
-            self.w_profilefuncarg = ec.w_profilefuncarg
-            self.is_tracing = ec.is_tracing
-
-        def clear_framestack(self):
-            self.topframe = None
-
-        # the following interface is for pickling and unpickling
-        def getstate(self, space):
-            if self.topframe is None:
-                return space.w_None
-            return self.topframe
-
-        def setstate(self, space, w_state):
-            from pypy.interpreter.pyframe import PyFrame
-            if space.is_w(w_state, space.w_None):
-                self.topframe = None
-            else:
-                self.topframe = space.interp_w(PyFrame, w_state)
-
-        def getframestack(self):
-            lst = []
-            f = self.topframe
-            while f is not None:
-                lst.append(f)
-                f = f.f_backref()
-            lst.reverse()
-            return lst
-        # coroutine: I think this is all, folks!
-
     def c_call_trace(self, frame, w_func, args=None):
         "Profile the call of a builtin function"
         self._c_call_return_trace(frame, w_func, args, 'c_call')
diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py
--- a/pypy/interpreter/function.py
+++ b/pypy/interpreter/function.py
@@ -30,7 +30,7 @@
     can_change_code = True
     _immutable_fields_ = ['code?',
                           'w_func_globals?',
-                          'closure?',
+                          'closure?[*]',
                           'defs_w?[*]',
                           'name?']
 
@@ -96,7 +96,7 @@
             assert isinstance(code, PyCode)
             if nargs < 5:
                 new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
                 for i in funccallunrolling:
                     if i < nargs:
                         new_frame.locals_stack_w[i] = args_w[i]
@@ -156,7 +156,7 @@
     def _flat_pycall(self, code, nargs, frame):
         # code is a PyCode
         new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
             new_frame.locals_stack_w[i] = w_arg
@@ -167,7 +167,7 @@
     def _flat_pycall_defaults(self, code, nargs, frame, defs_to_load):
         # code is a PyCode
         new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
             new_frame.locals_stack_w[i] = w_arg
@@ -242,8 +242,10 @@
             # we have been seen by other means so rtyping should not choke
             # on us
             identifier = self.code.identifier
-            assert Function._all.get(identifier, self) is self, ("duplicate "
-                                                                 "function ids")
+            previous = Function._all.get(identifier, self)
+            assert previous is self, (
+                "duplicate function ids with identifier=%r: %r and %r" % (
+                identifier, previous, self))
             self.add_to_table()
         return False
 
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -64,7 +64,7 @@
                 self.visit_self(el[1], *args)
             else:
                 self.visit_function(el, *args)
-        else:
+        elif isinstance(el, type):
             for typ in self.bases_order:
                 if issubclass(el, typ):
                     visit = getattr(self, "visit__%s" % (typ.__name__,))
@@ -73,6 +73,8 @@
             else:
                 raise Exception("%s: no match for unwrap_spec element %s" % (
                     self.__class__.__name__, el))
+        else:
+            raise Exception("unable to dispatch, %s, perhaps your parameter should have started with w_?" % el)
 
     def apply_over(self, unwrap_spec, *extra):
         dispatch = self.dispatch
diff --git a/pypy/interpreter/miscutils.py b/pypy/interpreter/miscutils.py
--- a/pypy/interpreter/miscutils.py
+++ b/pypy/interpreter/miscutils.py
@@ -167,3 +167,7 @@
 
     def getmainthreadvalue(self):
         return self._value
+
+    def getallvalues(self):
+        return {0: self._value}
+
diff --git a/pypy/interpreter/nestedscope.py b/pypy/interpreter/nestedscope.py
--- a/pypy/interpreter/nestedscope.py
+++ b/pypy/interpreter/nestedscope.py
@@ -8,7 +8,7 @@
 
 class Cell(Wrappable):
     "A simple container for a wrapped value."
-    
+
     def __init__(self, w_value=None):
         self.w_value = w_value
 
@@ -90,32 +90,33 @@
     #     variables coming from a parent function in which i'm nested
     # 'closure' is a list of Cell instances: the received free vars.
 
-    cells = None
-
     @jit.unroll_safe
-    def initialize_frame_scopes(self, closure, code):
-        super_initialize_frame_scopes(self, closure, code)
+    def initialize_frame_scopes(self, outer_func, code):
+        super_initialize_frame_scopes(self, outer_func, code)
         ncellvars = len(code.co_cellvars)
         nfreevars = len(code.co_freevars)
         if not nfreevars:
             if not ncellvars:
+                self.cells = []
                 return            # no self.cells needed - fast path
-            if closure is None:
-                closure = []
-        elif closure is None:
+        elif outer_func is None:
             space = self.space
             raise OperationError(space.w_TypeError,
                                  space.wrap("directly executed code object "
                                             "may not contain free variables"))
-        if len(closure) != nfreevars:
+        if outer_func and outer_func.closure:
+            closure_size = len(outer_func.closure)
+        else:
+            closure_size = 0
+        if closure_size != nfreevars:
             raise ValueError("code object received a closure with "
                                  "an unexpected number of free variables")
         self.cells = [None] * (ncellvars + nfreevars)
         for i in range(ncellvars):
             self.cells[i] = Cell()
         for i in range(nfreevars):
-            self.cells[i + ncellvars] = closure[i]
-    
+            self.cells[i + ncellvars] = outer_func.closure[i]
+
     def _getcells(self):
         return self.cells
 
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -10,7 +10,7 @@
 from pypy.interpreter.argument import Signature
 from pypy.interpreter.error import OperationError
 from pypy.interpreter.gateway import NoneNotWrapped, unwrap_spec
-from pypy.interpreter.astcompiler.consts import (CO_OPTIMIZED,
+from pypy.interpreter.astcompiler.consts import (
     CO_OPTIMIZED, CO_NEWLOCALS, CO_VARARGS, CO_VARKEYWORDS, CO_NESTED,
     CO_GENERATOR, CO_CONTAINSGLOBALS)
 from pypy.rlib.rarithmetic import intmask
@@ -198,7 +198,7 @@
 
     def funcrun(self, func, args):
         frame = self.space.createframe(self, func.w_func_globals,
-                                  func.closure)
+                                  func)
         sig = self._signature
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
@@ -211,7 +211,7 @@
 
     def funcrun_obj(self, func, w_obj, args):
         frame = self.space.createframe(self, func.w_func_globals,
-                                  func.closure)
+                                  func)
         sig = self._signature
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
diff --git a/pypy/interpreter/pyframe.py b/pypy/interpreter/pyframe.py
--- a/pypy/interpreter/pyframe.py
+++ b/pypy/interpreter/pyframe.py
@@ -51,7 +51,10 @@
     is_being_profiled        = False
     escaped                  = False  # see mark_as_escaped()
 
-    def __init__(self, space, code, w_globals, closure):
+    def __init__(self, space, code, w_globals, outer_func):
+        if not we_are_translated():
+            assert type(self) in (space.FrameClass, CPythonFrame), (
+                "use space.FrameClass(), not directly PyFrame()")
         self = hint(self, access_directly=True, fresh_virtualizable=True)
         assert isinstance(code, pycode.PyCode)
         self.pycode = code
@@ -63,11 +66,11 @@
         make_sure_not_resized(self.locals_stack_w)
         check_nonneg(self.nlocals)
         #
-        if space.config.objspace.honor__builtins__:
+        if space.config.objspace.honor__builtins__ and w_globals is not None:
             self.builtin = space.builtin.pick_builtin(w_globals)
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
         # class bodies only have CO_NEWLOCALS.
-        self.initialize_frame_scopes(closure, code)
+        self.initialize_frame_scopes(outer_func, code)
         self.f_lineno = code.co_firstlineno
 
     def mark_as_escaped(self):
@@ -80,7 +83,7 @@
         self.escaped = True
 
     def append_block(self, block):
-        block.previous = self.lastblock
+        assert block.previous is self.lastblock
         self.lastblock = block
 
     def pop_block(self):
@@ -106,15 +109,16 @@
         while i >= 0:
             block = lst[i]
             i -= 1
-            self.append_block(block)
+            block.previous = self.lastblock
+            self.lastblock = block
 
     def get_builtin(self):
         if self.space.config.objspace.honor__builtins__:
             return self.builtin
         else:
             return self.space.builtin
-        
-    def initialize_frame_scopes(self, closure, code): 
+
+    def initialize_frame_scopes(self, outer_func, code):
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
         # class bodies only have CO_NEWLOCALS.
         # CO_NEWLOCALS: make a locals dict unless optimized is also set
@@ -381,7 +385,11 @@
         
         # do not use the instance's __init__ but the base's, because we set
         # everything like cells from here
-        PyFrame.__init__(self, space, pycode, w_globals, closure)
+        # XXX hack
+        from pypy.interpreter.function import Function
+        outer_func = Function(space, None, closure=closure,
+                             forcename="fake")
+        PyFrame.__init__(self, space, pycode, w_globals, outer_func)
         f_back = space.interp_w(PyFrame, w_f_back, can_be_None=True)
         new_frame.f_backref = jit.non_virtual_ref(f_back)
 
@@ -606,7 +614,8 @@
         return self.get_builtin().getdict(space)
 
     def fget_f_back(self, space):
-        return self.space.wrap(self.f_backref())
+        f_backref = ExecutionContext.getnextframe_nohidden(self)
+        return self.space.wrap(f_backref)
 
     def fget_f_lasti(self, space):
         return self.space.wrap(self.last_instr)
diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py
--- a/pypy/interpreter/pyopcode.py
+++ b/pypy/interpreter/pyopcode.py
@@ -892,16 +892,16 @@
         raise BytecodeCorruption, "old opcode, no longer in use"
 
     def SETUP_LOOP(self, offsettoend, next_instr):
-        block = LoopBlock(self, next_instr + offsettoend)
-        self.append_block(block)
+        block = LoopBlock(self, next_instr + offsettoend, self.lastblock)
+        self.lastblock = block
 
     def SETUP_EXCEPT(self, offsettoend, next_instr):
-        block = ExceptBlock(self, next_instr + offsettoend)
-        self.append_block(block)
+        block = ExceptBlock(self, next_instr + offsettoend, self.lastblock)
+        self.lastblock = block
 
     def SETUP_FINALLY(self, offsettoend, next_instr):
-        block = FinallyBlock(self, next_instr + offsettoend)
-        self.append_block(block)
+        block = FinallyBlock(self, next_instr + offsettoend, self.lastblock)
+        self.lastblock = block
 
     def SETUP_WITH(self, offsettoend, next_instr):
         w_manager = self.peekvalue()
@@ -915,8 +915,8 @@
         w_exit = self.space.get(w_descr, w_manager)
         self.settopvalue(w_exit)
         w_result = self.space.get_and_call_function(w_enter, w_manager)
-        block = WithBlock(self, next_instr + offsettoend)
-        self.append_block(block)
+        block = WithBlock(self, next_instr + offsettoend, self.lastblock)
+        self.lastblock = block
         self.pushvalue(w_result)
 
     def WITH_CLEANUP(self, oparg, next_instr):
@@ -1247,10 +1247,10 @@
 
     _immutable_ = True
 
-    def __init__(self, frame, handlerposition):
+    def __init__(self, frame, handlerposition, previous):
         self.handlerposition = handlerposition
         self.valuestackdepth = frame.valuestackdepth
-        self.previous = None # this makes a linked list of blocks
+        self.previous = previous   # this makes a linked list of blocks
 
     def __eq__(self, other):
         return (self.__class__ is other.__class__ and
@@ -1523,10 +1523,8 @@
 
         if not isinstance(prog, codetype):
             filename = '<string>'
-            if not isinstance(prog, str):
-                if isinstance(prog, basestring):
-                    prog = str(prog)
-                elif isinstance(prog, file):
+            if not isinstance(prog, basestring):
+                if isinstance(prog, file):
                     filename = prog.name
                     prog = prog.read()
                 else:
diff --git a/pypy/interpreter/pyparser/future.py b/pypy/interpreter/pyparser/future.py
--- a/pypy/interpreter/pyparser/future.py
+++ b/pypy/interpreter/pyparser/future.py
@@ -109,25 +109,19 @@
             self.getc() == self.getc(+2)):
             self.pos += 3
             while 1: # Deal with a triple quoted docstring
-                if self.getc() == '\\':
-                    self.pos += 2
+                c = self.getc()
+                if c == '\\':
+                    self.pos += 1
+                    self._skip_next_char_from_docstring()
+                elif c != endchar:
+                    self._skip_next_char_from_docstring()
                 else:
-                    c = self.getc()
-                    if c != endchar:
-                        self.pos += 1
-                        if c == '\n':
-                            self.atbol()
-                        elif c == '\r':
-                            if self.getc() == '\n':
-                                self.pos += 1
-                                self.atbol()
-                    else:
-                        self.pos += 1
-                        if (self.getc() == endchar and
-                            self.getc(+1) == endchar):
-                            self.pos += 2
-                            self.consume_empty_line()
-                            break
+                    self.pos += 1
+                    if (self.getc() == endchar and
+                        self.getc(+1) == endchar):
+                        self.pos += 2
+                        self.consume_empty_line()
+                        break
 
         else: # Deal with a single quoted docstring
             self.pos += 1
@@ -138,17 +132,21 @@
                     self.consume_empty_line()
                     return
                 elif c == '\\':
-                    # Deal with linefeeds
-                    if self.getc() != '\r':
-                        self.pos += 1
-                    else:
-                        self.pos += 1
-                        if self.getc() == '\n':
-                            self.pos += 1
+                    self._skip_next_char_from_docstring()
                 elif c in '\r\n':
                     # Syntax error
                     return
 
+    def _skip_next_char_from_docstring(self):
+        c = self.getc()
+        self.pos += 1
+        if c == '\n':
+            self.atbol()
+        elif c == '\r':
+            if self.getc() == '\n':
+                self.pos += 1
+            self.atbol()
+
     def consume_continuation(self):
         c = self.getc()
         if c in '\n\r':
diff --git a/pypy/interpreter/pyparser/test/test_futureautomaton.py b/pypy/interpreter/pyparser/test/test_futureautomaton.py
--- a/pypy/interpreter/pyparser/test/test_futureautomaton.py
+++ b/pypy/interpreter/pyparser/test/test_futureautomaton.py
@@ -221,6 +221,14 @@
     assert f.lineno == 3
     assert f.col_offset == 0
 
+def test_lots_of_continuation_lines():
+    s = "\\\n\\\n\\\n\\\n\\\n\\\n\nfrom __future__ import with_statement\n"
+    f = run(s)
+    assert f.pos == len(s)
+    assert f.flags == fut.CO_FUTURE_WITH_STATEMENT
+    assert f.lineno == 8
+    assert f.col_offset == 0
+
 # This looks like a bug in cpython parser
 # and would require extensive modifications
 # to future.py in order to emulate the same behaviour
@@ -239,3 +247,19 @@
         raise AssertionError('IndentationError not raised')
     assert f.lineno == 2
     assert f.col_offset == 0
+
+def test_continuation_lines_in_docstring_single_quoted():
+    s = '"\\\n\\\n\\\n\\\n\\\n\\\n"\nfrom  __future__ import division\n'
+    f = run(s)
+    assert f.pos == len(s)
+    assert f.flags == fut.CO_FUTURE_DIVISION
+    assert f.lineno == 8
+    assert f.col_offset == 0
+
+def test_continuation_lines_in_docstring_triple_quoted():
+    s = '"""\\\n\\\n\\\n\\\n\\\n\\\n"""\nfrom  __future__ import division\n'
+    f = run(s)
+    assert f.pos == len(s)
+    assert f.flags == fut.CO_FUTURE_DIVISION
+    assert f.lineno == 8
+    assert f.col_offset == 0
diff --git a/pypy/interpreter/test/test_exec.py b/pypy/interpreter/test/test_exec.py
--- a/pypy/interpreter/test/test_exec.py
+++ b/pypy/interpreter/test/test_exec.py
@@ -219,3 +219,30 @@
             raise e
 
         assert res == 1
+
+    def test_exec_unicode(self):
+        # 's' is a string
+        s = "x = u'\xd0\xb9\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd'"
+        # 'u' is a unicode
+        u = s.decode('utf-8')
+        exec u
+        assert len(x) == 6
+        assert ord(x[0]) == 0x0439
+        assert ord(x[1]) == 0x0446
+        assert ord(x[2]) == 0x0443
+        assert ord(x[3]) == 0x043a
+        assert ord(x[4]) == 0x0435
+        assert ord(x[5]) == 0x043d
+
+    def test_eval_unicode(self):
+        u = "u'%s'" % unichr(0x1234)
+        v = eval(u)
+        assert v == unichr(0x1234)
+
+    def test_compile_unicode(self):
+        s = "x = u'\xd0\xb9\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd'"
+        u = s.decode('utf-8')
+        c = compile(u, '<input>', 'exec')
+        exec c
+        assert len(x) == 6
+        assert ord(x[0]) == 0x0439
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -704,7 +704,7 @@
 class TestPassThroughArguments_CALL_METHOD(TestPassThroughArguments):
 
     def setup_class(cls):
-        space = gettestobjspace(usemodules=('_stackless',), **{
+        space = gettestobjspace(usemodules=('itertools',), **{
             "objspace.opcodes.CALL_METHOD": True
             })
         cls.space = space
diff --git a/pypy/interpreter/test/test_objspace.py b/pypy/interpreter/test/test_objspace.py
--- a/pypy/interpreter/test/test_objspace.py
+++ b/pypy/interpreter/test/test_objspace.py
@@ -71,6 +71,23 @@
         assert err.value.match(space, space.w_ValueError)
         err = raises(OperationError, space.unpackiterable, w_l, 5)
         assert err.value.match(space, space.w_ValueError)
+        w_a = space.appexec((), """():
+        class A(object):
+            def __iter__(self):
+                return self
+            def next(self):
+                raise StopIteration
+            def __len__(self):
+                1/0
+        return A()
+        """)
+        try:
+            space.unpackiterable(w_a)
+        except OperationError, o:
+            if not o.match(space, space.w_ZeroDivisionError):
+                raise Exception("DID NOT RAISE")
+        else:
+            raise Exception("DID NOT RAISE")
 
     def test_fixedview(self):
         space = self.space
diff --git a/pypy/interpreter/test/test_pyframe.py b/pypy/interpreter/test/test_pyframe.py
--- a/pypy/interpreter/test/test_pyframe.py
+++ b/pypy/interpreter/test/test_pyframe.py
@@ -1,4 +1,5 @@
 from pypy.tool import udir
+from pypy.conftest import option
 
 
 class AppTestPyFrame:
@@ -6,6 +7,15 @@
     def setup_class(cls):
         cls.w_udir = cls.space.wrap(str(udir.udir))
         cls.w_tempfile1 = cls.space.wrap(str(udir.udir.join('tempfile1')))
+        if not option.runappdirect:
+            w_call_further = cls.space.appexec([], """():
+                def call_further(f):
+                    return f()
+                return call_further
+            """)
+            assert not w_call_further.code.hidden_applevel
+            w_call_further.code.hidden_applevel = True       # hack
+            cls.w_call_further = w_call_further
 
     # test for the presence of the attributes, not functionality
 
@@ -107,6 +117,22 @@
         frame = f()
         assert frame.f_back.f_code.co_name == 'f'
 
+    def test_f_back_hidden(self):
+        if not hasattr(self, 'call_further'):
+            skip("not for runappdirect testing")
+        import sys
+        def f():
+            return (sys._getframe(0),
+                    sys._getframe(1),
+                    sys._getframe(0).f_back)
+        def main():
+            return self.call_further(f)
+        f0, f1, f1bis = main()
+        assert f0.f_code.co_name == 'f'
+        assert f1.f_code.co_name == 'main'
+        assert f1bis is f1
+        assert f0.f_back is f1
+
     def test_f_exc_xxx(self):
         import sys
 
diff --git a/pypy/interpreter/typedef.py b/pypy/interpreter/typedef.py
--- a/pypy/interpreter/typedef.py
+++ b/pypy/interpreter/typedef.py
@@ -258,6 +258,11 @@
                     self.slots_w = [None] * nslots
             def setslotvalue(self, index, w_value):
                 self.slots_w[index] = w_value
+            def delslotvalue(self, index):
+                if self.slots_w[index] is None:
+                    return False
+                self.slots_w[index] = None
+                return True
             def getslotvalue(self, index):
                 return self.slots_w[index]
         add(Proto)
@@ -530,11 +535,10 @@
         """member.__delete__(obj)
         Delete the value of the slot 'member' from the given 'obj'."""
         self.typecheck(space, w_obj)
-        w_oldresult = w_obj.getslotvalue(self.index)
-        if w_oldresult is None:
+        success = w_obj.delslotvalue(self.index)
+        if not success:
             raise OperationError(space.w_AttributeError,
                                  space.wrap(self.name)) # XXX better message
-        w_obj.setslotvalue(self.index, None)
 
 Member.typedef = TypeDef(
     "member_descriptor",
diff --git a/pypy/jit/backend/llgraph/llimpl.py b/pypy/jit/backend/llgraph/llimpl.py
--- a/pypy/jit/backend/llgraph/llimpl.py
+++ b/pypy/jit/backend/llgraph/llimpl.py
@@ -57,6 +57,12 @@
     else:
         return LLSupport.from_rstr(s)
 
+FLOAT_ARRAY_TP = lltype.Ptr(lltype.Array(lltype.Float, hints={"nolength": True}))
+def maybe_uncast(TP, array):
+    if array._TYPE.TO._hints.get("uncast_on_llgraph"):
+        array = rffi.cast(TP, array)
+    return array
+
 # a list of argtypes of all operations - couldn't find any and it's
 # very useful.  Note however that the table is half-broken here and
 # there, in ways that are sometimes a bit hard to fix; that's why
@@ -1071,13 +1077,15 @@
         return heaptracker.adr2int(llmemory.cast_ptr_to_adr(x))
     if TP == llmemory.Address:
         return heaptracker.adr2int(x)
+    if TP is lltype.SingleFloat:
+        return longlong.singlefloat2int(x)
     return lltype.cast_primitive(lltype.Signed, x)
 
 def cast_from_int(TYPE, x):
     if isinstance(TYPE, lltype.Ptr):
         if isinstance(x, (int, long, llmemory.AddressAsInt)):
             x = llmemory.cast_int_to_adr(x)
-        if TYPE is rffi.VOIDP:
+        if TYPE is rffi.VOIDP or TYPE.TO._hints.get("uncast_on_llgraph"):
             # assume that we want a "C-style" cast, without typechecking the value
             return rffi.cast(TYPE, x)
         return llmemory.cast_adr_to_ptr(x, TYPE)
@@ -1086,6 +1094,9 @@
             x = llmemory.cast_int_to_adr(x)
         assert lltype.typeOf(x) == llmemory.Address
         return x
+    elif TYPE is lltype.SingleFloat:
+        assert lltype.typeOf(x) is lltype.Signed
+        return longlong.int2singlefloat(x)
     else:
         if lltype.typeOf(x) == llmemory.Address:
             x = heaptracker.adr2int(x)
@@ -1140,6 +1151,7 @@
     del _future_values[:]
 
 def set_future_value_int(index, value):
+    assert lltype.typeOf(value) is lltype.Signed
     set_future_value_ref(index, value)
 
 def set_future_value_float(index, value):
@@ -1323,8 +1335,8 @@
     return cast_to_floatstorage(array.getitem(index))
 
 def do_getarrayitem_raw_float(array, index):
-    array = array.adr.ptr._obj
-    return cast_to_floatstorage(array.getitem(index))
+    array = maybe_uncast(FLOAT_ARRAY_TP, array.adr.ptr)
+    return cast_to_floatstorage(array._obj.getitem(index))
 
 def do_getarrayitem_gc_ptr(array, index):
     array = array._obj.container
@@ -1386,8 +1398,9 @@
     newvalue = cast_from_floatstorage(ITEMTYPE, newvalue)
     array.setitem(index, newvalue)
 
+
 def do_setarrayitem_raw_float(array, index, newvalue):
-    array = array.adr.ptr
+    array = maybe_uncast(FLOAT_ARRAY_TP, array.adr.ptr)
     ITEMTYPE = lltype.typeOf(array).TO.OF
     newvalue = cast_from_floatstorage(ITEMTYPE, newvalue)
     array._obj.setitem(index, newvalue)
@@ -1488,6 +1501,7 @@
     'i': lltype.Signed,
     'f': lltype.Float,
     'L': lltype.SignedLongLong,
+    'S': lltype.SingleFloat,
     'v': lltype.Void,
     }
 
diff --git a/pypy/jit/backend/llgraph/runner.py b/pypy/jit/backend/llgraph/runner.py
--- a/pypy/jit/backend/llgraph/runner.py
+++ b/pypy/jit/backend/llgraph/runner.py
@@ -25,13 +25,14 @@
 class Descr(history.AbstractDescr):
 
     def __init__(self, ofs, typeinfo, extrainfo=None, name=None,
-                 arg_types=None, count_fields_if_immut=-1):
+                 arg_types=None, count_fields_if_immut=-1, ffi_flags=0):
         self.ofs = ofs
         self.typeinfo = typeinfo
         self.extrainfo = extrainfo
         self.name = name
         self.arg_types = arg_types
         self.count_fields_if_immut = count_fields_if_immut
+        self.ffi_flags = ffi_flags
 
     def get_arg_types(self):
         return self.arg_types
@@ -67,6 +68,9 @@
     def count_fields_if_immutable(self):
         return self.count_fields_if_immut
 
+    def get_ffi_flags(self):
+        return self.ffi_flags
+
     def __lt__(self, other):
         raise TypeError("cannot use comparison on Descrs")
     def __le__(self, other):
@@ -91,6 +95,7 @@
 class BaseCPU(model.AbstractCPU):
     supports_floats = True
     supports_longlong = llimpl.IS_32_BIT
+    supports_singlefloats = True
 
     def __init__(self, rtyper, stats=None, opts=None,
                  translate_support_code=False,
@@ -113,14 +118,14 @@
         return False
 
     def getdescr(self, ofs, typeinfo='?', extrainfo=None, name=None,
-                 arg_types=None, count_fields_if_immut=-1):
+                 arg_types=None, count_fields_if_immut=-1, ffi_flags=0):
         key = (ofs, typeinfo, extrainfo, name, arg_types,
-               count_fields_if_immut)
+               count_fields_if_immut, ffi_flags)
         try:
             return self._descrs[key]
         except KeyError:
             descr = Descr(ofs, typeinfo, extrainfo, name, arg_types,
-                          count_fields_if_immut)
+                          count_fields_if_immut, ffi_flags)
             self._descrs[key] = descr
             return descr
 
@@ -311,7 +316,7 @@
         token = history.getkind(getattr(S, fieldname))
         return self.getdescr(ofs, token[0], name=fieldname)
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         arg_types = []
         for ARG in ARGS:
             token = history.getkind(ARG)
@@ -325,16 +330,21 @@
         return self.getdescr(0, token[0], extrainfo=extrainfo,
                              arg_types=''.join(arg_types))
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo, ffi_flags):
         from pypy.jit.backend.llsupport.ffisupport import get_ffi_type_kind
+        from pypy.jit.backend.llsupport.ffisupport import UnsupportedKind
         arg_types = []
-        for arg in ffi_args:
-            kind = get_ffi_type_kind(arg)
-            if kind != history.VOID:
-                arg_types.append(kind)
-        reskind = get_ffi_type_kind(ffi_result)
+        try:
+            for arg in ffi_args:
+                kind = get_ffi_type_kind(self, arg)
+                if kind != history.VOID:
+                    arg_types.append(kind)
+            reskind = get_ffi_type_kind(self, ffi_result)
+        except UnsupportedKind:
+            return None
         return self.getdescr(0, reskind, extrainfo=extrainfo,
-                             arg_types=''.join(arg_types))
+                             arg_types=''.join(arg_types),
+                             ffi_flags=ffi_flags)
 
 
     def grab_exc_value(self):
@@ -517,7 +527,7 @@
         return FieldDescr.new(T1, fieldname)
 
     @staticmethod
-    def calldescrof(FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(FUNC, ARGS, RESULT, extrainfo):
         return StaticMethDescr.new(FUNC, ARGS, RESULT, extrainfo)
 
     @staticmethod
diff --git a/pypy/jit/backend/llgraph/test/test_llgraph.py b/pypy/jit/backend/llgraph/test/test_llgraph.py
--- a/pypy/jit/backend/llgraph/test/test_llgraph.py
+++ b/pypy/jit/backend/llgraph/test/test_llgraph.py
@@ -19,6 +19,9 @@
     def setup_method(self, _):
         self.cpu = self.cpu_type(None)
 
+    def test_memoryerror(self):
+        py.test.skip("does not make much sense on the llgraph backend")
+
 
 def test_cast_adr_to_int_and_back():
     X = lltype.Struct('X', ('foo', lltype.Signed))
diff --git a/pypy/jit/backend/llsupport/descr.py b/pypy/jit/backend/llsupport/descr.py
--- a/pypy/jit/backend/llsupport/descr.py
+++ b/pypy/jit/backend/llsupport/descr.py
@@ -260,10 +260,12 @@
     _clsname = ''
     loop_token = None
     arg_classes = ''     # <-- annotation hack
+    ffi_flags = 0
 
-    def __init__(self, arg_classes, extrainfo=None):
+    def __init__(self, arg_classes, extrainfo=None, ffi_flags=0):
         self.arg_classes = arg_classes    # string of "r" and "i" (ref/int)
         self.extrainfo = extrainfo
+        self.ffi_flags = ffi_flags
 
     def __repr__(self):
         res = '%s(%s)' % (self.__class__.__name__, self.arg_classes)
@@ -284,6 +286,13 @@
     def get_extra_info(self):
         return self.extrainfo
 
+    def get_ffi_flags(self):
+        return self.ffi_flags
+
+    def get_call_conv(self):
+        from pypy.rlib.clibffi import get_call_conv
+        return get_call_conv(self.ffi_flags, True)
+
     def get_arg_types(self):
         return self.arg_classes
 
@@ -303,6 +312,8 @@
                 c = 'f'
             elif c == 'f' and longlong.supports_longlong:
                 return 'longlong.getrealfloat(%s)' % (process('L'),)
+            elif c == 'S':
+                return 'longlong.int2singlefloat(%s)' % (process('i'),)
             arg = 'args_%s[%d]' % (c, seen[c])
             seen[c] += 1
             return arg
@@ -318,6 +329,8 @@
                 return lltype.Void
             elif arg == 'L':
                 return lltype.SignedLongLong
+            elif arg == 'S':
+                return lltype.SingleFloat
             else:
                 raise AssertionError(arg)
 
@@ -334,6 +347,8 @@
             result = 'rffi.cast(lltype.SignedLongLong, res)'
         elif self.get_return_type() == history.VOID:
             result = 'None'
+        elif self.get_return_type() == 'S':
+            result = 'longlong.singlefloat2int(res)'
         else:
             assert 0
         source = py.code.Source("""
@@ -344,14 +359,15 @@
         """ % locals())
         ARGS = [TYPE(arg) for arg in self.arg_classes]
         FUNC = lltype.FuncType(ARGS, RESULT)
-        d = locals().copy()
-        d.update(globals())
+        d = globals().copy()
+        d.update(locals())
         exec source.compile() in d
         self.call_stub = d['call_stub']
 
     def verify_types(self, args_i, args_r, args_f, return_type):
         assert self._return_type in return_type
-        assert self.arg_classes.count('i') == len(args_i or ())
+        assert (self.arg_classes.count('i') +
+                self.arg_classes.count('S')) == len(args_i or ())
         assert self.arg_classes.count('r') == len(args_r or ())
         assert (self.arg_classes.count('f') +
                 self.arg_classes.count('L')) == len(args_f or ())
@@ -384,8 +400,8 @@
     """
     _clsname = 'DynamicIntCallDescr'
 
-    def __init__(self, arg_classes, result_size, result_sign, extrainfo=None):
-        BaseIntCallDescr.__init__(self, arg_classes, extrainfo)
+    def __init__(self, arg_classes, result_size, result_sign, extrainfo=None, ffi_flags=0):
+        BaseIntCallDescr.__init__(self, arg_classes, extrainfo, ffi_flags)
         assert isinstance(result_sign, bool)
         self._result_size = chr(result_size)
         self._result_sign = result_sign
@@ -428,23 +444,39 @@
     def get_result_size(self, translate_support_code):
         return 0
 
+_SingleFloatCallDescr = None   # built lazily
+
 def getCallDescrClass(RESULT):
     if RESULT is lltype.Void:
         return VoidCallDescr
     if RESULT is lltype.Float:
         return FloatCallDescr
+    if RESULT is lltype.SingleFloat:
+        global _SingleFloatCallDescr
+        if _SingleFloatCallDescr is None:
+            assert rffi.sizeof(rffi.UINT) == rffi.sizeof(RESULT)
+            class SingleFloatCallDescr(getCallDescrClass(rffi.UINT)):
+                _clsname = 'SingleFloatCallDescr'
+                _return_type = 'S'
+            _SingleFloatCallDescr = SingleFloatCallDescr
+        return _SingleFloatCallDescr
     if is_longlong(RESULT):
         return LongLongCallDescr
     return getDescrClass(RESULT, BaseIntCallDescr, GcPtrCallDescr,
                          NonGcPtrCallDescr, 'Call', 'get_result_size',
                          Ellipsis,  # <= floatattrname should not be used here
                          '_is_result_signed')
+getCallDescrClass._annspecialcase_ = 'specialize:memo'
 
 def get_call_descr(gccache, ARGS, RESULT, extrainfo=None):
     arg_classes = []
     for ARG in ARGS:
         kind = getkind(ARG)
-        if   kind == 'int': arg_classes.append('i')
+        if   kind == 'int':
+            if ARG is lltype.SingleFloat:
+                arg_classes.append('S')
+            else:
+                arg_classes.append('i')
         elif kind == 'ref': arg_classes.append('r')
         elif kind == 'float':
             if is_longlong(ARG):
@@ -476,6 +508,9 @@
             return GcPtrDescr
         else:
             return NonGcPtrDescr
+    if TYPE is lltype.SingleFloat:
+        assert rffi.sizeof(rffi.UINT) == rffi.sizeof(TYPE)
+        TYPE = rffi.UINT
     try:
         return _cache[nameprefix, TYPE]
     except KeyError:
diff --git a/pypy/jit/backend/llsupport/ffisupport.py b/pypy/jit/backend/llsupport/ffisupport.py
--- a/pypy/jit/backend/llsupport/ffisupport.py
+++ b/pypy/jit/backend/llsupport/ffisupport.py
@@ -1,41 +1,58 @@
 from pypy.rlib.rarithmetic import intmask
 from pypy.jit.metainterp import history
-from pypy.jit.backend.llsupport.descr import DynamicIntCallDescr, NonGcPtrCallDescr,\
-    FloatCallDescr, VoidCallDescr
+from pypy.rpython.lltypesystem import rffi
+from pypy.jit.backend.llsupport.descr import (
+    DynamicIntCallDescr, NonGcPtrCallDescr, FloatCallDescr, VoidCallDescr,
+    LongLongCallDescr, getCallDescrClass)
 
 class UnsupportedKind(Exception):
     pass
 
-def get_call_descr_dynamic(ffi_args, ffi_result, extrainfo=None):
+def get_call_descr_dynamic(cpu, ffi_args, ffi_result, extrainfo=None, ffi_flags=0):
     """Get a call descr: the types of result and args are represented by
     rlib.libffi.types.*"""
     try:
-        reskind = get_ffi_type_kind(ffi_result)
-        argkinds = [get_ffi_type_kind(arg) for arg in ffi_args]
+        reskind = get_ffi_type_kind(cpu, ffi_result)
+        argkinds = [get_ffi_type_kind(cpu, arg) for arg in ffi_args]
     except UnsupportedKind:
-        return None # ??
+        return None
     arg_classes = ''.join(argkinds)
     if reskind == history.INT:
         size = intmask(ffi_result.c_size)
         signed = is_ffi_type_signed(ffi_result)
-        return DynamicIntCallDescr(arg_classes, size, signed, extrainfo)
+        return DynamicIntCallDescr(arg_classes, size, signed, extrainfo,
+                                   ffi_flags=ffi_flags)
     elif reskind == history.REF:
-        return  NonGcPtrCallDescr(arg_classes, extrainfo)
+        return  NonGcPtrCallDescr(arg_classes, extrainfo,
+                                  ffi_flags=ffi_flags)
     elif reskind == history.FLOAT:
-        return FloatCallDescr(arg_classes, extrainfo)
+        return FloatCallDescr(arg_classes, extrainfo,
+                              ffi_flags=ffi_flags)
     elif reskind == history.VOID:
-        return VoidCallDescr(arg_classes, extrainfo)
+        return VoidCallDescr(arg_classes, extrainfo,
+                             ffi_flags=ffi_flags)
+    elif reskind == 'L':
+        return LongLongCallDescr(arg_classes, extrainfo,
+                                 ffi_flags=ffi_flags)
+    elif reskind == 'S':
+        SingleFloatCallDescr = getCallDescrClass(rffi.FLOAT)
+        return SingleFloatCallDescr(arg_classes, extrainfo,
+                                    ffi_flags=ffi_flags)
     assert False
 
-def get_ffi_type_kind(ffi_type):
+def get_ffi_type_kind(cpu, ffi_type):
     from pypy.rlib.libffi import types
     kind = types.getkind(ffi_type)
     if kind == 'i' or kind == 'u':
         return history.INT
-    elif kind == 'f':
+    elif cpu.supports_floats and kind == 'f':
         return history.FLOAT
     elif kind == 'v':
         return history.VOID
+    elif cpu.supports_longlong and (kind == 'I' or kind == 'U'):     # longlong
+        return 'L'
+    elif cpu.supports_singlefloats and kind == 's':    # singlefloat
+        return 'S'
     raise UnsupportedKind("Unsupported kind '%s'" % kind)
 
 def is_ffi_type_signed(ffi_type):
diff --git a/pypy/jit/backend/llsupport/gc.py b/pypy/jit/backend/llsupport/gc.py
--- a/pypy/jit/backend/llsupport/gc.py
+++ b/pypy/jit/backend/llsupport/gc.py
@@ -366,36 +366,92 @@
 
     def add_jit2gc_hooks(self, jit2gc):
         #
-        def collect_jit_stack_root(callback, gc, addr):
-            if addr.signed[0] != GcRootMap_shadowstack.MARKER:
-                # common case
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                return WORD
-            else:
-                # case of a MARKER followed by an assembler stack frame
-                follow_stack_frame_of_assembler(callback, gc, addr)
-                return 2 * WORD
+        # ---------------
+        # This is used to enumerate the shadowstack in the presence
+        # of the JIT.  It is also used by the stacklet support in
+        # rlib/_stacklet_shadowstack.  That's why it is written as
+        # an iterator that can also be used with a custom_trace.
         #
-        def follow_stack_frame_of_assembler(callback, gc, addr):
-            frame_addr = addr.signed[1]
-            addr = llmemory.cast_int_to_adr(frame_addr + self.force_index_ofs)
-            force_index = addr.signed[0]
-            if force_index < 0:
-                force_index = ~force_index
-            callshape = self._callshapes[force_index]
-            n = 0
-            while True:
-                offset = rffi.cast(lltype.Signed, callshape[n])
-                if offset == 0:
-                    break
-                addr = llmemory.cast_int_to_adr(frame_addr + offset)
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                n += 1
+        class RootIterator:
+            _alloc_flavor_ = "raw"
+
+            def next(iself, gc, next, range_highest):
+                # Return the "next" valid GC object' address.  This usually
+                # means just returning "next", until we reach "range_highest",
+                # except that we are skipping NULLs.  If "next" contains a
+                # MARKER instead, then we go into JIT-frame-lookup mode.
+                #
+                while True:
+                    #
+                    # If we are not iterating right now in a JIT frame
+                    if iself.frame_addr == 0:
+                        #
+                        # Look for the next shadowstack address that
+                        # contains a valid pointer
+                        while next != range_highest:
+                            if next.signed[0] == self.MARKER:
+                                break
+                            if gc.points_to_valid_gc_object(next):
+                                return next
+                            next += llmemory.sizeof(llmemory.Address)
+                        else:
+                            return llmemory.NULL     # done
+                        #
+                        # It's a JIT frame.  Save away 'next' for later, and
+                        # go into JIT-frame-exploring mode.
+                        next += llmemory.sizeof(llmemory.Address)
+                        frame_addr = next.signed[0]
+                        iself.saved_next = next
+                        iself.frame_addr = frame_addr
+                        addr = llmemory.cast_int_to_adr(frame_addr +
+                                                        self.force_index_ofs)
+                        addr = iself.translateptr(iself.context, addr)
+                        force_index = addr.signed[0]
+                        if force_index < 0:
+                            force_index = ~force_index
+                        # NB: the next line reads a still-alive _callshapes,
+                        # because we ensure that just before we called this
+                        # piece of assembler, we put on the (same) stack a
+                        # pointer to a loop_token that keeps the force_index
+                        # alive.
+                        callshape = self._callshapes[force_index]
+                    else:
+                        # Continuing to explore this JIT frame
+                        callshape = iself.callshape
+                    #
+                    # 'callshape' points to the next INT of the callshape.
+                    # If it's zero we are done with the JIT frame.
+                    while rffi.cast(lltype.Signed, callshape[0]) != 0:
+                        #
+                        # Non-zero: it's an offset inside the JIT frame.
+                        # Read it and increment 'callshape'.
+                        offset = rffi.cast(lltype.Signed, callshape[0])
+                        callshape = lltype.direct_ptradd(callshape, 1)
+                        addr = llmemory.cast_int_to_adr(iself.frame_addr +
+                                                        offset)
+                        addr = iself.translateptr(iself.context, addr)
+                        if gc.points_to_valid_gc_object(addr):
+                            #
+                            # The JIT frame contains a valid GC pointer at
+                            # this address (as opposed to NULL).  Save
+                            # 'callshape' for the next call, and return the
+                            # address.
+                            iself.callshape = callshape
+                            return addr
+                    #
+                    # Restore 'prev' and loop back to the start.
+                    iself.frame_addr = 0
+                    next = iself.saved_next
+                    next += llmemory.sizeof(llmemory.Address)
+
+        # ---------------
         #
+        root_iterator = RootIterator()
+        root_iterator.frame_addr = 0
+        root_iterator.context = llmemory.NULL
+        root_iterator.translateptr = lambda context, addr: addr
         jit2gc.update({
-            'rootstackhook': collect_jit_stack_root,
+            'root_iterator': root_iterator,
             })
 
     def initialize(self):
@@ -544,18 +600,19 @@
         assert self.GCClass.inline_simple_malloc
         assert self.GCClass.inline_simple_malloc_varsize
 
-        # make a malloc function, with three arguments
+        # make a malloc function, with two arguments
         def malloc_basic(size, tid):
             type_id = llop.extract_ushort(llgroup.HALFWORD, tid)
             has_finalizer = bool(tid & (1<<llgroup.HALFSHIFT))
             check_typeid(type_id)
-            try:
-                res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                                      type_id, size, True,
-                                                      has_finalizer, False)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                res = lltype.nullptr(llmemory.GCREF.TO)
+            res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
+                                                  type_id, size,
+                                                  has_finalizer, False)
+            # In case the operation above failed, we are returning NULL
+            # from this function to assembler.  There is also an RPython
+            # exception set, typically MemoryError; but it's easier and
+            # faster to check for the NULL return value, as done by
+            # translator/exceptiontransform.py.
             #llop.debug_print(lltype.Void, "\tmalloc_basic", size, type_id,
             #                 "-->", res)
             return res
@@ -571,14 +628,10 @@
         def malloc_array(itemsize, tid, num_elem):
             type_id = llop.extract_ushort(llgroup.HALFWORD, tid)
             check_typeid(type_id)
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    type_id, num_elem, self.array_basesize, itemsize,
-                    self.array_length_ofs, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                type_id, num_elem, self.array_basesize, itemsize,
+                self.array_length_ofs)
         self.malloc_array = malloc_array
         self.GC_MALLOC_ARRAY = lltype.Ptr(lltype.FuncType(
             [lltype.Signed] * 3, llmemory.GCREF))
@@ -591,23 +644,15 @@
         unicode_type_id = self.layoutbuilder.get_type_id(rstr.UNICODE)
         #
         def malloc_str(length):
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    str_type_id, length, str_basesize, str_itemsize,
-                    str_ofs_length, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                str_type_id, length, str_basesize, str_itemsize,
+                str_ofs_length)
         def malloc_unicode(length):
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    unicode_type_id, length, unicode_basesize,unicode_itemsize,
-                    unicode_ofs_length, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                unicode_type_id, length, unicode_basesize,unicode_itemsize,
+                unicode_ofs_length)
         self.malloc_str = malloc_str
         self.malloc_unicode = malloc_unicode
         self.GC_MALLOC_STR_UNICODE = lltype.Ptr(lltype.FuncType(
@@ -628,16 +673,12 @@
             if self.DEBUG:
                 random_usage_of_xmm_registers()
             assert size >= self.minimal_size_in_nursery
-            try:
-                # NB. although we call do_malloc_fixedsize_clear() here,
-                # it's a bit of a hack because we set tid to 0 and may
-                # also use it to allocate varsized objects.  The tid
-                # and possibly the length are both set afterward.
-                gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                            0, size, True, False, False)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return 0
+            # NB. although we call do_malloc_fixedsize_clear() here,
+            # it's a bit of a hack because we set tid to 0 and may
+            # also use it to allocate varsized objects.  The tid
+            # and possibly the length are both set afterward.
+            gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
+                                        0, size, False, False)
             return rffi.cast(lltype.Signed, gcref)
         self.malloc_slowpath = malloc_slowpath
         self.MALLOC_SLOWPATH = lltype.FuncType([lltype.Signed], lltype.Signed)
diff --git a/pypy/jit/backend/llsupport/llmodel.py b/pypy/jit/backend/llsupport/llmodel.py
--- a/pypy/jit/backend/llsupport/llmodel.py
+++ b/pypy/jit/backend/llsupport/llmodel.py
@@ -254,13 +254,13 @@
         return ofs, size, sign
     unpack_arraydescr_size._always_inline_ = True
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         return get_call_descr(self.gc_ll_descr, ARGS, RESULT, extrainfo)
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo, ffi_flags):
         from pypy.jit.backend.llsupport import ffisupport
-        return ffisupport.get_call_descr_dynamic(ffi_args, ffi_result,
-                                                 extrainfo)
+        return ffisupport.get_call_descr_dynamic(self, ffi_args, ffi_result,
+                                                 extrainfo, ffi_flags)
 
     def get_overflow_error(self):
         ovf_vtable = self.cast_adr_to_int(self._ovf_error_vtable)
@@ -499,7 +499,7 @@
     def bh_call_i(self, func, calldescr, args_i, args_r, args_f):
         assert isinstance(calldescr, BaseIntCallDescr)
         if not we_are_translated():
-            calldescr.verify_types(args_i, args_r, args_f, history.INT)
+            calldescr.verify_types(args_i, args_r, args_f, history.INT + 'S')
         return calldescr.call_stub(func, args_i, args_r, args_f)
 
     def bh_call_r(self, func, calldescr, args_i, args_r, args_f):
diff --git a/pypy/jit/backend/llsupport/regalloc.py b/pypy/jit/backend/llsupport/regalloc.py
--- a/pypy/jit/backend/llsupport/regalloc.py
+++ b/pypy/jit/backend/llsupport/regalloc.py
@@ -57,11 +57,13 @@
     all_regs              = []
     no_lower_byte_regs    = []
     save_around_call_regs = []
-    
+    frame_reg             = None
+
     def __init__(self, longevity, frame_manager=None, assembler=None):
         self.free_regs = self.all_regs[:]
         self.longevity = longevity
         self.reg_bindings = {}
+        self.bindings_to_frame_reg = {}
         self.position = -1
         self.frame_manager = frame_manager
         self.assembler = assembler
@@ -218,6 +220,10 @@
         self.reg_bindings[v] = loc
         return loc
 
+    def force_allocate_frame_reg(self, v):
+        """ Allocate the new variable v in the frame register."""
+        self.bindings_to_frame_reg[v] = None
+
     def force_spill_var(self, var):
         self._sync_var(var)
         try:
@@ -236,6 +242,8 @@
         try:
             return self.reg_bindings[box]
         except KeyError:
+            if box in self.bindings_to_frame_reg:
+                return self.frame_reg
             return self.frame_manager.loc(box)
 
     def return_constant(self, v, forbidden_vars=[], selected_reg=None):
@@ -264,8 +272,9 @@
         self._check_type(v)
         if isinstance(v, Const):
             return self.return_constant(v, forbidden_vars, selected_reg)
-        
         prev_loc = self.loc(v)
+        if prev_loc is self.frame_reg and selected_reg is None:
+            return prev_loc
         loc = self.force_allocate_reg(v, forbidden_vars, selected_reg,
                                       need_lower_byte=need_lower_byte)
         if prev_loc is not loc:
diff --git a/pypy/jit/backend/llsupport/test/test_descr.py b/pypy/jit/backend/llsupport/test/test_descr.py
--- a/pypy/jit/backend/llsupport/test/test_descr.py
+++ b/pypy/jit/backend/llsupport/test/test_descr.py
@@ -52,7 +52,8 @@
     S = lltype.GcStruct('S', ('x', lltype.Char),
                              ('y', lltype.Ptr(T)),
                              ('z', lltype.Ptr(U)),
-                             ('f', lltype.Float))
+                             ('f', lltype.Float),
+                             ('s', lltype.SingleFloat))
     assert getFieldDescrClass(lltype.Ptr(T)) is GcPtrFieldDescr
     assert getFieldDescrClass(lltype.Ptr(U)) is NonGcPtrFieldDescr
     cls = getFieldDescrClass(lltype.Char)
@@ -61,6 +62,10 @@
     clsf = getFieldDescrClass(lltype.Float)
     assert clsf != cls
     assert clsf == getFieldDescrClass(lltype.Float)
+    clss = getFieldDescrClass(lltype.SingleFloat)
+    assert clss not in (cls, clsf)
+    assert clss == getFieldDescrClass(lltype.SingleFloat)
+    assert clss == getFieldDescrClass(rffi.UINT)    # for now
     #
     c0 = GcCache(False)
     c1 = GcCache(True)
@@ -72,14 +77,17 @@
         descr_y = get_field_descr(c2, S, 'y')
         descr_z = get_field_descr(c2, S, 'z')
         descr_f = get_field_descr(c2, S, 'f')
+        descr_s = get_field_descr(c2, S, 's')
         assert descr_x.__class__ is cls
         assert descr_y.__class__ is GcPtrFieldDescr
         assert descr_z.__class__ is NonGcPtrFieldDescr
         assert descr_f.__class__ is clsf
+        assert descr_s.__class__ is clss
         assert descr_x.name == 'S.x'
         assert descr_y.name == 'S.y'
         assert descr_z.name == 'S.z'
         assert descr_f.name == 'S.f'
+        assert descr_s.name == 'S.s'
         if not tsc:
             assert descr_x.offset < descr_y.offset < descr_z.offset
             assert descr_x.sort_key() < descr_y.sort_key() < descr_z.sort_key()
@@ -87,23 +95,29 @@
             assert descr_y.get_field_size(False) == rffi.sizeof(lltype.Ptr(T))
             assert descr_z.get_field_size(False) == rffi.sizeof(lltype.Ptr(U))
             assert descr_f.get_field_size(False) == rffi.sizeof(lltype.Float)
+            assert descr_s.get_field_size(False) == rffi.sizeof(
+                                                            lltype.SingleFloat)
         else:
             assert isinstance(descr_x.offset, Symbolic)
             assert isinstance(descr_y.offset, Symbolic)
             assert isinstance(descr_z.offset, Symbolic)
             assert isinstance(descr_f.offset, Symbolic)
+            assert isinstance(descr_s.offset, Symbolic)
             assert isinstance(descr_x.get_field_size(True), Symbolic)
             assert isinstance(descr_y.get_field_size(True), Symbolic)
             assert isinstance(descr_z.get_field_size(True), Symbolic)
             assert isinstance(descr_f.get_field_size(True), Symbolic)
+            assert isinstance(descr_s.get_field_size(True), Symbolic)
         assert not descr_x.is_pointer_field()
         assert     descr_y.is_pointer_field()
         assert not descr_z.is_pointer_field()
         assert not descr_f.is_pointer_field()
+        assert not descr_s.is_pointer_field()
         assert not descr_x.is_float_field()
         assert not descr_y.is_float_field()
         assert not descr_z.is_float_field()
         assert     descr_f.is_float_field()
+        assert not descr_s.is_float_field()
 
 
 def test_get_field_descr_sign():
@@ -135,6 +149,7 @@
     A2 = lltype.GcArray(lltype.Ptr(T))
     A3 = lltype.GcArray(lltype.Ptr(U))
     A4 = lltype.GcArray(lltype.Float)
+    A5 = lltype.GcArray(lltype.SingleFloat)
     assert getArrayDescrClass(A2) is GcPtrArrayDescr
     assert getArrayDescrClass(A3) is NonGcPtrArrayDescr
     cls = getArrayDescrClass(A1)
@@ -143,25 +158,32 @@
     clsf = getArrayDescrClass(A4)
     assert clsf != cls
     assert clsf == getArrayDescrClass(lltype.GcArray(lltype.Float))
+    clss = getArrayDescrClass(A5)
+    assert clss not in (clsf, cls)
+    assert clss == getArrayDescrClass(lltype.GcArray(rffi.UINT))
     #
     c0 = GcCache(False)
     descr1 = get_array_descr(c0, A1)
     descr2 = get_array_descr(c0, A2)
     descr3 = get_array_descr(c0, A3)
     descr4 = get_array_descr(c0, A4)
+    descr5 = get_array_descr(c0, A5)
     assert descr1.__class__ is cls
     assert descr2.__class__ is GcPtrArrayDescr
     assert descr3.__class__ is NonGcPtrArrayDescr
     assert descr4.__class__ is clsf
+    assert descr5.__class__ is clss
     assert descr1 == get_array_descr(c0, lltype.GcArray(lltype.Char))
     assert not descr1.is_array_of_pointers()
     assert     descr2.is_array_of_pointers()
     assert not descr3.is_array_of_pointers()
     assert not descr4.is_array_of_pointers()
+    assert not descr5.is_array_of_pointers()
     assert not descr1.is_array_of_floats()
     assert not descr2.is_array_of_floats()
     assert not descr3.is_array_of_floats()
     assert     descr4.is_array_of_floats()
+    assert not descr5.is_array_of_floats()
     #
     def get_alignment(code):
         # Retrieve default alignment for the compiler/platform
@@ -170,27 +192,33 @@
     assert descr2.get_base_size(False) == get_alignment('p')
     assert descr3.get_base_size(False) == get_alignment('p')
     assert descr4.get_base_size(False) == get_alignment('d')
+    assert descr5.get_base_size(False) == get_alignment('f')
     assert descr1.get_ofs_length(False) == 0
     assert descr2.get_ofs_length(False) == 0
     assert descr3.get_ofs_length(False) == 0
     assert descr4.get_ofs_length(False) == 0
+    assert descr5.get_ofs_length(False) == 0
     assert descr1.get_item_size(False) == rffi.sizeof(lltype.Char)
     assert descr2.get_item_size(False) == rffi.sizeof(lltype.Ptr(T))
     assert descr3.get_item_size(False) == rffi.sizeof(lltype.Ptr(U))
     assert descr4.get_item_size(False) == rffi.sizeof(lltype.Float)
+    assert descr5.get_item_size(False) == rffi.sizeof(lltype.SingleFloat)
     #
     assert isinstance(descr1.get_base_size(True), Symbolic)
     assert isinstance(descr2.get_base_size(True), Symbolic)
     assert isinstance(descr3.get_base_size(True), Symbolic)
     assert isinstance(descr4.get_base_size(True), Symbolic)
+    assert isinstance(descr5.get_base_size(True), Symbolic)
     assert isinstance(descr1.get_ofs_length(True), Symbolic)
     assert isinstance(descr2.get_ofs_length(True), Symbolic)
     assert isinstance(descr3.get_ofs_length(True), Symbolic)
     assert isinstance(descr4.get_ofs_length(True), Symbolic)
+    assert isinstance(descr5.get_ofs_length(True), Symbolic)
     assert isinstance(descr1.get_item_size(True), Symbolic)
     assert isinstance(descr2.get_item_size(True), Symbolic)
     assert isinstance(descr3.get_item_size(True), Symbolic)
     assert isinstance(descr4.get_item_size(True), Symbolic)
+    assert isinstance(descr5.get_item_size(True), Symbolic)
     CA = rffi.CArray(lltype.Signed)
     descr = get_array_descr(c0, CA)
     assert not descr.is_array_of_floats()
@@ -210,6 +238,11 @@
     assert descr.is_array_of_floats()
     assert descr.get_base_size(False) == 0
     assert descr.get_ofs_length(False) == -1
+    CA = rffi.CArray(rffi.FLOAT)
+    descr = get_array_descr(c0, CA)
+    assert not descr.is_array_of_floats()
+    assert descr.get_base_size(False) == 0
+    assert descr.get_ofs_length(False) == -1
 
 
 def test_get_array_descr_sign():
@@ -257,6 +290,11 @@
     assert descr4.get_result_size(False) == rffi.sizeof(lltype.Float)
     assert descr4.get_return_type() == history.FLOAT
     assert descr4.arg_classes == "ff"
+    #
+    descr5 = get_call_descr(c0, [lltype.SingleFloat], lltype.SingleFloat)
+    assert descr5.get_result_size(False) == rffi.sizeof(lltype.SingleFloat)
+    assert descr5.get_return_type() == "S"
+    assert descr5.arg_classes == "S"
 
 def test_get_call_descr_not_translated_longlong():
     if sys.maxint > 2147483647:
@@ -286,6 +324,11 @@
     assert isinstance(descr4.get_result_size(True), Symbolic)
     assert descr4.get_return_type() == history.FLOAT
     assert descr4.arg_classes == "ff"
+    #
+    descr5 = get_call_descr(c1, [lltype.SingleFloat], lltype.SingleFloat)
+    assert isinstance(descr5.get_result_size(True), Symbolic)
+    assert descr5.get_return_type() == "S"
+    assert descr5.arg_classes == "S"
 
 def test_call_descr_extra_info():
     c1 = GcCache(True)
@@ -345,8 +388,11 @@
     #
     descr4f = get_call_descr(c0, [lltype.Char, lltype.Ptr(S)], lltype.Float)
     assert 'FloatCallDescr' in descr4f.repr_of_descr()
+    #
+    descr5f = get_call_descr(c0, [lltype.Char], lltype.SingleFloat)
+    assert 'SingleFloatCallDescr' in descr5f.repr_of_descr()
 
-def test_call_stubs():
+def test_call_stubs_1():
     c0 = GcCache(False)
     ARGS = [lltype.Char, lltype.Signed]
     RES = lltype.Char
@@ -360,6 +406,8 @@
     res = call_stub(rffi.cast(lltype.Signed, fnptr), [1, 2], None, None)
     assert res == ord('c')
 
+def test_call_stubs_2():
+    c0 = GcCache(False)
     ARRAY = lltype.GcArray(lltype.Signed)
     ARGS = [lltype.Float, lltype.Ptr(ARRAY)]
     RES = lltype.Float
@@ -375,3 +423,27 @@
     res = descr2.call_stub(rffi.cast(lltype.Signed, fnptr),
                            [], [opaquea], [longlong.getfloatstorage(3.5)])
     assert longlong.getrealfloat(res) == 4.5
+
+def test_call_stubs_single_float():
+    from pypy.rlib.longlong2float import uint2singlefloat, singlefloat2uint
+    from pypy.rlib.rarithmetic import r_singlefloat, intmask
+    #
+    c0 = GcCache(False)
+    ARGS = [lltype.SingleFloat, lltype.SingleFloat, lltype.SingleFloat]
+    RES = lltype.SingleFloat
+
+    def f(a, b, c):
+        a = float(a)
+        b = float(b)
+        c = float(c)
+        x = a - (b / c)
+        return r_singlefloat(x)
+
+    fnptr = llhelper(lltype.Ptr(lltype.FuncType(ARGS, RES)), f)
+    descr2 = get_call_descr(c0, ARGS, RES)
+    a = intmask(singlefloat2uint(r_singlefloat(-10.0)))
+    b = intmask(singlefloat2uint(r_singlefloat(3.0)))
+    c = intmask(singlefloat2uint(r_singlefloat(2.0)))
+    res = descr2.call_stub(rffi.cast(lltype.Signed, fnptr),
+                           [a, b, c], [], [])
+    assert float(uint2singlefloat(rffi.r_uint(res))) == -11.5
diff --git a/pypy/jit/backend/llsupport/test/test_ffisupport.py b/pypy/jit/backend/llsupport/test/test_ffisupport.py
--- a/pypy/jit/backend/llsupport/test/test_ffisupport.py
+++ b/pypy/jit/backend/llsupport/test/test_ffisupport.py
@@ -1,24 +1,56 @@
 from pypy.rlib.libffi import types
-from pypy.jit.backend.llsupport.ffisupport import get_call_descr_dynamic, \
-    VoidCallDescr, DynamicIntCallDescr
-    
+from pypy.jit.codewriter.longlong import is_64_bit
+from pypy.jit.backend.llsupport.ffisupport import *
+
+
+class FakeCPU:
+    def __init__(self, supports_floats=False, supports_longlong=False,
+                 supports_singlefloats=False):
+        self.supports_floats = supports_floats
+        self.supports_longlong = supports_longlong
+        self.supports_singlefloats = supports_singlefloats
+
+
 def test_call_descr_dynamic():
+    args = [types.sint, types.pointer]
+    descr = get_call_descr_dynamic(FakeCPU(), args, types.sint, ffi_flags=42)
+    assert isinstance(descr, DynamicIntCallDescr)
+    assert descr.arg_classes == 'ii'
+    assert descr.get_ffi_flags() == 42
 
     args = [types.sint, types.double, types.pointer]
-    descr = get_call_descr_dynamic(args, types.void)
+    descr = get_call_descr_dynamic(FakeCPU(), args, types.void)
+    assert descr is None    # missing floats
+    descr = get_call_descr_dynamic(FakeCPU(supports_floats=True),
+                                   args, types.void, ffi_flags=43)
     assert isinstance(descr, VoidCallDescr)
     assert descr.arg_classes == 'ifi'
+    assert descr.get_ffi_flags() == 43
 
-    descr = get_call_descr_dynamic([], types.sint8)
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.sint8)
     assert isinstance(descr, DynamicIntCallDescr)
     assert descr.get_result_size(False) == 1
     assert descr.is_result_signed() == True
 
-    descr = get_call_descr_dynamic([], types.uint8)
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.uint8)
     assert isinstance(descr, DynamicIntCallDescr)
     assert descr.get_result_size(False) == 1
     assert descr.is_result_signed() == False
 
-    descr = get_call_descr_dynamic([], types.float)
-    assert descr is None # single floats are not supported so far
-    
+    if not is_64_bit:
+        descr = get_call_descr_dynamic(FakeCPU(), [], types.slonglong)
+        assert descr is None   # missing longlongs
+        descr = get_call_descr_dynamic(FakeCPU(supports_longlong=True),
+                                       [], types.slonglong, ffi_flags=43)
+        assert isinstance(descr, LongLongCallDescr)
+        assert descr.get_ffi_flags() == 43
+    else:
+        assert types.slonglong is types.slong
+
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.float)
+    assert descr is None   # missing singlefloats
+    descr = get_call_descr_dynamic(FakeCPU(supports_singlefloats=True),
+                                   [], types.float, ffi_flags=44)
+    SingleFloatCallDescr = getCallDescrClass(rffi.FLOAT)
+    assert isinstance(descr, SingleFloatCallDescr)
+    assert descr.get_ffi_flags() == 44
diff --git a/pypy/jit/backend/llsupport/test/test_gc.py b/pypy/jit/backend/llsupport/test/test_gc.py
--- a/pypy/jit/backend/llsupport/test/test_gc.py
+++ b/pypy/jit/backend/llsupport/test/test_gc.py
@@ -246,9 +246,8 @@
     def __init__(self):
         self.record = []
 
-    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size, can_collect,
+    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size,
                                   has_finalizer, contains_weakptr):
-        assert can_collect
         assert not contains_weakptr
         p = llmemory.raw_malloc(size)
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
@@ -258,8 +257,7 @@
         return p
 
     def do_malloc_varsize_clear(self, RESTYPE, type_id, length, size,
-                                itemsize, offset_to_length, can_collect):
-        assert can_collect
+                                itemsize, offset_to_length):
         p = llmemory.raw_malloc(size + itemsize * length)
         (p + offset_to_length).signed[0] = length
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
diff --git a/pypy/jit/backend/model.py b/pypy/jit/backend/model.py
--- a/pypy/jit/backend/model.py
+++ b/pypy/jit/backend/model.py
@@ -8,12 +8,13 @@
     # ^^^ This is only useful on 32-bit platforms.  If True,
     # longlongs are supported by the JIT, but stored as doubles.
     # Boxes and Consts are BoxFloats and ConstFloats.
+    supports_singlefloats = False
 
     done_with_this_frame_void_v = -1
     done_with_this_frame_int_v = -1
     done_with_this_frame_ref_v = -1
     done_with_this_frame_float_v = -1
-    exit_frame_with_exception_v = -1
+    propagate_exception_v = -1
     total_compiled_loops = 0
     total_compiled_bridges = 0
     total_freed_loops = 0
diff --git a/pypy/jit/backend/test/calling_convention_test.py b/pypy/jit/backend/test/calling_convention_test.py
--- a/pypy/jit/backend/test/calling_convention_test.py
+++ b/pypy/jit/backend/test/calling_convention_test.py
@@ -8,6 +8,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -96,7 +97,8 @@
             FUNC = self.FuncType(funcargs, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             ops = '[%s]\n' % arguments
@@ -148,7 +150,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -190,7 +193,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -268,7 +272,8 @@
                 else:
                     ARGS.append(lltype.Signed)
             FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+                EffectInfo.MOST_GENERAL)
             ops = '''
             [%s]
             f99 = call_assembler(%s, descr=called_looptoken)
@@ -290,3 +295,59 @@
                 assert abs(x - expected_result) < 0.0001
             finally:
                 del self.cpu.done_with_this_frame_float_v
+
+    def test_call_with_singlefloats(self):
+        cpu = self.cpu
+        if not cpu.supports_floats or not cpu.supports_singlefloats:
+            py.test.skip('requires floats and singlefloats')
+
+        import random
+        from pypy.rlib.libffi import types
+        from pypy.rlib.rarithmetic import r_singlefloat
+
+        def func(*args):
+            res = 0.0
+            for i, x in enumerate(args):
+                res += (i + 1.1) * float(x)
+            return res
+
+        F = lltype.Float
+        S = lltype.SingleFloat
+        I = lltype.Signed
+        floats = [random.random() - 0.5 for i in range(8)]
+        singlefloats = [r_singlefloat(random.random() - 0.5) for i in range(8)]
+        ints = [random.randrange(-99, 99) for i in range(8)]
+        for repeat in range(100):
+            args = []
+            argvalues = []
+            argslist = []
+            local_floats = list(floats)
+            local_singlefloats = list(singlefloats)
+            local_ints = list(ints)
+            for i in range(8):
+                case = random.randrange(0, 3)
+                if case == 0:
+                    args.append(F)
+                    arg = local_floats.pop()
+                    argslist.append(boxfloat(arg))
+                elif case == 1:
+                    args.append(S)
+                    arg = local_singlefloats.pop()
+                    argslist.append(BoxInt(longlong.singlefloat2int(arg)))
+                else:
+                    args.append(I)
+                    arg = local_ints.pop()
+                    argslist.append(BoxInt(arg))
+                argvalues.append(arg)
+            FUNC = self.FuncType(args, F)
+            FPTR = self.Ptr(FUNC)
+            func_ptr = llhelper(FPTR, func)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+
+            res = self.execute_operation(rop.CALL,
+                                         [funcbox] + argslist,
+                                         'float', descr=calldescr)
+            expected = func(*argvalues)
+            assert abs(res.getfloat() - expected) < 0.0001
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -9,6 +9,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -445,7 +446,8 @@
             return chr(ord(c) + 1)
         FPTR = self.Ptr(self.FuncType([lltype.Char], lltype.Char))
         func_ptr = llhelper(FPTR, func)
-        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char)
+        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char,
+                                    EffectInfo.MOST_GENERAL)
         x = cpu.bh_call_i(self.get_funcbox(cpu, func_ptr).value,
                           calldescr, [ord('A')], None, None)
         assert x == ord('B')
@@ -458,14 +460,15 @@
                                           lltype.Float))
             func_ptr = llhelper(FPTR, func)
             FTP = deref(FPTR)
-            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT)
+            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             x = cpu.bh_call_f(self.get_funcbox(cpu, func_ptr).value,
                               calldescr,
                               [42], None, [longlong.getfloatstorage(3.5)])
             assert longlong.getrealfloat(x) == 3.5 - 42
 
     def test_call(self):
-        from pypy.rlib.libffi import types
+        from pypy.rlib.libffi import types, FUNCFLAG_CDECL
 
         def func_int(a, b):
             return a + b
@@ -486,13 +489,16 @@
             FUNC = deref(FPTR)
             funcbox = self.get_funcbox(cpu, func_ptr)
             # first, try it with the "normal" calldescr
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=calldescr)
             assert res.value == 2 * num
             # then, try it with the dynamic calldescr
-            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type)
+            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type,
+                                                    EffectInfo.MOST_GENERAL,
+                                                    ffi_flags=FUNCFLAG_CDECL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=dyn_calldescr)
@@ -507,7 +513,8 @@
             FUNC = self.FuncType([F] * 7 + [I] * 2 + [F] * 3, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = ([boxfloat(.1) for i in range(7)] +
                     [BoxInt(1), BoxInt(2), boxfloat(.2), boxfloat(.3),
@@ -529,7 +536,8 @@
 
         FUNC = self.FuncType([lltype.Signed]*16, lltype.Signed)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         args = range(16)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
@@ -552,7 +560,8 @@
             FPTR = self.Ptr(self.FuncType([TP] * nb_args, TP))
             func_ptr = llhelper(FPTR, func_ints)
             FUNC = deref(FPTR)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = [280-24*i for i in range(nb_args)]
             res = self.execute_operation(rop.CALL,
@@ -566,7 +575,8 @@
 
         FUNC = self.FuncType([lltype.Float, lltype.Float], lltype.Float)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
         res = self.execute_operation(rop.CALL, [funcbox, constfloat(1.5),
@@ -1589,7 +1599,8 @@
         '''
         FPTR = lltype.Ptr(lltype.FuncType([lltype.Signed], lltype.Void))
         fptr = llhelper(FPTR, func)
-        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                         EffectInfo.MOST_GENERAL)
 
         xtp = lltype.malloc(rclass.OBJECT_VTABLE, immortal=True)
         xtp.subclassrange_min = 1
@@ -1807,7 +1818,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Void)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1850,7 +1862,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Signed)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1895,7 +1908,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Float)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1931,7 +1945,7 @@
         assert values == [1, 10]
 
     def test_call_to_c_function(self):
-        from pypy.rlib.libffi import CDLL, types, ArgChain
+        from pypy.rlib.libffi import CDLL, types, ArgChain, FUNCFLAG_CDECL
         from pypy.rpython.lltypesystem.ll2ctypes import libc_name
         libc = CDLL(libc_name)
         c_tolower = libc.getpointer('tolower', [types.uchar], types.sint)
@@ -1941,7 +1955,9 @@
         cpu = self.cpu
         func_adr = llmemory.cast_ptr_to_adr(c_tolower.funcsym)
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
-        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint)
+        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=FUNCFLAG_CDECL)
         i1 = BoxInt()
         i2 = BoxInt()
         tok = BoxInt()
@@ -1997,7 +2013,9 @@
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
         calldescr = cpu.calldescrof_dynamic([types.pointer, types_size_t,
                                              types_size_t, types.pointer],
-                                            types.void)
+                                            types.void,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=clibffi.FUNCFLAG_CDECL)
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
@@ -2023,6 +2041,62 @@
         assert len(glob.lst) > 0
         lltype.free(raw, flavor='raw')
 
+    def test_call_to_winapi_function(self):
+        from pypy.rlib.clibffi import _WIN32, FUNCFLAG_STDCALL
+        if not _WIN32:
+            py.test.skip("Windows test only")
+        from pypy.rlib.libffi import CDLL, types, ArgChain
+        from pypy.rlib.rwin32 import DWORD
+        libc = CDLL('KERNEL32')
+        c_GetCurrentDir = libc.getpointer('GetCurrentDirectoryA',
+                                          [types.ulong, types.pointer],
+                                          types.ulong)
+
+        cwd = os.getcwd()
+        buflen = len(cwd) + 10
+        buffer = lltype.malloc(rffi.CCHARP.TO, buflen, flavor='raw')
+        argchain = ArgChain().arg(rffi.cast(DWORD, buflen)).arg(buffer)
+        res = c_GetCurrentDir.call(argchain, DWORD)
+        assert rffi.cast(lltype.Signed, res) == len(cwd)
+        assert rffi.charp2strn(buffer, buflen) == cwd
+        lltype.free(buffer, flavor='raw')
+
+        cpu = self.cpu
+        func_adr = llmemory.cast_ptr_to_adr(c_GetCurrentDir.funcsym)
+        funcbox = ConstInt(heaptracker.adr2int(func_adr))
+        calldescr = cpu.calldescrof_dynamic([types.ulong, types.pointer],
+                                            types.ulong,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=FUNCFLAG_STDCALL)
+        i1 = BoxInt()
+        i2 = BoxInt()
+        faildescr = BasicFailDescr(1)
+        # if the stdcall convention is ignored, then ESP is wrong after the
+        # call: 8 bytes too much.  If we repeat the call often enough, crash.
+        ops = []
+        for i in range(50):
+            i3 = BoxInt()
+            ops += [
+                ResOperation(rop.CALL_RELEASE_GIL, [funcbox, i1, i2], i3,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ]
+            ops[-1].setfailargs([])
+        ops += [
+            ResOperation(rop.FINISH, [i3], None, descr=BasicFailDescr(0))
+        ]
+        looptoken = LoopToken()
+        self.cpu.compile_loop([i1, i2], ops, looptoken)
+
+        buffer = lltype.malloc(rffi.CCHARP.TO, buflen, flavor='raw')
+        self.cpu.set_future_value_int(0, buflen)
+        self.cpu.set_future_value_int(1, rffi.cast(lltype.Signed, buffer))
+        fail = self.cpu.execute_token(looptoken)
+        assert fail.identifier == 0
+        assert self.cpu.get_latest_value_int(0) == len(cwd)
+        assert rffi.charp2strn(buffer, buflen) == cwd
+        lltype.free(buffer, flavor='raw')
+
     def test_guard_not_invalidated(self):
         cpu = self.cpu
         i0 = BoxInt()
@@ -2292,7 +2366,8 @@
         ARGS = [lltype.Signed] * 10
         RES = lltype.Signed
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         for i in range(10):
             self.cpu.set_future_value_int(i, i+1)
         res = self.cpu.execute_token(looptoken)
@@ -2332,7 +2407,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2422,7 +2498,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2634,7 +2711,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
                                    calldescr, [value], None, None)
             assert x == expected, (
@@ -2667,7 +2745,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(self.cpu, f)
             res = self.execute_operation(rop.CALL, [funcbox, BoxInt(value)],
                                          'int', descr=calldescr)
@@ -2701,7 +2780,8 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         x = self.cpu.bh_call_f(self.get_funcbox(self.cpu, f).value,
                                calldescr, None, None, [value])
         assert x == expected
@@ -2728,12 +2808,74 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         funcbox = self.get_funcbox(self.cpu, f)
         res = self.execute_operation(rop.CALL, [funcbox, BoxFloat(value)],
                                      'float', descr=calldescr)
         assert res.getfloatstorage() == expected
 
+    def test_singlefloat_result_of_call_direct(self):
+        if not self.cpu.supports_singlefloats:
+            py.test.skip("singlefloat test")
+        from pypy.translator.tool.cbuild import ExternalCompilationInfo
+        from pypy.rlib.rarithmetic import r_singlefloat
+        eci = ExternalCompilationInfo(
+            separate_module_sources=["""
+            float fn_test_result_of_call(float x)
+            {
+                return x / 2.0f;
+            }
+            """],
+            export_symbols=['fn_test_result_of_call'])
+        f = rffi.llexternal('fn_test_result_of_call', [lltype.SingleFloat],
+                            lltype.SingleFloat,
+                            compilation_info=eci, _nowrapper=True)
+        value = r_singlefloat(-42.5)
+        expected = r_singlefloat(-21.25)
+        assert f(value) == expected
+        #
+        FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
+        FPTR = self.Ptr(FUNC)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
+        ivalue = longlong.singlefloat2int(value)
+        iexpected = longlong.singlefloat2int(expected)
+        x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
+                               calldescr, [ivalue], None, None)
+        assert x == iexpected
+
+    def test_singlefloat_result_of_call_compiled(self):
+        if not self.cpu.supports_singlefloats:
+            py.test.skip("test of singlefloat result")
+        from pypy.translator.tool.cbuild import ExternalCompilationInfo
+        from pypy.rlib.rarithmetic import r_singlefloat
+        eci = ExternalCompilationInfo(
+            separate_module_sources=["""
+            float fn_test_result_of_call(float x)
+            {
+                return x / 2.0f;
+            }
+            """],
+            export_symbols=['fn_test_result_of_call'])
+        f = rffi.llexternal('fn_test_result_of_call', [lltype.SingleFloat],
+                            lltype.SingleFloat,
+                            compilation_info=eci, _nowrapper=True)
+        value = r_singlefloat(-42.5)
+        expected = r_singlefloat(-21.25)
+        assert f(value) == expected
+        #
+        FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
+        FPTR = self.Ptr(FUNC)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
+        funcbox = self.get_funcbox(self.cpu, f)
+        ivalue = longlong.singlefloat2int(value)
+        iexpected = longlong.singlefloat2int(expected)
+        res = self.execute_operation(rop.CALL, [funcbox, BoxInt(ivalue)],
+                                     'int', descr=calldescr)
+        assert res.value == iexpected
+
     def test_free_loop_and_bridges(self):
         from pypy.jit.backend.llsupport.llmodel import AbstractLLCPU
         if not isinstance(self.cpu, AbstractLLCPU):
@@ -2748,6 +2890,26 @@
         assert mem2 < mem1
         assert mem2 == mem0
 
+    def test_memoryerror(self):
+        excdescr = BasicFailDescr(666)
+        self.cpu.propagate_exception_v = self.cpu.get_fail_descr_number(
+            excdescr)
+        self.cpu.setup_once()    # xxx redo it, because we added
+                                 # propagate_exception_v
+        i0 = BoxInt()
+        p0 = BoxPtr()
+        operations = [
+            ResOperation(rop.NEWUNICODE, [i0], p0),
+            ResOperation(rop.FINISH, [p0], None, descr=BasicFailDescr(1))
+            ]
+        inputargs = [i0]
+        looptoken = LoopToken()
+        self.cpu.compile_loop(inputargs, operations, looptoken)
+        # overflowing value:
+        self.cpu.set_future_value_int(0, sys.maxint // 4 + 1)
+        fail = self.cpu.execute_token(looptoken)
+        assert fail.identifier == excdescr.identifier
+
 
 class OOtypeBackendTest(BaseBackendTest):
 
diff --git a/pypy/jit/backend/test/test_ll_random.py b/pypy/jit/backend/test/test_ll_random.py
--- a/pypy/jit/backend/test/test_ll_random.py
+++ b/pypy/jit/backend/test/test_ll_random.py
@@ -6,6 +6,7 @@
 from pypy.jit.metainterp.history import BoxPtr, BoxInt
 from pypy.jit.metainterp.history import BasicFailDescr
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rlib.rarithmetic import intmask
 from pypy.rpython.llinterp import LLException
@@ -468,6 +469,10 @@
         exec code in d
         return subset, d['f'], vtableptr
 
+    def getcalldescr(self, builder, TP):
+        ef = EffectInfo.MOST_GENERAL
+        return builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT, ef)
+
 # 1. non raising call and guard_no_exception
 class CallOperation(BaseCallOperation):
     def produce_into(self, builder, r):
@@ -481,7 +486,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], None,
                           descr=BasicFailDescr())
@@ -501,7 +506,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         _, vtableptr = builder.get_random_structure_type_and_vtable(r)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(vtableptr), builder.cpu)
@@ -523,7 +528,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(exc), builder.cpu)
         op = ResOperation(rop.GUARD_EXCEPTION, [exc_box], BoxPtr(),
@@ -540,7 +545,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], BoxPtr(),
                           descr=BasicFailDescr())
@@ -559,7 +564,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         while True:
             _, vtableptr = builder.get_random_structure_type_and_vtable(r)
diff --git a/pypy/jit/backend/x86/arch.py b/pypy/jit/backend/x86/arch.py
--- a/pypy/jit/backend/x86/arch.py
+++ b/pypy/jit/backend/x86/arch.py
@@ -27,3 +27,6 @@
 # which are used in the malloc itself.  They are:
 #   ecx, ebx, esi, edi               [32 and 64 bits]
 #   r8, r9, r10, r12, r13, r14, r15    [64 bits only]
+#
+# Note that with asmgcc, the locations corresponding to callee-save registers
+# are never used.
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -34,6 +34,7 @@
 from pypy.rlib.debug import (debug_print, debug_start, debug_stop,
                              have_debug_prints)
 from pypy.rlib import rgc
+from pypy.rlib.clibffi import FFI_DEFAULT_ABI
 from pypy.jit.backend.x86.jump import remap_frame_layout
 from pypy.jit.metainterp.history import ConstInt, BoxInt
 from pypy.jit.codewriter.effectinfo import EffectInfo
@@ -56,7 +57,9 @@
         self.exc = exc
         self.is_guard_not_invalidated = is_guard_not_invalidated
 
-DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed))
+DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
+                              ('bridge', lltype.Signed), # 0 or 1
+                              ('number', lltype.Signed))
 
 class Assembler386(object):
     _regalloc = None
@@ -89,6 +92,8 @@
         self._current_depths_cache = (0, 0)
         self.datablockwrapper = None
         self.stack_check_slowpath = 0
+        self.propagate_exception_path = 0
+        self.gcrootmap_retaddr_forced = 0
         self.teardown()
 
     def leave_jitted_hook(self):
@@ -125,6 +130,7 @@
             self._build_failure_recovery(True, withfloats=True)
             support.ensure_sse2_floats()
             self._build_float_constants()
+        self._build_propagate_exception_path()
         if gc_ll_descr.get_malloc_slowpath_addr is not None:
             self._build_malloc_slowpath()
         self._build_stack_check_slowpath()
@@ -138,6 +144,9 @@
         assert self.memcpy_addr != 0, "setup_once() not called?"
         self.current_clt = looptoken.compiled_loop_token
         self.pending_guard_tokens = []
+        if WORD == 8:
+            self.pending_memoryerror_trampoline_from = []
+            self.error_trampoline_64 = 0
         self.mc = codebuf.MachineCodeBlockWrapper()
         #assert self.datablockwrapper is None --- but obscure case
         # possible, e.g. getting MemoryError and continuing
@@ -147,6 +156,8 @@
 
     def teardown(self):
         self.pending_guard_tokens = None
+        if WORD == 8:
+            self.pending_memoryerror_trampoline_from = None
         self.mc = None
         self.looppos = -1
         self.currently_compiling_loop = None
@@ -155,9 +166,12 @@
     def finish_once(self):
         if self._debug:
             debug_start('jit-backend-counts')
-            for i in range(len(self.loop_run_counters)):
-                struct = self.loop_run_counters[i]
-                debug_print(str(i) + ':' + str(struct.i))
+            for struct in self.loop_run_counters:
+                if struct.bridge:
+                    prefix = 'bridge '
+                else:
+                    prefix = 'loop '
+                debug_print(prefix + str(struct.number) + ':' + str(struct.i))
             debug_stop('jit-backend-counts')
 
     def _build_float_constants(self):
@@ -181,6 +195,7 @@
         # instructions in assembler, with a mark_gc_roots in between.
         # With shadowstack, this is not needed, so we produce a single helper.
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
         #
         # ---------- first helper for the slow path of malloc ----------
         mc = codebuf.MachineCodeBlockWrapper()
@@ -190,10 +205,19 @@
         mc.SUB_rr(edx.value, eax.value)       # compute the size we want
         addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
         #
-        if gcrootmap is not None and gcrootmap.is_shadow_stack:
+        # The registers to save in the copy area: with shadowstack, most
+        # registers need to be saved.  With asmgcc, the callee-saved registers
+        # don't need to.
+        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
+        if not shadow_stack:
+            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
+                   if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
+        #
+        for reg, ofs in save_in_copy_area:
+            mc.MOV_br(ofs, reg.value)
+        #
+        if shadow_stack:
             # ---- shadowstack ----
-            for reg, ofs in gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items():
-                mc.MOV_br(ofs, reg.value)
             mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
             if IS_X86_32:
                 mc.MOV_sr(0, edx.value)          # push argument
@@ -201,15 +225,13 @@
                 mc.MOV_rr(edi.value, edx.value)
             mc.CALL(imm(addr))
             mc.ADD_ri(esp.value, 16 - WORD)
-            for reg, ofs in gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items():
-                mc.MOV_rb(reg.value, ofs)
         else:
             # ---- asmgcc ----
             if IS_X86_32:
                 mc.MOV_sr(WORD, edx.value)       # save it as the new argument
             elif IS_X86_64:
-                # rdi can be clobbered: its content was forced to the stack
-                # by _fastpath_malloc(), like all other save_around_call_regs.
+                # rdi can be clobbered: its content was saved in the
+                # copy area of the stack
                 mc.MOV_rr(edi.value, edx.value)
             mc.JMP(imm(addr))                    # tail call to the real malloc
             rawstart = mc.materialize(self.cpu.asmmemmgr, [])
@@ -217,18 +239,54 @@
             # ---------- second helper for the slow path of malloc ----------
             mc = codebuf.MachineCodeBlockWrapper()
         #
+        for reg, ofs in save_in_copy_area:
+            mc.MOV_rb(reg.value, ofs)
+            assert reg is not eax and reg is not edx
+        #
         if self.cpu.supports_floats:          # restore the XMM registers
             for i in range(self.cpu.NUM_REGS):# from where they were saved
                 mc.MOVSD_xs(i, (WORD*2)+8*i)
+        #
+        # Note: we check this after the code above, just because the code
+        # above is more than 127 bytes on 64-bits...
+        mc.TEST_rr(eax.value, eax.value)
+        mc.J_il8(rx86.Conditions['Z'], 0) # patched later
+        jz_location = mc.get_relative_pos()
+        #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
         mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
         mc.RET()
+        #
+        # If the slowpath malloc failed, we raise a MemoryError that
+        # always interrupts the current loop, as a "good enough"
+        # approximation.  Also note that we didn't RET from this helper;
+        # but the code we jump to will actually restore the stack
+        # position based on EBP, which will get us out of here for free.
+        offset = mc.get_relative_pos() - jz_location
+        assert 0 < offset <= 127
+        mc.overwrite(jz_location-1, chr(offset))
+        mc.JMP(imm(self.propagate_exception_path))
+        #
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
         self.malloc_slowpath2 = rawstart
 
+    def _build_propagate_exception_path(self):
+        if self.cpu.propagate_exception_v < 0:
+            return      # not supported (for tests, or non-translated)
+        #
+        self.mc = codebuf.MachineCodeBlockWrapper()
+        # call on_leave_jitted_save_exc()
+        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
+        self.mc.CALL(imm(addr))
+        self.mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
+        self._call_footer()
+        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
+        self.propagate_exception_path = rawstart
+        self.mc = None
+
     def _build_stack_check_slowpath(self):
         _, _, slowpathaddr = self.cpu.insert_stack_check()
-        if slowpathaddr == 0 or self.cpu.exit_frame_with_exception_v < 0:
+        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
             return      # no stack check (for tests, or non-translated)
         #
         # make a "function" that is called immediately at the start of
@@ -284,19 +342,11 @@
         offset = mc.get_relative_pos() - jnz_location
         assert 0 < offset <= 127
         mc.overwrite(jnz_location-1, chr(offset))
-        # clear the exception from the global position
-        mc.MOV(eax, heap(self.cpu.pos_exc_value()))
-        mc.MOV(heap(self.cpu.pos_exception()), imm0)
-        mc.MOV(heap(self.cpu.pos_exc_value()), imm0)
-        # save the current exception instance into fail_boxes_ptr[0]
-        adr = self.fail_boxes_ptr.get_addr_for_num(0)
-        mc.MOV(heap(adr), eax)
-        # call the helper function to set the GC flag on the fail_boxes_ptr
-        # array (note that there is no exception any more here)
-        addr = self.cpu.get_on_leave_jitted_int(save_exception=False)
+        # call on_leave_jitted_save_exc()
+        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
         mc.CALL(imm(addr))
         #
-        mc.MOV_ri(eax.value, self.cpu.exit_frame_with_exception_v)
+        mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
         #
         # footer -- note the ADD, which skips the return address of this
         # function, and will instead return to the caller's caller.  Note
@@ -309,6 +359,7 @@
         self.stack_check_slowpath = rawstart
 
     @staticmethod
+    @rgc.no_collect
     def _release_gil_asmgcc(css):
         # similar to trackgcroot.py:pypy_asm_stackwalk, first part
         from pypy.rpython.memory.gctransform import asmgcroot
@@ -324,6 +375,7 @@
             before()
 
     @staticmethod
+    @rgc.no_collect
     def _reacquire_gil_asmgcc(css):
         # first reacquire the GIL
         after = rffi.aroundstate.after
@@ -338,12 +390,14 @@
         next.prev = prev
 
     @staticmethod
+    @rgc.no_collect
     def _release_gil_shadowstack():
         before = rffi.aroundstate.before
         if before:
             before()
 
     @staticmethod
+    @rgc.no_collect
     def _reacquire_gil_shadowstack():
         after = rffi.aroundstate.after
         if after:
@@ -392,7 +446,7 @@
         self.setup(looptoken)
         self.currently_compiling_loop = looptoken
         if log:
-            self._register_counter()
+            self._register_counter(False, looptoken.number)
             operations = self._inject_debugging_code(looptoken, operations)
 
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
@@ -461,7 +515,7 @@
 
         self.setup(original_loop_token)
         if log:
-            self._register_counter()
+            self._register_counter(True, descr_number)
             operations = self._inject_debugging_code(faildescr, operations)
 
         arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
@@ -508,6 +562,8 @@
         # at the end of self.mc.
         for tok in self.pending_guard_tokens:
             tok.pos_recovery_stub = self.generate_quick_failure(tok)
+        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
+            self.error_trampoline_64 = self.generate_propagate_error_64()
 
     def patch_pending_failure_recoveries(self, rawstart):
         # after we wrote the assembler to raw memory, set up
@@ -544,6 +600,12 @@
                 # less, we would run into the issue that overwriting the
                 # 5 bytes here might get a few nonsense bytes at the
                 # return address of the following CALL.
+        if WORD == 8:
+            for pos_after_jz in self.pending_memoryerror_trampoline_from:
+                assert self.error_trampoline_64 != 0     # only if non-empty
+                mc = codebuf.MachineCodeBlockWrapper()
+                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
+                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)
 
     def get_asmmemmgr_blocks(self, looptoken):
         clt = looptoken.compiled_loop_token
@@ -558,7 +620,7 @@
         return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                    self.cpu.gc_ll_descr.gcrootmap)
 
-    def _register_counter(self):
+    def _register_counter(self, bridge, number):
         if self._debug:
             # YYY very minor leak -- we need the counters to stay alive
             # forever, just because we want to report them at the end
@@ -566,6 +628,8 @@
             struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                                    track_allocation=False)
             struct.i = 0
+            struct.bridge = int(bridge)
+            struct.number = number
             self.loop_run_counters.append(struct)
 
     def _find_failure_recovery_bytecode(self, faildescr):
@@ -893,6 +957,7 @@
         if (isinstance(from_loc, RegLoc) and from_loc.is_xmm) or (isinstance(to_loc, RegLoc) and to_loc.is_xmm):
             self.mc.MOVSD(to_loc, from_loc)
         else:
+            assert to_loc is not ebp
             self.mc.MOV(to_loc, from_loc)
 
     regalloc_mov = mov # legacy interface
@@ -1056,9 +1121,10 @@
                     self.implement_guard(guard_token, checkfalsecond)
         return genop_cmp_guard_float
 
-    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax):
+    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
+                   argtypes=None, callconv=FFI_DEFAULT_ABI):
         if IS_X86_64:
-            return self._emit_call_64(force_index, x, arglocs, start)
+            return self._emit_call_64(force_index, x, arglocs, start, argtypes)
 
         p = 0
         n = len(arglocs)
@@ -1085,13 +1151,24 @@
         # x is a location
         self.mc.CALL(x)
         self.mark_gc_roots(force_index)
+        #
+        if callconv != FFI_DEFAULT_ABI:
+            self._fix_stdcall(callconv, p)
 
-    def _emit_call_64(self, force_index, x, arglocs, start):
+    def _fix_stdcall(self, callconv, p):
+        from pypy.rlib.clibffi import FFI_STDCALL
+        assert callconv == FFI_STDCALL
+        # it's a bit stupid, but we're just going to cancel the fact that
+        # the called function just added 'p' to ESP, by subtracting it again.
+        self.mc.SUB_ri(esp.value, p)
+
+    def _emit_call_64(self, force_index, x, arglocs, start, argtypes):
         src_locs = []
         dst_locs = []
         xmm_src_locs = []
         xmm_dst_locs = []
         pass_on_stack = []
+        singlefloats = None
 
         # In reverse order for use with pop()
         unused_gpr = [r9, r8, ecx, edx, esi, edi]
@@ -1111,6 +1188,11 @@
                     xmm_dst_locs.append(unused_xmm.pop())
                 else:
                     pass_on_stack.append(loc)
+            elif (argtypes is not None and argtypes[i-start] == 'S' and
+                  len(unused_xmm) > 0):
+                # Singlefloat argument
+                if singlefloats is None: singlefloats = []
+                singlefloats.append((loc, unused_xmm.pop()))
             else:
                 if len(unused_gpr) > 0:
                     src_locs.append(loc)
@@ -1138,9 +1220,15 @@
                 else:
                     self.mc.MOV_sr(i*WORD, loc.value)
 
-        # Handle register arguments
+        # Handle register arguments: first remap the xmm arguments
+        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
+                           X86_64_XMM_SCRATCH_REG)
+        # Load the singlefloat arguments from main regs or stack to xmm regs
+        if singlefloats is not None:
+            for src, dst in singlefloats:
+                self.mc.MOVD(dst, src)
+        # Finally remap the arguments in the main regs
         remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs, X86_64_XMM_SCRATCH_REG)
 
         self._regalloc.reserve_param(len(pass_on_stack))
         self.mc.CALL(x)
@@ -1255,6 +1343,20 @@
     def genop_cast_int_to_float(self, op, arglocs, resloc):
         self.mc.CVTSI2SD(resloc, arglocs[0])
 
+    def genop_cast_float_to_singlefloat(self, op, arglocs, resloc):
+        loc0, loctmp = arglocs
+        self.mc.CVTSD2SS(loctmp, loc0)
+        assert isinstance(resloc, RegLoc)
+        assert isinstance(loctmp, RegLoc)
+        self.mc.MOVD_rx(resloc.value, loctmp.value)
+
+    def genop_cast_singlefloat_to_float(self, op, arglocs, resloc):
+        loc0, = arglocs
+        assert isinstance(resloc, RegLoc)
+        assert isinstance(loc0, RegLoc)
+        self.mc.MOVD_xr(resloc.value, loc0.value)
+        self.mc.CVTSS2SD_xx(resloc.value, resloc.value)
+
     def genop_guard_int_is_true(self, op, guard_op, guard_token, arglocs, resloc):
         guard_opnum = guard_op.getopnum()
         self.mc.CMP(arglocs[0], imm0)
@@ -1376,7 +1478,7 @@
         assert isinstance(loc_vtable, ImmedLoc)
         arglocs = arglocs[:-1]
         self.call(self.malloc_func_addr, arglocs, eax)
-        # xxx ignore NULL returns for now
+        self.propagate_memoryerror_if_eax_is_null()
         self.set_vtable(eax, loc_vtable)
 
     def set_vtable(self, loc, loc_vtable):
@@ -1395,18 +1497,35 @@
     def genop_new(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_new_array(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_array_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_newstr(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_str_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_newunicode(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_unicode_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
+
+    def propagate_memoryerror_if_eax_is_null(self):
+        # if self.propagate_exception_path == 0 (tests), this may jump to 0
+        # and segfaults.  too bad.  the alternative is to continue anyway
+        # with eax==0, but that will segfault too.
+        self.mc.TEST_rr(eax.value, eax.value)
+        if WORD == 4:
+            self.mc.J_il(rx86.Conditions['Z'], self.propagate_exception_path)
+            self.mc.add_pending_relocation()
+        elif WORD == 8:
+            self.mc.J_il(rx86.Conditions['Z'], 0)
+            pos = self.mc.get_relative_pos()
+            self.pending_memoryerror_trampoline_from.append(pos)
 
     # ----------
 
@@ -1678,6 +1797,12 @@
         return GuardToken(faildescr, failargs, fail_locs, exc,
                           is_guard_not_invalidated)
 
+    def generate_propagate_error_64(self):
+        assert WORD == 8
+        startpos = self.mc.get_relative_pos()
+        self.mc.JMP(imm(self.propagate_exception_path))
+        return startpos
+
     def generate_quick_failure(self, guardtok):
         """Generate the initial code for handling a failure.  We try to
         keep it as compact as possible.
@@ -2013,7 +2138,9 @@
         else:
             tmp = eax
 
-        self._emit_call(force_index, x, arglocs, 3, tmp=tmp)
+        self._emit_call(force_index, x, arglocs, 3, tmp=tmp,
+                        argtypes=op.getdescr().get_arg_types(),
+                        callconv=op.getdescr().get_call_conv())
 
         if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.width == 8:
             # a float or a long long return
@@ -2025,7 +2152,19 @@
                 #     and this way is simpler also because the result loc
                 #     can just be always a stack location
             else:
-                self.mc.FSTP_b(resloc.value)   # float return
+                self.mc.FSTPL_b(resloc.value)   # float return
+        elif op.getdescr().get_return_type() == 'S':
+            # singlefloat return
+            assert resloc is eax
+            if IS_X86_32:
+                # must convert ST(0) to a 32-bit singlefloat and load it into EAX
+                # mess mess mess
+                self.mc.SUB_ri(esp.value, 4)
+                self.mc.FSTPS_s(0)
+                self.mc.POP_r(eax.value)
+            elif IS_X86_64:
+                # must copy from the lower 32 bits of XMM0 into eax
+                self.mc.MOVD_rx(eax.value, xmm0.value)
         elif size == WORD:
             assert resloc is eax or resloc is xmm0    # a full word
         elif size == 0:
@@ -2097,13 +2236,27 @@
                 css = get_ebp_ofs(pos + use_words - 1)
                 self._regalloc.close_stack_struct = css
             # The location where the future CALL will put its return address
-            # will be [ESP-WORD], so save that as the next frame's top address
-            self.mc.LEA_rs(eax.value, -WORD)        # LEA EAX, [ESP-4]
+            # will be [ESP-WORD].  But we can't use that as the next frame's
+            # top address!  As the code after releasegil() runs without the
+            # GIL, it might not be set yet by the time we need it (very
+            # unlikely), or it might be overwritten by the following call
+            # to reaquiregil() (much more likely).  So we hack even more
+            # and use a dummy location containing a dummy value (a pointer
+            # to itself) which we pretend is the return address :-/ :-/ :-/
+            # It prevents us to store any %esp-based stack locations but we
+            # don't so far.
+            adr = self.datablockwrapper.malloc_aligned(WORD, WORD)
+            rffi.cast(rffi.CArrayPtr(lltype.Signed), adr)[0] = adr
+            self.gcrootmap_retaddr_forced = adr
             frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
-            self.mc.MOV_br(frame_ptr, eax.value)    # MOV [css.frame], EAX
+            if rx86.fits_in_32bits(adr):
+                self.mc.MOV_bi(frame_ptr, adr)          # MOV [css.frame], adr
+            else:
+                self.mc.MOV_ri(eax.value, adr)          # MOV EAX, adr
+                self.mc.MOV_br(frame_ptr, eax.value)    # MOV [css.frame], EAX
             # Save ebp
             index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
-            self.mc.MOV_br(index_of_ebp, ebp.value) # MOV [css.ebp], EBP
+            self.mc.MOV_br(index_of_ebp, ebp.value)     # MOV [css.ebp], EBP
             # Call the closestack() function (also releasing the GIL)
             if IS_X86_32:
                 reg = eax
@@ -2131,6 +2284,9 @@
         if gcrootmap.is_shadow_stack:
             args = []
         else:
+            assert self.gcrootmap_retaddr_forced == -1, (
+                      "missing mark_gc_roots() in CALL_RELEASE_GIL")
+            self.gcrootmap_retaddr_forced = 0
             css = self._regalloc.close_stack_struct
             assert css != 0
             if IS_X86_32:
@@ -2183,7 +2339,7 @@
         self._emit_call(fail_index, imm(asm_helper_adr), [eax, arglocs[1]], 0,
                         tmp=ecx)
         if IS_X86_32 and isinstance(result_loc, StackLoc) and result_loc.type == FLOAT:
-            self.mc.FSTP_b(result_loc.value)
+            self.mc.FSTPL_b(result_loc.value)
         #else: result_loc is already either eax or None, checked below
         self.mc.JMP_l8(0) # jump to done, patched later
         jmp_location = self.mc.get_relative_pos()
@@ -2355,11 +2511,6 @@
 
     genop_discard_cond_call_gc_wb_array = genop_discard_cond_call_gc_wb
 
-    def genop_force_token(self, op, arglocs, resloc):
-        # RegAlloc.consider_force_token ensures this:
-        assert isinstance(resloc, RegLoc)
-        self.mc.LEA_rb(resloc.value, FORCE_INDEX_OFS)
-
     def not_implemented_op_discard(self, op, arglocs):
         not_implemented("not implemented operation: %s" % op.getopname())
 
@@ -2381,7 +2532,13 @@
             if gcrootmap.is_shadow_stack:
                 gcrootmap.write_callshape(mark, force_index)
             else:
-                self.mc.insert_gcroot_marker(mark)
+                if self.gcrootmap_retaddr_forced == 0:
+                    self.mc.insert_gcroot_marker(mark)   # common case
+                else:
+                    assert self.gcrootmap_retaddr_forced != -1, (
+                              "two mark_gc_roots() in a CALL_RELEASE_GIL")
+                    gcrootmap.put(self.gcrootmap_retaddr_forced, mark)
+                    self.gcrootmap_retaddr_forced = -1
 
     def target_arglocs(self, loop_token):
         return loop_token._x86_arglocs
@@ -2424,8 +2581,7 @@
             # there are two helpers to call only with asmgcc
             slowpath_addr1 = self.malloc_slowpath1
             self.mc.CALL(imm(slowpath_addr1))
-        self.mark_gc_roots(self.write_new_force_index(),
-                           use_copy_area=shadow_stack)
+        self.mark_gc_roots(self.write_new_force_index(), use_copy_area=True)
         slowpath_addr2 = self.malloc_slowpath2
         self.mc.CALL(imm(slowpath_addr2))
 
diff --git a/pypy/jit/backend/x86/codebuf.py b/pypy/jit/backend/x86/codebuf.py
--- a/pypy/jit/backend/x86/codebuf.py
+++ b/pypy/jit/backend/x86/codebuf.py
@@ -25,8 +25,11 @@
         self.init_block_builder()
         # a list of relative positions; for each position p, the bytes
         # at [p-4:p] encode an absolute address that will need to be
-        # made relative.
-        self.relocations = []
+        # made relative.  Only works on 32-bit!
+        if WORD == 4:
+            self.relocations = []
+        else:
+            self.relocations = None
         #
         # ResOperation --> offset in the assembly.
         # ops_offset[None] represents the beginning of the code after the last op
@@ -42,9 +45,10 @@
 
     def copy_to_raw_memory(self, addr):
         self._copy_to_raw_memory(addr)
-        for reloc in self.relocations:
-            p = addr + reloc
-            adr = rffi.cast(rffi.LONGP, p - WORD)
-            adr[0] = intmask(adr[0] - p)
+        if self.relocations is not None:
+            for reloc in self.relocations:
+                p = addr + reloc
+                adr = rffi.cast(rffi.LONGP, p - WORD)
+                adr[0] = intmask(adr[0] - p)
         valgrind.discard_translations(addr, self.get_relative_pos())
         self._dump(addr, "jit-backend-dump", backend_name)
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -29,6 +29,7 @@
     all_regs = [eax, ecx, edx, ebx, esi, edi]
     no_lower_byte_regs = [esi, edi]
     save_around_call_regs = [eax, edx, ecx]
+    frame_reg = ebp
 
     REGLOC_TO_GCROOTMAP_REG_INDEX = {
         ebx: 1,
@@ -312,8 +313,11 @@
                     self.fm.frame_bindings[arg] = loc
             else:
                 if isinstance(loc, RegLoc):
-                    self.rm.reg_bindings[arg] = loc
-                    used[loc] = None
+                    if loc is ebp:
+                        self.rm.bindings_to_frame_reg[arg] = None
+                    else:
+                        self.rm.reg_bindings[arg] = loc
+                        used[loc] = None
                 else:
                     self.fm.frame_bindings[arg] = loc
         self.rm.free_regs = []
@@ -705,6 +709,17 @@
         self.Perform(op, [loc0], loc1)
         self.rm.possibly_free_var(op.getarg(0))
 
+    def consider_cast_float_to_singlefloat(self, op):
+        loc0 = self.xrm.make_sure_var_in_reg(op.getarg(0))
+        loc1 = self.rm.force_allocate_reg(op.result)
+        self.xrm.possibly_free_var(op.getarg(0))
+        tmpxvar = TempBox()
+        loctmp = self.xrm.force_allocate_reg(tmpxvar)   # may be equal to loc0
+        self.xrm.possibly_free_var(tmpxvar)
+        self.Perform(op, [loc0, loctmp], loc1)
+
+    consider_cast_singlefloat_to_float = consider_cast_int_to_float
+
     def _consider_llong_binop_xx(self, op):
         # must force both arguments into xmm registers, because we don't
         # know if they will be suitably aligned.  Exception: if the second
@@ -832,8 +847,8 @@
 
     def consider_call(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex != EffectInfo.OS_NONE:
             if IS_X86_32:
                 # support for some of the llong operations,
                 # which only exist on x86-32
@@ -921,27 +936,13 @@
     def _do_fastpath_malloc(self, op, size, tid):
         gc_ll_descr = self.assembler.cpu.gc_ll_descr
         self.rm.force_allocate_reg(op.result, selected_reg=eax)
-
-        if gc_ll_descr.gcrootmap and gc_ll_descr.gcrootmap.is_shadow_stack:
-            # ---- shadowstack ----
-            # We need edx as a temporary, but otherwise don't save any more
-            # register.  See comments in _build_malloc_slowpath().
-            tmp_box = TempBox()
-            self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
-            self.rm.possibly_free_var(tmp_box)
-        else:
-            # ---- asmgcc ----
-            # We need to force-allocate each of save_around_call_regs now.
-            # The alternative would be to save and restore them around the
-            # actual call to malloc(), in the rare case where we need to do
-            # it; however, mark_gc_roots() would need to be adapted to know
-            # where the variables end up being saved.  Messy.
-            for reg in self.rm.save_around_call_regs:
-                if reg is not eax:
-                    tmp_box = TempBox()
-                    self.rm.force_allocate_reg(tmp_box, selected_reg=reg)
-                    self.rm.possibly_free_var(tmp_box)
-
+        #
+        # We need edx as a temporary, but otherwise don't save any more
+        # register.  See comments in _build_malloc_slowpath().
+        tmp_box = TempBox()
+        self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
+        self.rm.possibly_free_var(tmp_box)
+        #
         self.assembler.malloc_cond(
             gc_ll_descr.get_nursery_free_addr(),
             gc_ll_descr.get_nursery_top_addr(),
@@ -1337,20 +1338,32 @@
             if reg is eax:
                 continue      # ok to ignore this one
             if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
-                if use_copy_area:
-                    assert reg in self.rm.REGLOC_TO_COPY_AREA_OFS
-                    area_offset = self.rm.REGLOC_TO_COPY_AREA_OFS[reg]
-                    gcrootmap.add_frame_offset(shape, area_offset)
-                else:
-                    assert reg in self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX
-                    gcrootmap.add_callee_save_reg(
-                        shape, self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX[reg])
+                #
+                # The register 'reg' is alive across this call.
+                gcrootmap = self.assembler.cpu.gc_ll_descr.gcrootmap
+                if gcrootmap is None or not gcrootmap.is_shadow_stack:
+                    #
+                    # Asmgcc: if reg is a callee-save register, we can
+                    # explicitly mark it as containing a BoxPtr.
+                    if reg in self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX:
+                        gcrootmap.add_callee_save_reg(
+                            shape, self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX[reg])
+                        continue
+                #
+                # Else, 'use_copy_area' must be True (otherwise this BoxPtr
+                # should not be in a register).  The copy area contains the
+                # real value of the register.
+                assert use_copy_area
+                assert reg in self.rm.REGLOC_TO_COPY_AREA_OFS
+                area_offset = self.rm.REGLOC_TO_COPY_AREA_OFS[reg]
+                gcrootmap.add_frame_offset(shape, area_offset)
+        #
         return gcrootmap.compress_callshape(shape,
                                             self.assembler.datablockwrapper)
 
     def consider_force_token(self, op):
-        loc = self.rm.force_allocate_reg(op.result)
-        self.Perform(op, [], loc)
+        # the FORCE_TOKEN operation returns directly 'ebp'
+        self.rm.force_allocate_frame_reg(op.result)
 
     def not_implemented_op(self, op):
         not_implemented("not implemented operation: %s" % op.getopname())
diff --git a/pypy/jit/backend/x86/regloc.py b/pypy/jit/backend/x86/regloc.py
--- a/pypy/jit/backend/x86/regloc.py
+++ b/pypy/jit/backend/x86/regloc.py
@@ -521,6 +521,8 @@
     UCOMISD = _binaryop('UCOMISD')
     CVTSI2SD = _binaryop('CVTSI2SD')
     CVTTSD2SI = _binaryop('CVTTSD2SI')
+    CVTSD2SS = _binaryop('CVTSD2SS')
+    CVTSS2SD = _binaryop('CVTSS2SD')
     
     SQRTSD = _binaryop('SQRTSD')
 
@@ -534,6 +536,8 @@
     PXOR  = _binaryop('PXOR')
     PCMPEQD = _binaryop('PCMPEQD')
 
+    MOVD = _binaryop('MOVD')
+
     CALL = _relative_unaryop('CALL')
     JMP = _relative_unaryop('JMP')
 
diff --git a/pypy/jit/backend/x86/runner.py b/pypy/jit/backend/x86/runner.py
--- a/pypy/jit/backend/x86/runner.py
+++ b/pypy/jit/backend/x86/runner.py
@@ -19,6 +19,7 @@
 class AbstractX86CPU(AbstractLLCPU):
     debug = True
     supports_floats = True
+    supports_singlefloats = True
 
     BOOTSTRAP_TP = lltype.FuncType([], lltype.Signed)
     dont_keepalive_stuff = False # for tests
@@ -118,7 +119,8 @@
             setitem(index, null)
 
     def get_latest_force_token(self):
-        return self.assembler.fail_ebp + FORCE_INDEX_OFS
+        # the FORCE_TOKEN operation and this helper both return 'ebp'.
+        return self.assembler.fail_ebp
 
     def execute_token(self, executable_token):
         addr = executable_token._x86_bootstrap_code
@@ -152,8 +154,9 @@
                                        flavor='raw', zero=True,
                                        immortal=True)
 
-    def force(self, addr_of_force_index):
+    def force(self, addr_of_force_token):
         TP = rffi.CArrayPtr(lltype.Signed)
+        addr_of_force_index = addr_of_force_token + FORCE_INDEX_OFS
         fail_index = rffi.cast(TP, addr_of_force_index)[0]
         assert fail_index >= 0, "already forced!"
         faildescr = self.get_fail_descr_from_number(fail_index)
@@ -163,7 +166,7 @@
         # start of "no gc operation!" block
         fail_index_2 = self.assembler.grab_frame_values(
             bytecode,
-            addr_of_force_index - FORCE_INDEX_OFS,
+            addr_of_force_token,
             self.all_null_registers)
         self.assembler.leave_jitted_hook()
         # end of "no gc operation!" block
diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -527,6 +527,7 @@
 
     NOP = insn('\x90')
     RET = insn('\xC3')
+    RET16_i = insn('\xC2', immediate(1, 'h'))
 
     PUSH_r = insn(rex_nw, register(1), '\x50')
     PUSH_b = insn(rex_nw, '\xFF', orbyte(6<<3), stack_bp(1))
@@ -573,7 +574,8 @@
     BTS_jr = insn(rex_w, '\x0F\xAB', register(2,8), abs_, immediate(1))
 
     # x87 instructions
-    FSTP_b = insn('\xDD', orbyte(3<<3), stack_bp(1))
+    FSTPL_b = insn('\xDD', orbyte(3<<3), stack_bp(1)) # rffi.DOUBLE ('as' wants L??)
+    FSTPS_s = insn('\xD9', orbyte(3<<3), stack_sp(1)) # lltype.SingleFloat
 
     # ------------------------------ Random mess -----------------------
     RDTSC = insn('\x0F\x31')
@@ -590,8 +592,18 @@
     CVTTSD2SI_rx = xmminsn('\xF2', rex_w, '\x0F\x2C', register(1, 8), register(2), '\xC0')
     CVTTSD2SI_rb = xmminsn('\xF2', rex_w, '\x0F\x2C', register(1, 8), stack_bp(2))
 
-    MOVD_rx = xmminsn('\x66', rex_w, '\x0F\x7E', register(2, 8), register(1), '\xC0')
-    MOVD_xr = xmminsn('\x66', rex_w, '\x0F\x6E', register(1, 8), register(2), '\xC0')
+    CVTSD2SS_xx = xmminsn('\xF2', rex_nw, '\x0F\x5A',
+                          register(1, 8), register(2), '\xC0')
+    CVTSD2SS_xb = xmminsn('\xF2', rex_nw, '\x0F\x5A',
+                          register(1, 8), stack_bp(2))
+    CVTSS2SD_xx = xmminsn('\xF3', rex_nw, '\x0F\x5A',
+                          register(1, 8), register(2), '\xC0')
+    CVTSS2SD_xb = xmminsn('\xF3', rex_nw, '\x0F\x5A',
+                          register(1, 8), stack_bp(2))
+
+    MOVD_rx = xmminsn('\x66', rex_nw, '\x0F\x7E', register(2, 8), register(1), '\xC0')
+    MOVD_xr = xmminsn('\x66', rex_nw, '\x0F\x6E', register(1, 8), register(2), '\xC0')
+    MOVD_xb = xmminsn('\x66', rex_nw, '\x0F\x6E', register(1, 8), stack_bp(2))
 
     PSRAD_xi = xmminsn('\x66', rex_nw, '\x0F\x72', register(1), '\xE0', immediate(2, 'b'))
 
diff --git a/pypy/jit/backend/x86/test/test_gc_integration.py b/pypy/jit/backend/x86/test/test_gc_integration.py
--- a/pypy/jit/backend/x86/test/test_gc_integration.py
+++ b/pypy/jit/backend/x86/test/test_gc_integration.py
@@ -7,6 +7,7 @@
      BoxPtr, ConstPtr, TreeLoop
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.backend.llsupport.descr import GcCache
 from pypy.jit.backend.llsupport.gc import GcLLDescription
 from pypy.jit.backend.detect_cpu import getcpuclass
@@ -76,7 +77,8 @@
         for box in boxes:
             regalloc.rm.try_allocate_reg(box)
         TP = lltype.FuncType([], lltype.Signed)
-        calldescr = cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        calldescr = cpu.calldescrof(TP, TP.ARGS, TP.RESULT,
+                                    EffectInfo.MOST_GENERAL)
         regalloc.rm._check_invariants()
         box = boxes[0]
         regalloc.position = 0
diff --git a/pypy/jit/backend/x86/test/test_regalloc.py b/pypy/jit/backend/x86/test/test_regalloc.py
--- a/pypy/jit/backend/x86/test/test_regalloc.py
+++ b/pypy/jit/backend/x86/test/test_regalloc.py
@@ -16,6 +16,7 @@
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rpython.lltypesystem import rclass, rstr
 from pypy.jit.codewriter import longlong
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.backend.x86.rx86 import *
 
 def test_is_comparison_or_ovf_op():
@@ -92,7 +93,8 @@
     zd_addr = cpu.cast_int_to_adr(zero_division_tp)
     zero_division_error = llmemory.cast_adr_to_ptr(zd_addr,
                                             lltype.Ptr(rclass.OBJECT_VTABLE))
-    raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+    raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                        EffectInfo.MOST_GENERAL)
 
     fdescr1 = BasicFailDescr(1)
     fdescr2 = BasicFailDescr(2)
@@ -115,9 +117,12 @@
     f2ptr = llhelper(F2PTR, f2)
     f10ptr = llhelper(F10PTR, f10)
 
-    f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT)
-    f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT)
-    f10_calldescr = cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT)
+    f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
+    f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
+    f10_calldescr= cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
 
     namespace = locals().copy()
     type_system = 'lltype'
diff --git a/pypy/jit/backend/x86/test/test_regloc.py b/pypy/jit/backend/x86/test/test_regloc.py
--- a/pypy/jit/backend/x86/test/test_regloc.py
+++ b/pypy/jit/backend/x86/test/test_regloc.py
@@ -62,7 +62,7 @@
             assert mc.relocations == [5]
             expected = "\xE8" + struct.pack('<i', target - (rawstart + 5))
         elif IS_X86_64:
-            assert mc.relocations == []
+            assert mc.relocations is None
             if 0 <= target <= 0xffffffff:
                 assert length == 9
                 expected = (
diff --git a/pypy/jit/backend/x86/test/test_runner.py b/pypy/jit/backend/x86/test/test_runner.py
--- a/pypy/jit/backend/x86/test/test_runner.py
+++ b/pypy/jit/backend/x86/test/test_runner.py
@@ -433,6 +433,88 @@
                 ops_offset[operations[2]] <=
                 ops_offset[None])
 
+    def test_calling_convention(self, monkeypatch):
+        if WORD != 4:
+            py.test.skip("32-bit only test")
+        from pypy.jit.backend.x86.regloc import eax, edx
+        from pypy.jit.backend.x86 import codebuf
+        from pypy.jit.codewriter.effectinfo import EffectInfo
+        from pypy.rlib.libffi import types, clibffi
+        had_stdcall = hasattr(clibffi, 'FFI_STDCALL')
+        if not had_stdcall:    # not running on Windows, but we can still test
+            monkeypatch.setattr(clibffi, 'FFI_STDCALL', 12345, raising=False)
+        #
+        for ffi in [clibffi.FFI_DEFAULT_ABI, clibffi.FFI_STDCALL]:
+            cpu = self.cpu
+            mc = codebuf.MachineCodeBlockWrapper()
+            mc.MOV_rs(eax.value, 4)      # argument 1
+            mc.MOV_rs(edx.value, 40)     # argument 10
+            mc.SUB_rr(eax.value, edx.value)     # return arg1 - arg10
+            if ffi == clibffi.FFI_DEFAULT_ABI:
+                mc.RET()
+            else:
+                mc.RET16_i(40)
+            rawstart = mc.materialize(cpu.asmmemmgr, [])
+            #
+            calldescr = cpu.calldescrof_dynamic([types.slong] * 10,
+                                                types.slong,
+                                                EffectInfo.MOST_GENERAL,
+                                                ffi_flags=-1)
+            calldescr.get_call_conv = lambda: ffi      # <==== hack
+            funcbox = ConstInt(rawstart)
+            i1 = BoxInt()
+            i2 = BoxInt()
+            i3 = BoxInt()
+            i4 = BoxInt()
+            i5 = BoxInt()
+            i6 = BoxInt()
+            c = ConstInt(-1)
+            faildescr = BasicFailDescr(1)
+            # we must call it repeatedly: if the stack pointer gets increased
+            # by 40 bytes by the STDCALL call, and if we don't expect it,
+            # then we are going to get our stack emptied unexpectedly by
+            # several repeated calls
+            ops = [
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i3, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i4, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i5, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i6, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.FINISH, [i3, i4, i5, i6], None,
+                         descr=BasicFailDescr(0))
+            ]
+            ops[1].setfailargs([])
+            ops[3].setfailargs([])
+            ops[5].setfailargs([])
+            ops[7].setfailargs([])
+            looptoken = LoopToken()
+            self.cpu.compile_loop([i1, i2], ops, looptoken)
+
+            self.cpu.set_future_value_int(0, 123450)
+            self.cpu.set_future_value_int(1, 123408)
+            fail = self.cpu.execute_token(looptoken)
+            assert fail.identifier == 0
+            assert self.cpu.get_latest_value_int(0) == 42
+            assert self.cpu.get_latest_value_int(1) == 42
+            assert self.cpu.get_latest_value_int(2) == 42
+            assert self.cpu.get_latest_value_int(3) == 42
+
+
 class TestDebuggingAssembler(object):
     def setup_method(self, meth):
         self.cpu = CPU(rtyper=None, stats=FakeStats())
@@ -463,7 +545,7 @@
             self.cpu.finish_once()
         finally:
             debug._log = None
-        assert ('jit-backend-counts', [('debug_print', '0:10')]) in dlog
+        assert ('jit-backend-counts', [('debug_print', 'loop -1:10')]) in dlog
 
     def test_debugger_checksum(self):
         loop = """
diff --git a/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py b/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py
--- a/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py
+++ b/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py
@@ -36,6 +36,14 @@
 def hexdump(s):
     return ' '.join(["%02X" % ord(c) for c in s])
 
+def reduce_to_32bit(s):
+    if s[:2] != '%r':
+        return s
+    if s[2:].isdigit():
+        return s + 'd'
+    else:
+        return '%e' + s[2:]
+
 # ____________________________________________________________
 
 COUNT1 = 15
@@ -180,12 +188,14 @@
     ##        for m, extra in args:
     ##            if m in (i386.MODRM, i386.MODRM8) or all:
     ##                suffix = suffixes[sizes[m]] + suffix
-            if argmodes and not self.is_xmm_insn:
+            if (argmodes and not self.is_xmm_insn
+                         and not instrname.startswith('FSTP')):
                 suffix = suffixes[self.WORD]
             # Special case: On 64-bit CPUs, rx86 assumes 64-bit integer
             # operands when converting to/from floating point, so we need to
             # indicate that with a suffix
-            if (self.WORD == 8) and instrname.startswith('CVT'):
+            if (self.WORD == 8) and (instrname.startswith('CVT') and
+                                     'SI' in instrname):
                 suffix = suffixes[self.WORD]
 
             if instr_suffix is not None:
@@ -218,10 +228,10 @@
                 and ops[1].startswith('%r')):
                 # movq $xxx, %rax => movl $xxx, %eax
                 suffix = 'l'
-                if ops[1][2:].isdigit():
-                    ops[1] += 'd'
-                else:
-                    ops[1] = '%e' + ops[1][2:]
+                ops[1] = reduce_to_32bit(ops[1])
+            if instrname.lower() == 'movd':
+                ops[0] = reduce_to_32bit(ops[0])
+                ops[1] = reduce_to_32bit(ops[1])
             #
             op = '\t%s%s %s%s' % (instrname.lower(), suffix,
                                   ', '.join(ops), following)
diff --git a/pypy/jit/codewriter/assembler.py b/pypy/jit/codewriter/assembler.py
--- a/pypy/jit/codewriter/assembler.py
+++ b/pypy/jit/codewriter/assembler.py
@@ -76,6 +76,8 @@
                 TYPE = llmemory.Address
             if TYPE == llmemory.Address:
                 value = heaptracker.adr2int(value)
+            if TYPE is lltype.SingleFloat:
+                value = longlong.singlefloat2int(value)
             if not isinstance(value, (llmemory.AddressAsInt,
                                       ComputedIntSymbolic)):
                 value = lltype.cast_primitive(lltype.Signed, value)
diff --git a/pypy/jit/codewriter/call.py b/pypy/jit/codewriter/call.py
--- a/pypy/jit/codewriter/call.py
+++ b/pypy/jit/codewriter/call.py
@@ -6,7 +6,7 @@
 from pypy.jit.codewriter import support
 from pypy.jit.codewriter.jitcode import JitCode
 from pypy.jit.codewriter.effectinfo import (VirtualizableAnalyzer,
-    QuasiImmutAnalyzer, CanReleaseGILAnalyzer, effectinfo_from_writeanalyze,
+    QuasiImmutAnalyzer, RandomEffectsAnalyzer, effectinfo_from_writeanalyze,
     EffectInfo, CallInfoCollection)
 from pypy.translator.simplify import get_funcobj, get_functype
 from pypy.rpython.lltypesystem import lltype, llmemory
@@ -31,7 +31,7 @@
             self.readwrite_analyzer = ReadWriteAnalyzer(translator)
             self.virtualizable_analyzer = VirtualizableAnalyzer(translator)
             self.quasiimmut_analyzer = QuasiImmutAnalyzer(translator)
-            self.canreleasegil_analyzer = CanReleaseGILAnalyzer(translator)
+            self.randomeffects_analyzer = RandomEffectsAnalyzer(translator)
         #
         for index, jd in enumerate(jitdrivers_sd):
             jd.index = index
@@ -190,7 +190,7 @@
             fnaddr = llmemory.cast_ptr_to_adr(fnptr)
         NON_VOID_ARGS = [ARG for ARG in FUNC.ARGS if ARG is not lltype.Void]
         calldescr = self.cpu.calldescrof(FUNC, tuple(NON_VOID_ARGS),
-                                         FUNC.RESULT)
+                                         FUNC.RESULT, EffectInfo.MOST_GENERAL)
         return (fnaddr, calldescr)
 
     def getcalldescr(self, op, oopspecindex=EffectInfo.OS_NONE,
@@ -222,17 +222,21 @@
                 assert not NON_VOID_ARGS, ("arguments not supported for "
                                            "loop-invariant function!")
         # build the extraeffect
-        can_release_gil = self.canreleasegil_analyzer.analyze(op)
-        # can_release_gil implies can_invalidate
-        can_invalidate = can_release_gil or self.quasiimmut_analyzer.analyze(op)
+        random_effects = self.randomeffects_analyzer.analyze(op)
+        if random_effects:
+            extraeffect = EffectInfo.EF_RANDOM_EFFECTS
+        # random_effects implies can_invalidate
+        can_invalidate = random_effects or self.quasiimmut_analyzer.analyze(op)
         if extraeffect is None:
             if self.virtualizable_analyzer.analyze(op):
                 extraeffect = EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
             elif loopinvariant:
                 extraeffect = EffectInfo.EF_LOOPINVARIANT
             elif elidable:
-                # XXX check what to do about exceptions (also MemoryError?)
-                extraeffect = EffectInfo.EF_ELIDABLE
+                if self._canraise(op):
+                    extraeffect = EffectInfo.EF_ELIDABLE_CAN_RAISE
+                else:
+                    extraeffect = EffectInfo.EF_ELIDABLE_CANNOT_RAISE
             elif self._canraise(op):
                 extraeffect = EffectInfo.EF_CAN_RAISE
             else:
@@ -240,12 +244,10 @@
         #
         effectinfo = effectinfo_from_writeanalyze(
             self.readwrite_analyzer.analyze(op), self.cpu, extraeffect,
-            oopspecindex, can_invalidate, can_release_gil)
+            oopspecindex, can_invalidate)
         #
-        if oopspecindex != EffectInfo.OS_NONE:
-            assert effectinfo is not None
+        assert effectinfo is not None
         if elidable or loopinvariant:
-            assert effectinfo is not None
             assert extraeffect != EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
             # XXX this should also say assert not can_invalidate, but
             #     it can't because our analyzer is not good enough for now
@@ -265,8 +267,7 @@
 
     def calldescr_canraise(self, calldescr):
         effectinfo = calldescr.get_extra_info()
-        return (effectinfo is None or
-                effectinfo.extraeffect >= EffectInfo.EF_CAN_RAISE)
+        return effectinfo.check_can_raise()
 
     def jitdriver_sd_from_portal_graph(self, graph):
         for jd in self.jitdrivers_sd:
diff --git a/pypy/jit/codewriter/effectinfo.py b/pypy/jit/codewriter/effectinfo.py
--- a/pypy/jit/codewriter/effectinfo.py
+++ b/pypy/jit/codewriter/effectinfo.py
@@ -9,11 +9,13 @@
     _cache = {}
 
     # the 'extraeffect' field is one of the following values:
-    EF_ELIDABLE                        = 0 #elidable function (and cannot raise)
+    EF_ELIDABLE_CANNOT_RAISE           = 0 #elidable function (and cannot raise)
     EF_LOOPINVARIANT                   = 1 #special: call it only once per loop
     EF_CANNOT_RAISE                    = 2 #a function which cannot raise
-    EF_CAN_RAISE                       = 3 #normal function (can raise)
+    EF_ELIDABLE_CAN_RAISE              = 3 #elidable function (but can raise)
+    EF_CAN_RAISE                       = 4 #normal function (can raise)
     EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE = 5 #can raise and force virtualizables
+    EF_RANDOM_EFFECTS                  = 6 #can do whatever
 
     # the 'oopspecindex' field is one of the following values:
     OS_NONE                     = 0    # normal case, no oopspec
@@ -79,22 +81,32 @@
                 write_descrs_fields, write_descrs_arrays,
                 extraeffect=EF_CAN_RAISE,
                 oopspecindex=OS_NONE,
-                can_invalidate=False, can_release_gil=False):
-        key = (frozenset(readonly_descrs_fields),
-               frozenset(readonly_descrs_arrays),
-               frozenset(write_descrs_fields),
-               frozenset(write_descrs_arrays),
+                can_invalidate=False):
+        key = (frozenset_or_none(readonly_descrs_fields),
+               frozenset_or_none(readonly_descrs_arrays),
+               frozenset_or_none(write_descrs_fields),
+               frozenset_or_none(write_descrs_arrays),
                extraeffect,
                oopspecindex,
-               can_invalidate,
-               can_release_gil)
+               can_invalidate)
         if key in cls._cache:
             return cls._cache[key]
+        if extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+            assert readonly_descrs_fields is None
+            assert readonly_descrs_arrays is None
+            assert write_descrs_fields is None
+            assert write_descrs_arrays is None
+        else:
+            assert readonly_descrs_fields is not None
+            assert readonly_descrs_arrays is not None
+            assert write_descrs_fields is not None
+            assert write_descrs_arrays is not None
         result = object.__new__(cls)
         result.readonly_descrs_fields = readonly_descrs_fields
         result.readonly_descrs_arrays = readonly_descrs_arrays
         if extraeffect == EffectInfo.EF_LOOPINVARIANT or \
-           extraeffect == EffectInfo.EF_ELIDABLE:
+           extraeffect == EffectInfo.EF_ELIDABLE_CANNOT_RAISE or \
+           extraeffect == EffectInfo.EF_ELIDABLE_CAN_RAISE:
             result.write_descrs_fields = []
             result.write_descrs_arrays = []
         else:
@@ -102,11 +114,13 @@
             result.write_descrs_arrays = write_descrs_arrays
         result.extraeffect = extraeffect
         result.can_invalidate = can_invalidate
-        result.can_release_gil = can_release_gil
         result.oopspecindex = oopspecindex
         cls._cache[key] = result
         return result
 
+    def check_can_raise(self):
+        return self.extraeffect > self.EF_CANNOT_RAISE
+
     def check_can_invalidate(self):
         return self.can_invalidate
 
@@ -114,56 +128,71 @@
         return self.extraeffect >= self.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
 
     def has_random_effects(self):
-        return self.oopspecindex == self.OS_LIBFFI_CALL or self.can_release_gil
+        return self.extraeffect >= self.EF_RANDOM_EFFECTS
+
+
+def frozenset_or_none(x):
+    if x is None:
+        return None
+    return frozenset(x)
+
+EffectInfo.MOST_GENERAL = EffectInfo(None, None, None, None,
+                                     EffectInfo.EF_RANDOM_EFFECTS,
+                                     can_invalidate=True)
+
 
 def effectinfo_from_writeanalyze(effects, cpu,
                                  extraeffect=EffectInfo.EF_CAN_RAISE,
                                  oopspecindex=EffectInfo.OS_NONE,
-                                 can_invalidate=False,
-                                 can_release_gil=False):
+                                 can_invalidate=False):
     from pypy.translator.backendopt.writeanalyze import top_set
-    if effects is top_set:
-        return None
-    readonly_descrs_fields = []
-    readonly_descrs_arrays = []
-    write_descrs_fields = []
-    write_descrs_arrays = []
+    if effects is top_set or extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+        readonly_descrs_fields = None
+        readonly_descrs_arrays = None
+        write_descrs_fields = None
+        write_descrs_arrays = None
+        extraeffect = EffectInfo.EF_RANDOM_EFFECTS
+    else:
+        readonly_descrs_fields = []
+        readonly_descrs_arrays = []
+        write_descrs_fields = []
+        write_descrs_arrays = []
 
-    def add_struct(descrs_fields, (_, T, fieldname)):
-        T = deref(T)
-        if consider_struct(T, fieldname):
-            descr = cpu.fielddescrof(T, fieldname)
-            descrs_fields.append(descr)
+        def add_struct(descrs_fields, (_, T, fieldname)):
+            T = deref(T)
+            if consider_struct(T, fieldname):
+                descr = cpu.fielddescrof(T, fieldname)
+                descrs_fields.append(descr)
 
-    def add_array(descrs_arrays, (_, T)):
-        ARRAY = deref(T)
-        if consider_array(ARRAY):
-            descr = cpu.arraydescrof(ARRAY)
-            descrs_arrays.append(descr)
+        def add_array(descrs_arrays, (_, T)):
+            ARRAY = deref(T)
+            if consider_array(ARRAY):
+                descr = cpu.arraydescrof(ARRAY)
+                descrs_arrays.append(descr)
 
-    for tup in effects:
-        if tup[0] == "struct":
-            add_struct(write_descrs_fields, tup)
-        elif tup[0] == "readstruct":
-            tupw = ("struct",) + tup[1:]
-            if tupw not in effects:
-                add_struct(readonly_descrs_fields, tup)
-        elif tup[0] == "array":
-            add_array(write_descrs_arrays, tup)
-        elif tup[0] == "readarray":
-            tupw = ("array",) + tup[1:]
-            if tupw not in effects:
-                add_array(readonly_descrs_arrays, tup)
-        else:
-            assert 0
+        for tup in effects:
+            if tup[0] == "struct":
+                add_struct(write_descrs_fields, tup)
+            elif tup[0] == "readstruct":
+                tupw = ("struct",) + tup[1:]
+                if tupw not in effects:
+                    add_struct(readonly_descrs_fields, tup)
+            elif tup[0] == "array":
+                add_array(write_descrs_arrays, tup)
+            elif tup[0] == "readarray":
+                tupw = ("array",) + tup[1:]
+                if tupw not in effects:
+                    add_array(readonly_descrs_arrays, tup)
+            else:
+                assert 0
+    #
     return EffectInfo(readonly_descrs_fields,
                       readonly_descrs_arrays,
                       write_descrs_fields,
                       write_descrs_arrays,
                       extraeffect,
                       oopspecindex,
-                      can_invalidate,
-                      can_release_gil)
+                      can_invalidate)
 
 def consider_struct(TYPE, fieldname):
     if fieldType(TYPE, fieldname) is lltype.Void:
@@ -199,12 +228,13 @@
     def analyze_simple_operation(self, op, graphinfo):
         return op.opname == 'jit_force_quasi_immutable'
 
-class CanReleaseGILAnalyzer(BoolGraphAnalyzer):
+class RandomEffectsAnalyzer(BoolGraphAnalyzer):
     def analyze_direct_call(self, graph, seen=None):
-        releases_gil = False
         if hasattr(graph, "func") and hasattr(graph.func, "_ptr"):
-            releases_gil = graph.func._ptr._obj.releases_gil
-        return releases_gil or super(CanReleaseGILAnalyzer, self).analyze_direct_call(graph, seen)
+            if graph.func._ptr._obj.random_effects_on_gcobjs:
+                return True
+        return super(RandomEffectsAnalyzer, self).analyze_direct_call(graph,
+                                                                      seen)
 
     def analyze_simple_operation(self, op, graphinfo):
         return False
diff --git a/pypy/jit/codewriter/jitcode.py b/pypy/jit/codewriter/jitcode.py
--- a/pypy/jit/codewriter/jitcode.py
+++ b/pypy/jit/codewriter/jitcode.py
@@ -1,7 +1,6 @@
 from pypy.jit.metainterp.history import AbstractDescr
 from pypy.jit.codewriter import heaptracker
 from pypy.rlib.objectmodel import we_are_translated
-from pypy.rpython.lltypesystem import llmemory
 
 
 class JitCode(AbstractDescr):
@@ -102,7 +101,7 @@
 
     def _clone_if_mutable(self):
         raise NotImplementedError
-    
+
 class MissingLiveness(Exception):
     pass
 
diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -1,18 +1,17 @@
-import py, sys
-from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rclass
-from pypy.rpython import rlist
-from pypy.jit.metainterp.history import getkind
-from pypy.objspace.flow.model import SpaceOperation, Variable, Constant
-from pypy.objspace.flow.model import Block, Link, c_last_exception
-from pypy.jit.codewriter.flatten import ListOfKind, IndirectCallTargets
+import py
+
 from pypy.jit.codewriter import support, heaptracker, longlong
 from pypy.jit.codewriter.effectinfo import EffectInfo
+from pypy.jit.codewriter.flatten import ListOfKind, IndirectCallTargets
 from pypy.jit.codewriter.policy import log
+from pypy.jit.metainterp import quasiimmut
+from pypy.jit.metainterp.history import getkind
 from pypy.jit.metainterp.typesystem import deref, arrayItem
-from pypy.jit.metainterp import quasiimmut
-from pypy.rpython.rclass import IR_QUASIIMMUTABLE, IR_QUASIIMMUTABLE_ARRAY
+from pypy.objspace.flow.model import SpaceOperation, Variable, Constant, c_last_exception
 from pypy.rlib import objectmodel
 from pypy.rlib.jit import _we_are_jitted
+from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rclass, rffi
+from pypy.rpython.rclass import IR_QUASIIMMUTABLE, IR_QUASIIMMUTABLE_ARRAY
 from pypy.translator.simplify import get_funcobj
 from pypy.translator.unsimplify import varoftype
 
@@ -24,6 +23,11 @@
     t = Transformer(cpu, callcontrol, portal_jd)
     t.transform(graph)
 
+def integer_bounds(size, unsigned):
+    if unsigned:
+        return 0, 1 << (8 * size)
+    else:
+        return -(1 << (8 * size - 1)), 1 << (8 * size - 1)
 
 class Transformer(object):
     vable_array_vars = None
@@ -200,7 +204,6 @@
             self.vable_array_vars[op.result]= self.vable_array_vars[op.args[0]]
 
     rewrite_op_cast_pointer = rewrite_op_same_as
-    rewrite_op_cast_opaque_ptr = rewrite_op_same_as   # rlib.rerased
     def rewrite_op_cast_bool_to_int(self, op): pass
     def rewrite_op_cast_bool_to_uint(self, op): pass
     def rewrite_op_cast_char_to_int(self, op): pass
@@ -588,6 +591,7 @@
                 pure = '_pure'
         else:
             pure = ''
+        self.check_field_access(v_inst.concretetype.TO)
         argname = getattr(v_inst.concretetype.TO, '_gckind', 'gc')
         descr = self.cpu.fielddescrof(v_inst.concretetype.TO,
                                       c_fieldname.value)
@@ -621,6 +625,7 @@
             return [SpaceOperation('-live-', [], None),
                     SpaceOperation('setfield_vable_%s' % kind,
                                    [v_inst, descr, v_value], None)]
+        self.check_field_access(v_inst.concretetype.TO)
         argname = getattr(v_inst.concretetype.TO, '_gckind', 'gc')
         descr = self.cpu.fielddescrof(v_inst.concretetype.TO,
                                       c_fieldname.value)
@@ -633,6 +638,22 @@
         return (op.args[1].value == 'typeptr' and
                 op.args[0].concretetype.TO._hints.get('typeptr'))
 
+    def check_field_access(self, STRUCT):
+        # check against a GcStruct with a nested GcStruct as a first argument
+        # but which is not an object at all; see metainterp/test/test_loop,
+        # test_regular_pointers_in_short_preamble.
+        if not isinstance(STRUCT, lltype.GcStruct):
+            return
+        if STRUCT._first_struct() == (None, None):
+            return
+        PARENT = STRUCT
+        while not PARENT._hints.get('typeptr'):
+            _, PARENT = PARENT._first_struct()
+            if PARENT is None:
+                raise NotImplementedError("%r is a GcStruct using nesting but "
+                                          "not inheriting from object" %
+                                          (STRUCT,))
+
     def get_vinfo(self, v_virtualizable):
         if self.callcontrol is None:      # for tests
             return None
@@ -779,76 +800,127 @@
             raise NotImplementedError("cast_ptr_to_int")
 
     def rewrite_op_force_cast(self, op):
-        assert not self._is_gc(op.args[0])
-        fromll = longlong.is_longlong(op.args[0].concretetype)
-        toll   = longlong.is_longlong(op.result.concretetype)
-        if fromll and toll:
+        v_arg = op.args[0]
+        v_result = op.result
+        assert not self._is_gc(v_arg)
+
+        if v_arg.concretetype == v_result.concretetype:
             return
-        if fromll:
-            args = op.args
-            opname = 'truncate_longlong_to_int'
-            RESULT = lltype.Signed
-            v = varoftype(RESULT)
-            op1 = SpaceOperation(opname, args, v)
-            op2 = self.rewrite_operation(op1)
-            oplist = self.force_cast_without_longlong(op2.result, op.result)
+
+        float_arg = v_arg.concretetype in [lltype.Float, lltype.SingleFloat]
+        float_res = v_result.concretetype in [lltype.Float, lltype.SingleFloat]
+        if not float_arg and not float_res:
+            # some int -> some int cast
+            return self._int_to_int_cast(v_arg, v_result)
+        elif float_arg and float_res:
+            # some float -> some float cast
+            return self._float_to_float_cast(v_arg, v_result)
+        elif not float_arg and float_res:
+            # some int -> some float
+            ops = []
+            v1 = varoftype(lltype.Signed)
+            oplist = self.rewrite_operation(
+                SpaceOperation('force_cast', [v_arg], v1)
+            )
             if oplist:
-                return [op2] + oplist
-            #
-            # force a renaming to put the correct result in place, even though
-            # it might be slightly mistyped (e.g. Signed versus Unsigned)
-            assert op2.result is v
-            op2.result = op.result
-            return op2
-        elif toll:
-            from pypy.rpython.lltypesystem import rffi
-            size, unsigned = rffi.size_and_sign(op.args[0].concretetype)
-            if unsigned:
+                ops.extend(oplist)
+            else:
+                v1 = v_arg
+            v2 = varoftype(lltype.Float)
+            op = self.rewrite_operation(
+                SpaceOperation('cast_int_to_float', [v1], v2)
+            )
+            ops.append(op)
+            op2 = self.rewrite_operation(
+                SpaceOperation('force_cast', [v2], v_result)
+            )
+            if op2:
+                ops.append(op2)
+            else:
+                op.result = v_result
+            return ops
+        elif float_arg and not float_res:
+            # some float -> some int
+            ops = []
+            v1 = varoftype(lltype.Float)
+            op1 = self.rewrite_operation(
+                SpaceOperation('force_cast', [v_arg], v1)
+            )
+            if op1:
+                ops.append(op1)
+            else:
+                v1 = v_arg
+            v2 = varoftype(lltype.Signed)
+            op = self.rewrite_operation(
+                SpaceOperation('cast_float_to_int', [v1], v2)
+            )
+            ops.append(op)
+            oplist = self.rewrite_operation(
+                SpaceOperation('force_cast', [v2], v_result)
+            )
+            if oplist:
+                ops.extend(oplist)
+            else:
+                op.result = v_result
+            return ops
+        else:
+            assert False
+
+    def _int_to_int_cast(self, v_arg, v_result):
+        longlong_arg = longlong.is_longlong(v_arg.concretetype)
+        longlong_res = longlong.is_longlong(v_result.concretetype)
+        size1, unsigned1 = rffi.size_and_sign(v_arg.concretetype)
+        size2, unsigned2 = rffi.size_and_sign(v_result.concretetype)
+
+        if longlong_arg and longlong_res:
+            return
+        elif longlong_arg:
+            v = varoftype(lltype.Signed)
+            op1 = self.rewrite_operation(
+                SpaceOperation('truncate_longlong_to_int', [v_arg], v)
+            )
+            op2 = SpaceOperation('force_cast', [v], v_result)
+            oplist = self.rewrite_operation(op2)
+            if not oplist:
+                op1.result = v_result
+                oplist = []
+            return [op1] + oplist
+        elif longlong_res:
+            if unsigned1:
                 INTERMEDIATE = lltype.Unsigned
             else:
                 INTERMEDIATE = lltype.Signed
             v = varoftype(INTERMEDIATE)
-            oplist = self.force_cast_without_longlong(op.args[0], v)
+            op1 = SpaceOperation('force_cast', [v_arg], v)
+            oplist = self.rewrite_operation(op1)
             if not oplist:
-                v = op.args[0]
+                v = v_arg
                 oplist = []
-            if unsigned:
+            if unsigned1:
                 opname = 'cast_uint_to_longlong'
             else:
                 opname = 'cast_int_to_longlong'
-            op1 = SpaceOperation(opname, [v], op.result)
-            op2 = self.rewrite_operation(op1)
+            op2 = self.rewrite_operation(
+                SpaceOperation(opname, [v], v_result)
+            )
             return oplist + [op2]
-        else:
-            return self.force_cast_without_longlong(op.args[0], op.result)
 
-    def force_cast_without_longlong(self, v_arg, v_result):
-        from pypy.rpython.lltypesystem.rffi import size_and_sign, sizeof, FLOAT
-        from pypy.rlib.rarithmetic import intmask
-        #
-        if (v_result.concretetype in (FLOAT, lltype.Float) or
-            v_arg.concretetype in (FLOAT, lltype.Float)):
-            assert (v_result.concretetype == lltype.Float and
-                    v_arg.concretetype == lltype.Float), "xxx unsupported cast"
+        # We've now, ostensibly, dealt with the longlongs, everything should be
+        # a Signed or smaller
+        assert size1 <= rffi.sizeof(lltype.Signed)
+        assert size2 <= rffi.sizeof(lltype.Signed)
+
+        # the target type is LONG or ULONG
+        if size2 == rffi.sizeof(lltype.Signed):
             return
-        #
-        size2, unsigned2 = size_and_sign(v_result.concretetype)
-        assert size2 <= sizeof(lltype.Signed)
-        if size2 == sizeof(lltype.Signed):
-            return     # the target type is LONG or ULONG
-        size1, unsigned1 = size_and_sign(v_arg.concretetype)
-        assert size1 <= sizeof(lltype.Signed)
-        #
-        def bounds(size, unsigned):
-            if unsigned:
-                return 0, 1<<(8*size)
-            else:
-                return -(1<<(8*size-1)), 1<<(8*size-1)
-        min1, max1 = bounds(size1, unsigned1)
-        min2, max2 = bounds(size2, unsigned2)
+
+        min1, max1 = integer_bounds(size1, unsigned1)
+        min2, max2 = integer_bounds(size2, unsigned2)
+
+        # the target type includes the source range
         if min2 <= min1 <= max1 <= max2:
-            return     # the target type includes the source range
-        #
+            return
+
         result = []
         if min2:
             c_min2 = Constant(min2, lltype.Signed)
@@ -856,17 +928,29 @@
             result.append(SpaceOperation('int_sub', [v_arg, c_min2], v2))
         else:
             v2 = v_arg
-        c_mask = Constant(int((1<<(8*size2))-1), lltype.Signed)
-        v3 = varoftype(lltype.Signed)
+        c_mask = Constant(int((1 << (8 * size2)) - 1), lltype.Signed)
+        if min2:
+            v3 = varoftype(lltype.Signed)
+        else:
+            v3 = v_result
         result.append(SpaceOperation('int_and', [v2, c_mask], v3))
         if min2:
             result.append(SpaceOperation('int_add', [v3, c_min2], v_result))
-        else:
-            result[-1].result = v_result
         return result
 
+    def _float_to_float_cast(self, v_arg, v_result):
+        if v_arg.concretetype == lltype.SingleFloat:
+            assert v_result.concretetype == lltype.Float, "cast %s -> %s" % (
+                v_arg.concretetype, v_result.concretetype)
+            return SpaceOperation('cast_singlefloat_to_float', [v_arg],
+                                  v_result)
+        if v_result.concretetype == lltype.SingleFloat:
+            assert v_arg.concretetype == lltype.Float, "cast %s -> %s" % (
+                v_arg.concretetype, v_result.concretetype)
+            return SpaceOperation('cast_float_to_singlefloat', [v_arg],
+                                  v_result)
+
     def rewrite_op_direct_ptradd(self, op):
-        from pypy.rpython.lltypesystem import rffi
         # xxx otherwise, not implemented:
         assert op.args[0].concretetype == rffi.CCHARP
         #
@@ -919,7 +1003,7 @@
                 op1 = self.prepare_builtin_call(op, "llong_%s", args)
                 op2 = self._handle_oopspec_call(op1, args,
                                                 EffectInfo.OS_LLONG_%s,
-                                                EffectInfo.EF_ELIDABLE)
+                                           EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
                 if %r == "TO_INT":
                     assert op2.result.concretetype == lltype.Signed
                 return op2
@@ -1380,15 +1464,15 @@
                     otherindex += EffectInfo._OS_offset_uni
                 self._register_extra_helper(otherindex, othername,
                                             argtypes, resulttype,
-                                            EffectInfo.EF_ELIDABLE)
+                                           EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
         #
         return self._handle_oopspec_call(op, args, dict[oopspec_name],
-                                         EffectInfo.EF_ELIDABLE)
+                                         EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
 
     def _handle_str2unicode_call(self, op, oopspec_name, args):
-        # ll_str2unicode is not EF_ELIDABLE, because it can raise
-        # UnicodeDecodeError...
-        return self._handle_oopspec_call(op, args, EffectInfo.OS_STR2UNICODE)
+        # ll_str2unicode can raise UnicodeDecodeError
+        return self._handle_oopspec_call(op, args, EffectInfo.OS_STR2UNICODE,
+                                         EffectInfo.EF_ELIDABLE_CAN_RAISE)
 
     # ----------
     # VirtualRefs.
@@ -1412,7 +1496,7 @@
             extraeffect = EffectInfo.EF_CANNOT_RAISE
         elif oopspec_name.startswith('libffi_call_'):
             oopspecindex = EffectInfo.OS_LIBFFI_CALL
-            extraeffect = EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
+            extraeffect = EffectInfo.EF_RANDOM_EFFECTS
         else:
             assert False, 'unsupported oopspec: %s' % oopspec_name
         return self._handle_oopspec_call(op, args, oopspecindex, extraeffect)
@@ -1426,13 +1510,13 @@
         assert vinfo is not None
         self.vable_flags[op.args[0]] = op.args[2].value
         return []
-        
+
     # ---------
     # ll_math.sqrt_nonneg()
-    
+
     def _handle_math_sqrt_call(self, op, oopspec_name, args):
         return self._handle_oopspec_call(op, args, EffectInfo.OS_MATH_SQRT,
-                                         EffectInfo.EF_ELIDABLE)
+                                         EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
 
     def rewrite_op_jit_force_quasi_immutable(self, op):
         v_inst, c_fieldname = op.args
diff --git a/pypy/jit/codewriter/longlong.py b/pypy/jit/codewriter/longlong.py
--- a/pypy/jit/codewriter/longlong.py
+++ b/pypy/jit/codewriter/longlong.py
@@ -7,7 +7,8 @@
 """
 
 import sys
-from pypy.rpython.lltypesystem import lltype
+from pypy.rpython.lltypesystem import lltype, rffi
+from pypy.rlib import rarithmetic, longlong2float
 
 
 if sys.maxint > 2147483647:
@@ -31,8 +32,6 @@
     # ---------- 32-bit platform ----------
     # the type FloatStorage is r_longlong, and conversion is needed
 
-    from pypy.rlib import rarithmetic, longlong2float
-
     is_64_bit = False
     supports_longlong = True
     r_float_storage = rarithmetic.r_longlong
@@ -41,9 +40,19 @@
     getfloatstorage = longlong2float.float2longlong
     getrealfloat    = longlong2float.longlong2float
     gethash         = lambda xll: rarithmetic.intmask(xll - (xll >> 32))
-    is_longlong     = lambda TYPE: (TYPE == lltype.SignedLongLong or
-                                    TYPE == lltype.UnsignedLongLong)
+    is_longlong     = lambda TYPE: (TYPE is lltype.SignedLongLong or
+                                    TYPE is lltype.UnsignedLongLong)
 
     # -------------------------------------
 
 ZEROF = getfloatstorage(0.0)
+
+# ____________________________________________________________
+
+def int2singlefloat(x):
+    x = rffi.r_uint(x)
+    return longlong2float.uint2singlefloat(x)
+
+def singlefloat2int(x):
+    x = longlong2float.singlefloat2uint(x)
+    return rffi.cast(lltype.Signed, x)
diff --git a/pypy/jit/codewriter/policy.py b/pypy/jit/codewriter/policy.py
--- a/pypy/jit/codewriter/policy.py
+++ b/pypy/jit/codewriter/policy.py
@@ -1,9 +1,7 @@
-from pypy.translator.simplify import get_funcobj
 from pypy.jit.metainterp import history
-from pypy.rpython.lltypesystem import lltype, rclass
 from pypy.tool.udir import udir
 
-import py, sys
+import py
 from pypy.tool.ansi_print import ansi_log
 log = py.log.Producer('jitcodewriter')
 py.log.setconsumer('jitcodewriter', ansi_log)
@@ -14,6 +12,7 @@
         self.unsafe_loopy_graphs = set()
         self.supports_floats = False
         self.supports_longlong = False
+        self.supports_singlefloats = False
 
     def set_supports_floats(self, flag):
         self.supports_floats = flag
@@ -21,6 +20,9 @@
     def set_supports_longlong(self, flag):
         self.supports_longlong = flag
 
+    def set_supports_singlefloats(self, flag):
+        self.supports_singlefloats = flag
+
     def dump_unsafe_loops(self):
         f = udir.join("unsafe-loops.txt").open('w')
         strs = [str(graph) for graph in self.unsafe_loopy_graphs]
@@ -62,8 +64,9 @@
                     func, '_jit_unroll_if_const_', False)
 
         unsupported = contains_unsupported_variable_type(graph,
-                                                         self.supports_floats,
-                                                         self.supports_longlong)
+                            self.supports_floats,
+                            self.supports_longlong,
+                            self.supports_singlefloats)
         res = see_function and not unsupported
         if res and contains_loop:
             self.unsafe_loopy_graphs.add(graph)
@@ -84,17 +87,24 @@
         return res
 
 def contains_unsupported_variable_type(graph, supports_floats,
-                                       supports_longlong):
+                                              supports_longlong,
+                                              supports_singlefloats):
     getkind = history.getkind
     try:
         for block in graph.iterblocks():
             for v in block.inputargs:
-                getkind(v.concretetype, supports_floats, supports_longlong)
+                getkind(v.concretetype, supports_floats,
+                                        supports_longlong,
+                                        supports_singlefloats)
             for op in block.operations:
                 for v in op.args:
-                    getkind(v.concretetype, supports_floats, supports_longlong)
+                    getkind(v.concretetype, supports_floats,
+                                            supports_longlong,
+                                            supports_singlefloats)
                 v = op.result
-                getkind(v.concretetype, supports_floats, supports_longlong)
+                getkind(v.concretetype, supports_floats,
+                                        supports_longlong,
+                                        supports_singlefloats)
     except NotImplementedError, e:
         log.WARNING('%s, ignoring graph' % (e,))
         log.WARNING('  %s' % (graph,))
diff --git a/pypy/jit/codewriter/regalloc.py b/pypy/jit/codewriter/regalloc.py
--- a/pypy/jit/codewriter/regalloc.py
+++ b/pypy/jit/codewriter/regalloc.py
@@ -1,129 +1,8 @@
-import sys
-from pypy.objspace.flow.model import Variable
-from pypy.tool.algo.color import DependencyGraph
-from pypy.tool.algo.unionfind import UnionFind
+from pypy.tool.algo import regalloc
 from pypy.jit.metainterp.history import getkind
 from pypy.jit.codewriter.flatten import ListOfKind
 
+
 def perform_register_allocation(graph, kind):
-    """Perform register allocation for the Variables of the given 'kind'
-    in the 'graph'."""
-    regalloc = RegAllocator(graph, kind)
-    regalloc.make_dependencies()
-    regalloc.coalesce_variables()
-    regalloc.find_node_coloring()
-    return regalloc
-
-
-class RegAllocator(object):
-    DEBUG_REGALLOC = False
-
-    def __init__(self, graph, kind):
-        self.graph = graph
-        self.kind = kind
-
-    def make_dependencies(self):
-        dg = DependencyGraph()
-        for block in self.graph.iterblocks():
-            # Compute die_at = {Variable: index_of_operation_with_last_usage}
-            die_at = dict.fromkeys(block.inputargs, 0)
-            for i, op in enumerate(block.operations):
-                for v in op.args:
-                    if isinstance(v, Variable):
-                        die_at[v] = i
-                    elif isinstance(v, ListOfKind):
-                        for v1 in v:
-                            if isinstance(v1, Variable):
-                                die_at[v1] = i
-                if op.result is not None:
-                    die_at[op.result] = i + 1
-            if isinstance(block.exitswitch, tuple):
-                for x in block.exitswitch:
-                    die_at.pop(x, None)
-            else:
-                die_at.pop(block.exitswitch, None)
-            for link in block.exits:
-                for v in link.args:
-                    die_at.pop(v, None)
-            die_at = [(value, key) for (key, value) in die_at.items()]
-            die_at.sort()
-            die_at.append((sys.maxint,))
-            # Done.  XXX the code above this line runs 3 times
-            # (for kind in KINDS) to produce the same result...
-            livevars = [v for v in block.inputargs
-                          if getkind(v.concretetype) == self.kind]
-            # Add the variables of this block to the dependency graph
-            for i, v in enumerate(livevars):
-                dg.add_node(v)
-                for j in range(i):
-                    dg.add_edge(livevars[j], v)
-            livevars = set(livevars)
-            die_index = 0
-            for i, op in enumerate(block.operations):
-                while die_at[die_index][0] == i:
-                    try:
-                        livevars.remove(die_at[die_index][1])
-                    except KeyError:
-                        pass
-                    die_index += 1
-                if (op.result is not None and
-                    getkind(op.result.concretetype) == self.kind):
-                    dg.add_node(op.result)
-                    for v in livevars:
-                        if getkind(v.concretetype) == self.kind:
-                            dg.add_edge(v, op.result)
-                    livevars.add(op.result)
-        self._depgraph = dg
-
-    def coalesce_variables(self):
-        self._unionfind = UnionFind()
-        pendingblocks = list(self.graph.iterblocks())
-        while pendingblocks:
-            block = pendingblocks.pop()
-            # Aggressively try to coalesce each source variable with its
-            # target.  We start from the end of the graph instead of
-            # from the beginning.  This is a bit arbitrary, but the idea
-            # is that the end of the graph runs typically more often
-            # than the start, given that we resume execution from the
-            # middle during blackholing.
-            for link in block.exits:
-                if link.last_exception is not None:
-                    self._depgraph.add_node(link.last_exception)
-                if link.last_exc_value is not None:
-                    self._depgraph.add_node(link.last_exc_value)
-                for i, v in enumerate(link.args):
-                    self._try_coalesce(v, link.target.inputargs[i])
-
-    def _try_coalesce(self, v, w):
-        if isinstance(v, Variable) and getkind(v.concretetype) == self.kind:
-            assert getkind(w.concretetype) == self.kind
-            dg = self._depgraph
-            uf = self._unionfind
-            v0 = uf.find_rep(v)
-            w0 = uf.find_rep(w)
-            if v0 is not w0 and v0 not in dg.neighbours[w0]:
-                _, rep, _ = uf.union(v0, w0)
-                assert uf.find_rep(v0) is uf.find_rep(w0) is rep
-                if rep is v0:
-                    dg.coalesce(w0, v0)
-                else:
-                    assert rep is w0
-                    dg.coalesce(v0, w0)
-
-    def find_node_coloring(self):
-        self._coloring = self._depgraph.find_node_coloring()
-        if self.DEBUG_REGALLOC:
-            for block in self.graph.iterblocks():
-                print block
-                for v in block.getvariables():
-                    print '\t', v, '\t', self.getcolor(v)
-
-    def getcolor(self, v):
-        return self._coloring[self._unionfind.find_rep(v)]
-
-    def swapcolors(self, col1, col2):
-        for key, value in self._coloring.items():
-            if value == col1:
-                self._coloring[key] = col2
-            elif value == col2:
-                self._coloring[key] = col1
+    checkkind = lambda v: getkind(v.concretetype) == kind
+    return regalloc.perform_register_allocation(graph, checkkind, ListOfKind)
diff --git a/pypy/jit/codewriter/support.py b/pypy/jit/codewriter/support.py
--- a/pypy/jit/codewriter/support.py
+++ b/pypy/jit/codewriter/support.py
@@ -20,6 +20,7 @@
 from pypy.rpython.annlowlevel import MixLevelHelperAnnotator
 from pypy.jit.metainterp.typesystem import deref
 from pypy.rlib import rgc
+from pypy.rlib.jit import elidable
 from pypy.rlib.rarithmetic import r_longlong, r_ulonglong, r_uint, intmask
 
 def getargtypes(annotator, values):
@@ -90,9 +91,12 @@
     reds_v = op.args[2+numgreens:]
     assert len(reds_v) == numreds
     #
-    def _sort(args_v):
+    def _sort(args_v, is_green):
         from pypy.jit.metainterp.history import getkind
         lst = [v for v in args_v if v.concretetype is not lltype.Void]
+        if is_green:
+            assert len(lst) == len(args_v), (
+                "not supported so far: 'greens' variables contain Void")
         _kind2count = {'int': 1, 'ref': 2, 'float': 3}
         lst2 = sorted(lst, key=lambda v: _kind2count[getkind(v.concretetype)])
         # a crash here means that you have to reorder the variable named in
@@ -101,7 +105,7 @@
         assert lst == lst2
         return lst
     #
-    return (_sort(greens_v), _sort(reds_v))
+    return (_sort(greens_v, True), _sort(reds_v, False))
 
 def maybe_on_top_of_llinterp(rtyper, fnptr):
     # Run a generated graph on top of the llinterp for testing.
@@ -167,9 +171,14 @@
 
 _ll_5_list_ll_arraycopy = rgc.ll_arraycopy
 
+ at elidable
 def _ll_1_gc_identityhash(x):
     return lltype.identityhash(x)
 
+# the following function should not be "@elidable": I can think of
+# a corner case in which id(const) is constant-folded, and then 'const'
+# disappears and is collected too early (possibly causing another object
+# with the same id() to appear).
 def _ll_1_gc_id(ptr):
     return llop.gc_id(lltype.Signed, ptr)
 
@@ -420,10 +429,6 @@
     _ll_1_dict_values.need_result_type = True
     _ll_1_dict_items .need_result_type = True
 
-    def _ll_1_newdictiter(ITER, d):
-        return ll_rdict.ll_dictiter(lltype.Ptr(ITER), d)
-    _ll_1_newdictiter.need_result_type = True
-
     _dictnext_keys   = staticmethod(ll_rdict.ll_dictnext_group['keys'])
     _dictnext_values = staticmethod(ll_rdict.ll_dictnext_group['values'])
     _dictnext_items  = staticmethod(ll_rdict.ll_dictnext_group['items'])
@@ -574,10 +579,6 @@
     _ll_1_dict_values.need_result_type = True
     _ll_1_dict_items .need_result_type = True
 
-    def _ll_1_newdictiter(ITER, d):
-        return oo_rdict.ll_dictiter(ITER, d)
-    _ll_1_newdictiter.need_result_type = True
-
     _dictnext_keys   = staticmethod(oo_rdict.ll_dictnext_group['keys'])
     _dictnext_values = staticmethod(oo_rdict.ll_dictnext_group['values'])
     _dictnext_items  = staticmethod(oo_rdict.ll_dictnext_group['items'])
diff --git a/pypy/jit/codewriter/test/test_call.py b/pypy/jit/codewriter/test/test_call.py
--- a/pypy/jit/codewriter/test/test_call.py
+++ b/pypy/jit/codewriter/test/test_call.py
@@ -202,4 +202,4 @@
     [block, _] = list(f_graph.iterblocks())
     [op] = block.operations
     call_descr = cc.getcalldescr(op)
-    assert call_descr.extrainfo.can_release_gil
+    assert call_descr.extrainfo.has_random_effects()
diff --git a/pypy/jit/codewriter/test/test_codewriter.py b/pypy/jit/codewriter/test/test_codewriter.py
--- a/pypy/jit/codewriter/test/test_codewriter.py
+++ b/pypy/jit/codewriter/test/test_codewriter.py
@@ -5,7 +5,7 @@
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
 
 class FakeCallDescr(AbstractDescr):
-    def __init__(self, FUNC, ARGS, RESULT, effectinfo=None):
+    def __init__(self, FUNC, ARGS, RESULT, effectinfo):
         self.FUNC = FUNC
         self.ARGS = ARGS
         self.RESULT = RESULT
diff --git a/pypy/jit/codewriter/test/test_flatten.py b/pypy/jit/codewriter/test/test_flatten.py
--- a/pypy/jit/codewriter/test/test_flatten.py
+++ b/pypy/jit/codewriter/test/test_flatten.py
@@ -50,7 +50,7 @@
     def __init__(self, rtyper):
         rtyper._builtin_func_for_spec_cache = FakeDict()
         self.rtyper = rtyper
-    def calldescrof(self, FUNC, ARGS, RESULT):
+    def calldescrof(self, FUNC, ARGS, RESULT, effectinfo):
         return FakeDescr()
     def fielddescrof(self, STRUCT, name):
         return FakeDescr()
@@ -324,7 +324,7 @@
     def test_exc_exitswitch(self):
         def g(i):
             pass
-        
+
         def f(i):
             try:
                 g(i)
@@ -854,13 +854,51 @@
             int_return %i0
         """, transform=True)
 
-    def test_force_cast_float(self):
+    def test_force_cast_floats(self):
         from pypy.rpython.lltypesystem import rffi
+        # Caststs to lltype.Float
         def f(n):
             return rffi.cast(lltype.Float, n)
         self.encoding_test(f, [12.456], """
             float_return %f0
         """, transform=True)
+        self.encoding_test(f, [rffi.cast(rffi.SIGNEDCHAR, 42)], """
+            cast_int_to_float %i0 -> %f0
+            float_return %f0
+        """, transform=True)
+
+        # Casts to lltype.SingleFloat
+        def g(n):
+            return rffi.cast(lltype.SingleFloat, n)
+        self.encoding_test(g, [12.456], """
+            cast_float_to_singlefloat %f0 -> %i0
+            int_return %i0
+        """, transform=True)
+        self.encoding_test(g, [rffi.cast(rffi.SIGNEDCHAR, 42)], """
+            cast_int_to_float %i0 -> %f0
+            cast_float_to_singlefloat %f0 -> %i1
+            int_return %i1
+        """, transform=True)
+
+        # Casts from floats
+        def f(n):
+            return rffi.cast(rffi.SIGNEDCHAR, n)
+        self.encoding_test(f, [12.456], """
+            cast_float_to_int %f0 -> %i0
+            int_sub %i0, $-128 -> %i1
+            int_and %i1, $255 -> %i2
+            int_add %i2, $-128 -> %i3
+            int_return %i3
+        """, transform=True)
+        self.encoding_test(f, [rffi.cast(lltype.SingleFloat, 12.456)], """
+            cast_singlefloat_to_float %i0 -> %f0
+            cast_float_to_int %f0 -> %i1
+            int_sub %i1, $-128 -> %i2
+            int_and %i2, $255 -> %i3
+            int_add %i3, $-128 -> %i4
+            int_return %i4
+        """, transform=True)
+
 
     def test_direct_ptradd(self):
         from pypy.rpython.lltypesystem import rffi
diff --git a/pypy/jit/codewriter/test/test_jtransform.py b/pypy/jit/codewriter/test/test_jtransform.py
--- a/pypy/jit/codewriter/test/test_jtransform.py
+++ b/pypy/jit/codewriter/test/test_jtransform.py
@@ -135,9 +135,9 @@
             assert argtypes[0] == [v.concretetype for v in op.args[1:]]
             assert argtypes[1] == op.result.concretetype
             if oopspecindex == EI.OS_STR2UNICODE:
-                assert extraeffect == None    # not pure, can raise!
+                assert extraeffect == EI.EF_ELIDABLE_CAN_RAISE
             else:
-                assert extraeffect == EI.EF_ELIDABLE
+                assert extraeffect == EI.EF_ELIDABLE_CANNOT_RAISE
         return 'calldescr-%d' % oopspecindex
     def calldescr_canraise(self, calldescr):
         return False
@@ -797,7 +797,7 @@
         def get_vinfo(self, v):
             return None
         def could_be_green_field(self, S1, name1):
-            assert S1 is S
+            assert S1 == S
             assert name1 == 'x'
             return True
     S = lltype.GcStruct('S', ('x', lltype.Char),
@@ -1042,3 +1042,13 @@
         assert op1.opname == 'jit_force_quasi_immutable'
         assert op1.args[0] == v_x
         assert op1.args[1] == ('fielddescr', STRUCT, 'mutate_x')
+
+def test_no_gcstruct_nesting_outside_of_OBJECT():
+    PARENT = lltype.GcStruct('parent')
+    STRUCT = lltype.GcStruct('struct', ('parent', PARENT),
+                                       ('x', lltype.Signed))
+    v_x = varoftype(lltype.Ptr(STRUCT))
+    op = SpaceOperation('getfield', [v_x, Constant('x', lltype.Void)],
+                        varoftype(lltype.Signed))
+    tr = Transformer(None, None)
+    raises(NotImplementedError, tr.rewrite_operation, op)
diff --git a/pypy/jit/codewriter/test/test_longlong.py b/pypy/jit/codewriter/test/test_longlong.py
--- a/pypy/jit/codewriter/test/test_longlong.py
+++ b/pypy/jit/codewriter/test/test_longlong.py
@@ -230,3 +230,18 @@
             assert list(op1.args[3]) == []
             assert list(op1.args[4]) == vlist
             assert op1.result == v_result
+
+
+##def test_singlefloat_constants():
+##    v_x = varoftype(TYPE)
+##    vlist = [v_x, const(rffi.cast(TYPE, 7))]
+##    v_result = varoftype(TYPE)
+##    op = SpaceOperation('llong_add', vlist, v_result)
+##    tr = Transformer(FakeCPU(), FakeBuiltinCallControl())
+##    op1 = tr.rewrite_operation(op)
+##    #
+##    assert op1.opname == 'residual_call_irf_f'
+##    assert list(op1.args[2]) == []
+##    assert list(op1.args[3]) == []
+##    assert list(op1.args[4]) == vlist
+##    assert op1.result == v_result
diff --git a/pypy/jit/codewriter/test/test_policy.py b/pypy/jit/codewriter/test/test_policy.py
--- a/pypy/jit/codewriter/test/test_policy.py
+++ b/pypy/jit/codewriter/test/test_policy.py
@@ -12,24 +12,30 @@
     graph = support.getgraph(f, [5])
     for sf in [False, True]:
         for sll in [False, True]:
-            assert not contains_unsupported_variable_type(graph, sf, sll)
+            for ssf in [False, True]:
+                assert not contains_unsupported_variable_type(graph, sf,
+                                                              sll, ssf)
     #
     graph = support.getgraph(f, [5.5])
     for sf in [False, True]:
         for sll in [False, True]:
-            res = contains_unsupported_variable_type(graph, sf, sll)
-            assert res is not sf
+            for ssf in [False, True]:
+                res = contains_unsupported_variable_type(graph, sf, sll, ssf)
+                assert res is not sf
     #
     graph = support.getgraph(f, [r_singlefloat(5.5)])
     for sf in [False, True]:
         for sll in [False, True]:
-            assert contains_unsupported_variable_type(graph, sf, sll)
+            for ssf in [False, True]:
+                res = contains_unsupported_variable_type(graph, sf, sll, ssf)
+                assert res == (not ssf)
     #
     graph = support.getgraph(f, [r_longlong(5)])
     for sf in [False, True]:
         for sll in [False, True]:
-            res = contains_unsupported_variable_type(graph, sf, sll)
-            assert res == (sys.maxint == 2147483647 and not sll)
+            for ssf in [False, True]:
+                res = contains_unsupported_variable_type(graph, sf, sll, ssf)
+                assert res == (sys.maxint == 2147483647 and not sll)
 
 
 def test_regular_function():
diff --git a/pypy/jit/metainterp/blackhole.py b/pypy/jit/metainterp/blackhole.py
--- a/pypy/jit/metainterp/blackhole.py
+++ b/pypy/jit/metainterp/blackhole.py
@@ -500,6 +500,9 @@
     @arguments("r", returns="i")
     def bhimpl_ptr_nonzero(a):
         return bool(a)
+    @arguments("r", returns="r")
+    def bhimpl_cast_opaque_ptr(a):
+        return a
 
     @arguments("i", returns="i")
     def bhimpl_int_copy(a):
@@ -623,6 +626,19 @@
         x = float(a)
         return longlong.getfloatstorage(x)
 
+    @arguments("f", returns="i")
+    def bhimpl_cast_float_to_singlefloat(a):
+        from pypy.rlib.rarithmetic import r_singlefloat
+        a = longlong.getrealfloat(a)
+        a = r_singlefloat(a)
+        return longlong.singlefloat2int(a)
+
+    @arguments("i", returns="f")
+    def bhimpl_cast_singlefloat_to_float(a):
+        a = longlong.int2singlefloat(a)
+        a = float(a)
+        return longlong.getfloatstorage(a)
+
     # ----------
     # control flow operations
 
diff --git a/pypy/jit/metainterp/compile.py b/pypy/jit/metainterp/compile.py
--- a/pypy/jit/metainterp/compile.py
+++ b/pypy/jit/metainterp/compile.py
@@ -137,6 +137,10 @@
             jitdriver_sd.warmstate.attach_unoptimized_bridge_from_interp(
                 greenkey, loop.preamble.token)
             record_loop_or_bridge(metainterp_sd, loop.preamble)
+        elif token.short_preamble:
+            short = token.short_preamble[-1]
+            metainterp_sd.logger_ops.log_short_preamble(short.inputargs,
+                                                        short.operations)
         return token
     else:
         send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop,
@@ -637,6 +641,7 @@
         debug_print("compile_new_bridge: got an InvalidLoop")
         # XXX I am fairly convinced that optimize_bridge cannot actually raise
         # InvalidLoop
+        debug_print('InvalidLoop in compile_new_bridge')
         return None
     # Did it work?
     if target_loop_token is not None:
@@ -668,10 +673,9 @@
     def handle_fail(self, metainterp_sd, jitdriver_sd):
         cpu = metainterp_sd.cpu
         exception = cpu.grab_exc_value()
+        assert exception, "PropagateExceptionDescr: no exception??"
         raise metainterp_sd.ExitFrameWithExceptionRef(cpu, exception)
 
-propagate_exception_descr = PropagateExceptionDescr()
-
 def compile_tmp_callback(cpu, jitdriver_sd, greenboxes, redboxes,
                          memory_manager=None):
     """Make a LoopToken that corresponds to assembler code that just
@@ -705,7 +709,7 @@
         finishargs = []
     #
     jd = jitdriver_sd
-    faildescr = propagate_exception_descr
+    faildescr = PropagateExceptionDescr()
     operations = [
         ResOperation(rop.CALL, callargs, result, descr=jd.portal_calldescr),
         ResOperation(rop.GUARD_NO_EXCEPTION, [], None, descr=faildescr),
diff --git a/pypy/jit/metainterp/executor.py b/pypy/jit/metainterp/executor.py
--- a/pypy/jit/metainterp/executor.py
+++ b/pypy/jit/metainterp/executor.py
@@ -50,7 +50,7 @@
     func = argboxes[0].getint()
     # do the call using the correct function from the cpu
     rettype = descr.get_return_type()
-    if rettype == INT:
+    if rettype == INT or rettype == 'S':       # *S*ingle float
         try:
             result = cpu.bh_call_i(func, descr, args_i, args_r, args_f)
         except Exception, e:
@@ -64,7 +64,7 @@
             metainterp.execute_raised(e)
             result = NULL
         return BoxPtr(result)
-    if rettype == FLOAT or rettype == 'L':
+    if rettype == FLOAT or rettype == 'L':     # *L*ong long
         try:
             result = cpu.bh_call_f(func, descr, args_i, args_r, args_f)
         except Exception, e:
diff --git a/pypy/jit/metainterp/history.py b/pypy/jit/metainterp/history.py
--- a/pypy/jit/metainterp/history.py
+++ b/pypy/jit/metainterp/history.py
@@ -20,12 +20,16 @@
 
 FAILARGS_LIMIT = 1000
 
-def getkind(TYPE, supports_floats=True, supports_longlong=True):
+def getkind(TYPE, supports_floats=True,
+                  supports_longlong=True,
+                  supports_singlefloats=True):
     if TYPE is lltype.Void:
         return "void"
     elif isinstance(TYPE, lltype.Primitive):
         if TYPE is lltype.Float and supports_floats:
             return 'float'
+        if TYPE is lltype.SingleFloat and supports_singlefloats:
+            return 'int'     # singlefloats are stored in an int
         if TYPE in (lltype.Float, lltype.SingleFloat):
             raise NotImplementedError("type %s not supported" % TYPE)
         # XXX fix this for oo...
@@ -145,6 +149,7 @@
         """ Implement in call descr.
         Must return INT, REF, FLOAT, or 'v' for void.
         On 32-bit (hack) it can also be 'L' for longlongs.
+        Additionally it can be 'S' for singlefloats.
         """
         raise NotImplementedError
 
diff --git a/pypy/jit/metainterp/optimize.py b/pypy/jit/metainterp/optimize.py
--- a/pypy/jit/metainterp/optimize.py
+++ b/pypy/jit/metainterp/optimize.py
@@ -1,4 +1,4 @@
-from pypy.rlib.debug import debug_start, debug_stop
+from pypy.rlib.debug import debug_start, debug_stop, debug_print
 from pypy.jit.metainterp.jitexc import JitException
 
 class InvalidLoop(JitException):
diff --git a/pypy/jit/metainterp/optimizeopt/__init__.py b/pypy/jit/metainterp/optimizeopt/__init__.py
--- a/pypy/jit/metainterp/optimizeopt/__init__.py
+++ b/pypy/jit/metainterp/optimizeopt/__init__.py
@@ -33,10 +33,6 @@
         if name in enable_opts:
             if opt is not None:
                 o = opt()
-                if unroll and name == 'string':
-                    o.enabled = False
-                # FIXME: Workaround to disable string optimisation
-                # during preamble but to keep it during the loop
                 optimizations.append(o)
             elif name == 'ffi' and config.translation.jit_ffi:
                 # we cannot put the class directly in the unrolling_iterable,
@@ -55,7 +51,7 @@
 
 
 def optimize_loop_1(metainterp_sd, loop, enable_opts,
-                    inline_short_preamble=True, retraced=False):
+                    inline_short_preamble=True, retraced=False, bridge=False):
     """Optimize loop.operations to remove internal overheadish operations.
     """
 
@@ -64,7 +60,7 @@
     if unroll:
         optimize_unroll(metainterp_sd, loop, optimizations)
     else:
-        optimizer = Optimizer(metainterp_sd, loop, optimizations)
+        optimizer = Optimizer(metainterp_sd, loop, optimizations, bridge)
         optimizer.propagate_all_forward()
 
 def optimize_bridge_1(metainterp_sd, bridge, enable_opts,
@@ -76,7 +72,7 @@
     except KeyError:
         pass
     optimize_loop_1(metainterp_sd, bridge, enable_opts,
-                    inline_short_preamble, retraced)
+                    inline_short_preamble, retraced, bridge=True)
 
 if __name__ == '__main__':
     print ALL_OPTS_NAMES
diff --git a/pypy/jit/metainterp/optimizeopt/fficall.py b/pypy/jit/metainterp/optimizeopt/fficall.py
--- a/pypy/jit/metainterp/optimizeopt/fficall.py
+++ b/pypy/jit/metainterp/optimizeopt/fficall.py
@@ -1,12 +1,11 @@
 from pypy.rpython.annlowlevel import cast_base_ptr_to_instance
 from pypy.rlib.objectmodel import we_are_translated
 from pypy.rlib.libffi import Func
-from pypy.rlib.debug import debug_start, debug_stop, debug_print, have_debug_prints
+from pypy.rlib.debug import debug_print
 from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
 from pypy.jit.metainterp.optimizeopt.optimizer import Optimization
-from pypy.jit.backend.llsupport.ffisupport import UnsupportedKind
 
 
 class FuncInfo(object):
@@ -19,28 +18,27 @@
     def __init__(self, funcval, cpu, prepare_op):
         self.funcval = funcval
         self.opargs = []
-        argtypes, restype = self._get_signature(funcval)
-        try:
-            self.descr = cpu.calldescrof_dynamic(argtypes, restype)
-        except UnsupportedKind:
-            # e.g., I or U for long longs
-            self.descr = None
+        argtypes, restype, flags = self._get_signature(funcval)
+        self.descr = cpu.calldescrof_dynamic(argtypes, restype,
+                                             EffectInfo.MOST_GENERAL,
+                                             ffi_flags=flags)
+        # ^^^ may be None if unsupported
         self.prepare_op = prepare_op
         self.delayed_ops = []
 
     def _get_signature(self, funcval):
         """
-        given the funcval, return a tuple (argtypes, restype), where the
-        actuall types are libffi.types.*
+        given the funcval, return a tuple (argtypes, restype, flags), where
+        the actuall types are libffi.types.*
 
         The implementation is tricky because we have three possible cases:
 
         - translated: the easiest case, we can just cast back the pointer to
-          the original Func instance and read .argtypes and .restype
+          the original Func instance and read .argtypes, .restype and .flags
 
         - completely untranslated: this is what we get from test_optimizeopt
           tests. funcval contains a FakeLLObject whose _fake_class is Func,
-          and we can just get .argtypes and .restype
+          and we can just get .argtypes, .restype and .flags
 
         - partially translated: this happens when running metainterp tests:
           funcval contains the low-level equivalent of a Func, and thus we
@@ -48,14 +46,14 @@
           inst_argtypes is actually a low-level array, but we can use it
           directly since the only thing we do with it is to read its items
         """
-        
+
         llfunc = funcval.box.getref_base()
         if we_are_translated():
             func = cast_base_ptr_to_instance(Func, llfunc)
-            return func.argtypes, func.restype
+            return func.argtypes, func.restype, func.flags
         elif getattr(llfunc, '_fake_class', None) is Func:
             # untranslated
-            return llfunc.argtypes, llfunc.restype
+            return llfunc.argtypes, llfunc.restype, llfunc.flags
         else:
             # partially translated
             # llfunc contains an opaque pointer to something like the following:
@@ -66,7 +64,7 @@
             # because we don't have the exact TYPE to cast to.  Instead, we
             # just fish it manually :-(
             f = llfunc._obj.container
-            return f.inst_argtypes, f.inst_restype
+            return f.inst_argtypes, f.inst_restype, f.inst_flags
 
 
 class OptFfiCall(Optimization):
@@ -78,18 +76,9 @@
         else:
             self.logops = None
 
-    def propagate_begin_forward(self):
-        debug_start('jit-log-ffiopt')
-        Optimization.propagate_begin_forward(self)
-
-    def propagate_end_forward(self):
-        debug_stop('jit-log-ffiopt')
-        Optimization.propagate_end_forward(self)
-
-    def reconstruct_for_next_iteration(self, optimizer, valuemap):
+    def new(self):
         return OptFfiCall()
-        # FIXME: Should any status be saved for next iteration?
-
+    
     def begin_optimization(self, funcval, op):
         self.rollback_maybe('begin_optimization', op)
         self.funcinfo = FuncInfo(funcval, self.optimizer.cpu, op)
@@ -184,7 +173,8 @@
     def do_call(self, op):
         funcval = self._get_funcval(op)
         funcinfo = self.funcinfo
-        if not funcinfo or funcinfo.funcval is not funcval:
+        if (not funcinfo or funcinfo.funcval is not funcval or
+            funcinfo.descr is None):
             return [op] # cannot optimize
         funcsymval = self.getvalue(op.getarg(2))
         arglist = [funcsymval.force_box()]
@@ -207,9 +197,7 @@
 
     def _get_oopspec(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            return effectinfo.oopspecindex
-        return EffectInfo.OS_NONE
+        return effectinfo.oopspecindex
 
     def _get_funcval(self, op):
         return self.getvalue(op.getarg(1))
diff --git a/pypy/jit/metainterp/optimizeopt/generalize.py b/pypy/jit/metainterp/optimizeopt/generalize.py
new file mode 100644
--- /dev/null
+++ b/pypy/jit/metainterp/optimizeopt/generalize.py
@@ -0,0 +1,19 @@
+from pypy.jit.metainterp.optimizeopt.optimizer import MININT, MAXINT
+
+class GeneralizationStrategy(object):
+    def __init__(self, optimizer):
+        self.optimizer = optimizer
+
+    def apply(self):
+        raise NotImplementedError
+
+class KillHugeIntBounds(GeneralizationStrategy):
+    def apply(self):
+        for v in self.optimizer.values.values():
+            if v.is_constant():
+                continue
+            if v.intbound.lower < MININT/2:
+                v.intbound.lower = MININT
+            if v.intbound.upper > MAXINT/2:
+                v.intbound.upper = MAXINT
+          
diff --git a/pypy/jit/metainterp/optimizeopt/heap.py b/pypy/jit/metainterp/optimizeopt/heap.py
--- a/pypy/jit/metainterp/optimizeopt/heap.py
+++ b/pypy/jit/metainterp/optimizeopt/heap.py
@@ -1,9 +1,11 @@
 import os
+
+from pypy.jit.metainterp.jitexc import JitException
+from pypy.jit.metainterp.optimizeopt.optimizer import Optimization, MODE_ARRAY
+from pypy.jit.metainterp.history import ConstInt, Const
 from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.rlib.objectmodel import we_are_translated
-from pypy.jit.metainterp.jitexc import JitException
-from pypy.jit.metainterp.optimizeopt.optimizer import Optimization
 
 
 class CachedField(object):
@@ -23,6 +25,7 @@
         #      'cached_fields'.
         #
         self._cached_fields = {}
+        self._cached_fields_getfield_op = {}
         self._lazy_setfield = None
         self._lazy_setfield_registered = False
 
@@ -34,6 +37,12 @@
             self.force_lazy_setfield(optheap)
             assert not self.possible_aliasing(optheap, structvalue)
         cached_fieldvalue = self._cached_fields.get(structvalue, None)
+
+        # Hack to ensure constants are imported from the preamble
+        if cached_fieldvalue and fieldvalue.is_constant(): 
+            optheap.optimizer.ensure_imported(cached_fieldvalue)
+            cached_fieldvalue = self._cached_fields.get(structvalue, None)
+
         if cached_fieldvalue is not fieldvalue:
             # common case: store the 'op' as lazy_setfield, and register
             # myself in the optheap's _lazy_setfields_and_arrayitems list
@@ -69,9 +78,10 @@
         else:
             return self._cached_fields.get(structvalue, None)
 
-    def remember_field_value(self, structvalue, fieldvalue):
+    def remember_field_value(self, structvalue, fieldvalue, getfield_op=None):
         assert self._lazy_setfield is None
         self._cached_fields[structvalue] = fieldvalue
+        self._cached_fields_getfield_op[structvalue] = getfield_op
 
     def force_lazy_setfield(self, optheap, can_cache=True):
         op = self._lazy_setfield
@@ -80,7 +90,7 @@
             # Now we clear _cached_fields, because actually doing the
             # setfield might impact any of the stored result (because of
             # possible aliasing).
-            self._cached_fields.clear()
+            self.clear()
             self._lazy_setfield = None
             optheap.next_optimization.propagate_forward(op)
             if not can_cache:
@@ -90,19 +100,47 @@
             # field.
             structvalue = optheap.getvalue(op.getarg(0))
             fieldvalue  = optheap.getvalue(op.getarglist()[-1])
-            self.remember_field_value(structvalue, fieldvalue)
+            self.remember_field_value(structvalue, fieldvalue, op)
         elif not can_cache:
-            self._cached_fields.clear()
+            self.clear()
 
-    def get_reconstructed(self, optimizer, valuemap):
-        assert self._lazy_setfield is None
-        cf = CachedField()
-        for structvalue, fieldvalue in self._cached_fields.iteritems():
-            structvalue2 = structvalue.get_reconstructed(optimizer, valuemap)
-            fieldvalue2  = fieldvalue .get_reconstructed(optimizer, valuemap)
-            cf._cached_fields[structvalue2] = fieldvalue2
-        return cf
+    def clear(self):
+        self._cached_fields.clear()
+        self._cached_fields_getfield_op.clear()
 
+    def turned_constant(self, newvalue, value):
+        if newvalue not in self._cached_fields and value in self._cached_fields:
+            self._cached_fields[newvalue] = self._cached_fields[value]
+            op = self._cached_fields_getfield_op[value].clone()
+            constbox = value.box
+            assert isinstance(constbox, Const)
+            op.setarg(0, constbox)
+            self._cached_fields_getfield_op[newvalue] = op
+        for structvalue in self._cached_fields.keys():
+            if self._cached_fields[structvalue] is value:
+                self._cached_fields[structvalue] = newvalue
+
+    def produce_potential_short_preamble_ops(self, optimizer, shortboxes, descr):
+        if self._lazy_setfield is not None:
+            return
+        for structvalue in self._cached_fields_getfield_op.keys():
+            op = self._cached_fields_getfield_op[structvalue]
+            if not op:
+                continue
+            if optimizer.getvalue(op.getarg(0)) in optimizer.opaque_pointers:
+                continue
+            if structvalue in self._cached_fields:
+                if op.getopnum() == rop.SETFIELD_GC:
+                    result = op.getarg(1)
+                    if isinstance(result, Const):
+                        newresult = result.clonebox()
+                        optimizer.make_constant(newresult, result)
+                        result = newresult
+                    getop = ResOperation(rop.GETFIELD_GC, [op.getarg(0)],
+                                         result, op.getdescr())
+                    shortboxes.add_potential(getop, synthetic=True)
+                elif op.result is not None:
+                    shortboxes.add_potential(op)
 
 class BogusPureField(JitException):
     pass
@@ -121,24 +159,32 @@
         self._remove_guard_not_invalidated = False
         self._seen_guard_not_invalidated = False
 
-    def reconstruct_for_next_iteration(self, optimizer, valuemap):
-        new = OptHeap()
+    def force_at_end_of_preamble(self):
+        self.force_all_lazy_setfields_and_arrayitems()
 
-        if True:
-            self.force_all_lazy_setfields_and_arrayitems()
-        else:
-            assert 0   # was: new.lazy_setfields = self.lazy_setfields
+    def flush(self):
+        self.force_all_lazy_setfields_and_arrayitems()
 
-        for descr, d in self.cached_fields.items():
-            new.cached_fields[descr] = d.get_reconstructed(optimizer, valuemap)
+    def new(self):
+        return OptHeap()
+
+    def produce_potential_short_preamble_ops(self, sb):
+        descrkeys = self.cached_fields.keys()
+        if not we_are_translated():
+            # XXX Pure operation of boxes that are cached in several places will
+            #     only be removed from the peeled loop when red from the first
+            #     place discovered here. This is far from ideal, as it makes
+            #     the effectiveness of our optimization a bit random. It should
+            #     howevere always generate correct results. For tests we dont
+            #     want this randomness.
+            descrkeys.sort(key=str, reverse=True)
+        for descr in descrkeys:
+            d = self.cached_fields[descr]
+            d.produce_potential_short_preamble_ops(self.optimizer, sb, descr)
 
         for descr, submap in self.cached_arrayitems.items():
-            newdict = {}
             for index, d in submap.items():
-                newdict[index] = d.get_reconstructed(optimizer, valuemap)
-            new.cached_arrayitems[descr] = newdict
-
-        return new
+                d.produce_potential_short_preamble_ops(self.optimizer, sb, descr)
 
     def clean_caches(self):
         del self._lazy_setfields_and_arrayitems[:]
@@ -193,43 +239,43 @@
             opnum == rop.CALL_RELEASE_GIL or
             opnum == rop.CALL_ASSEMBLER):
             if opnum == rop.CALL_ASSEMBLER:
-                effectinfo = None
+                self._seen_guard_not_invalidated = False
             else:
                 effectinfo = op.getdescr().get_extra_info()
-            if effectinfo is None or effectinfo.check_can_invalidate():
-                self._seen_guard_not_invalidated = False
-            if effectinfo is not None and not effectinfo.has_random_effects():
-                # XXX we can get the wrong complexity here, if the lists
-                # XXX stored on effectinfo are large
-                for fielddescr in effectinfo.readonly_descrs_fields:
-                    self.force_lazy_setfield(fielddescr)
-                for arraydescr in effectinfo.readonly_descrs_arrays:
-                    self.force_lazy_setarrayitem(arraydescr)
-                for fielddescr in effectinfo.write_descrs_fields:
-                    self.force_lazy_setfield(fielddescr, can_cache=False)
-                for arraydescr in effectinfo.write_descrs_arrays:
-                    self.force_lazy_setarrayitem(arraydescr, can_cache=False)
-                if effectinfo.check_forces_virtual_or_virtualizable():
-                    vrefinfo = self.optimizer.metainterp_sd.virtualref_info
-                    self.force_lazy_setfield(vrefinfo.descr_forced)
-                    # ^^^ we only need to force this field; the other fields
-                    # of virtualref_info and virtualizable_info are not gcptrs.
-                return
+                if effectinfo.check_can_invalidate():
+                    self._seen_guard_not_invalidated = False
+                if not effectinfo.has_random_effects():
+                    self.force_from_effectinfo(effectinfo)
+                    return
         self.force_all_lazy_setfields_and_arrayitems()
         self.clean_caches()
 
+    def force_from_effectinfo(self, effectinfo):
+        # XXX we can get the wrong complexity here, if the lists
+        # XXX stored on effectinfo are large
+        for fielddescr in effectinfo.readonly_descrs_fields:
+            self.force_lazy_setfield(fielddescr)
+        for arraydescr in effectinfo.readonly_descrs_arrays:
+            self.force_lazy_setarrayitem(arraydescr)
+        for fielddescr in effectinfo.write_descrs_fields:
+            self.force_lazy_setfield(fielddescr, can_cache=False)
+        for arraydescr in effectinfo.write_descrs_arrays:
+            self.force_lazy_setarrayitem(arraydescr, can_cache=False)
+        if effectinfo.check_forces_virtual_or_virtualizable():
+            vrefinfo = self.optimizer.metainterp_sd.virtualref_info
+            self.force_lazy_setfield(vrefinfo.descr_forced)
+            # ^^^ we only need to force this field; the other fields
+            # of virtualref_info and virtualizable_info are not gcptrs.
 
     def turned_constant(self, value):
         assert value.is_constant()
         newvalue = self.getvalue(value.box)
         if value is not newvalue:
             for cf in self.cached_fields.itervalues():
-                if value in cf._cached_fields:
-                    cf._cached_fields[newvalue] = cf._cached_fields[value]
+                cf.turned_constant(newvalue, value)
             for submap in self.cached_arrayitems.itervalues():
                 for cf in submap.itervalues():
-                    if value in cf._cached_fields:
-                        cf._cached_fields[newvalue] = cf._cached_fields[value]
+                    cf.turned_constant(newvalue, value)
 
     def force_lazy_setfield(self, descr, can_cache=True):
         try:
@@ -238,13 +284,14 @@
             return
         cf.force_lazy_setfield(self, can_cache)
 
-    def force_lazy_setarrayitem(self, arraydescr, can_cache=True):
+    def force_lazy_setarrayitem(self, arraydescr, indexvalue=None, can_cache=True):
         try:
             submap = self.cached_arrayitems[arraydescr]
         except KeyError:
             return
-        for cf in submap.values():
-            cf.force_lazy_setfield(self, can_cache)
+        for idx, cf in submap.iteritems():
+            if indexvalue is None or indexvalue.intbound.contains(idx):
+                cf.force_lazy_setfield(self, can_cache)
 
     def fixup_guard_situation(self):
         # hackish: reverse the order of the last two operations if it makes
@@ -331,7 +378,7 @@
         self.emit_operation(op)
         # then remember the result of reading the field
         fieldvalue = self.getvalue(op.result)
-        cf.remember_field_value(structvalue, fieldvalue)
+        cf.remember_field_value(structvalue, fieldvalue, op)
 
     def optimize_SETFIELD_GC(self, op):
         if self.has_pure_result(rop.GETFIELD_GC_PURE, [op.getarg(0)],
@@ -348,6 +395,7 @@
         indexvalue = self.getvalue(op.getarg(1))
         cf = None
         if indexvalue.is_constant():
+            arrayvalue.make_len_gt(MODE_ARRAY, op.getdescr(), indexvalue.box.getint())
             # use the cache on (arraydescr, index), which is a constant
             cf = self.arrayitem_cache(op.getdescr(), indexvalue.box.getint())
             fieldvalue = cf.getfield_from_cache(self, arrayvalue)
@@ -356,14 +404,14 @@
                 return
         else:
             # variable index, so make sure the lazy setarrayitems are done
-            self.force_lazy_setarrayitem(op.getdescr())
+            self.force_lazy_setarrayitem(op.getdescr(), indexvalue=indexvalue)
         # default case: produce the operation
         arrayvalue.ensure_nonnull()
         self.emit_operation(op)
         # the remember the result of reading the array item
         if cf is not None:
             fieldvalue = self.getvalue(op.result)
-            cf.remember_field_value(arrayvalue, fieldvalue)
+            cf.remember_field_value(arrayvalue, fieldvalue, op)
 
     def optimize_SETARRAYITEM_GC(self, op):
         if self.has_pure_result(rop.GETARRAYITEM_GC_PURE, [op.getarg(0),
@@ -375,12 +423,14 @@
         #
         indexvalue = self.getvalue(op.getarg(1))
         if indexvalue.is_constant():
+            arrayvalue = self.getvalue(op.getarg(0))
+            arrayvalue.make_len_gt(MODE_ARRAY, op.getdescr(), indexvalue.box.getint())
             # use the cache on (arraydescr, index), which is a constant
             cf = self.arrayitem_cache(op.getdescr(), indexvalue.box.getint())
             cf.do_setfield(self, op)
         else:
             # variable index, so make sure the lazy setarrayitems are done
-            self.force_lazy_setarrayitem(op.getdescr(), can_cache=False)
+            self.force_lazy_setarrayitem(op.getdescr(), indexvalue=indexvalue, can_cache=False)
             # and then emit the operation
             self.emit_operation(op)
 
diff --git a/pypy/jit/metainterp/optimizeopt/intbounds.py b/pypy/jit/metainterp/optimizeopt/intbounds.py
--- a/pypy/jit/metainterp/optimizeopt/intbounds.py
+++ b/pypy/jit/metainterp/optimizeopt/intbounds.py
@@ -1,9 +1,11 @@
-from pypy.jit.metainterp.optimizeopt.optimizer import Optimization, CONST_1, CONST_0
+from pypy.jit.metainterp.optimizeopt.optimizer import Optimization, CONST_1, CONST_0, \
+                                                  MODE_ARRAY, MODE_STR, MODE_UNICODE
+from pypy.jit.metainterp.history import ConstInt
+from pypy.jit.metainterp.optimizeopt.intutils import (IntBound, IntLowerBound,
+    IntUpperBound)
 from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
-from pypy.jit.metainterp.optimizeopt.intutils import (IntBound, IntUnbounded,
-    IntLowerBound, IntUpperBound)
-from pypy.jit.metainterp.history import Const, ConstInt
-from pypy.jit.metainterp.resoperation import rop, ResOperation
+from pypy.jit.metainterp.resoperation import rop
+
 
 class OptIntBounds(Optimization):
     """Keeps track of the bounds placed on integers by guards and remove
@@ -13,18 +15,17 @@
         self.posponedop = None
         self.nextop = None
 
-    def reconstruct_for_next_iteration(self, optimizer, valuemap):
+    def new(self):
         assert self.posponedop is None
-        return self
+        return OptIntBounds()
+        
+    def flush(self):
+        assert self.posponedop is None
 
     def setup(self):
         self.posponedop = None
         self.nextop = None
 
-    def reconstruct_for_next_iteration(self, optimizer, valuemap):
-        assert self.posponedop is None
-        return self
-
     def propagate_forward(self, op):
         if op.is_ovf():
             self.posponedop = op
@@ -124,6 +125,17 @@
         r = self.getvalue(op.result)
         r.intbound.intersect(v1.intbound.div_bound(v2.intbound))
 
+    def optimize_INT_MOD(self, op):
+        self.emit_operation(op)
+        v2 = self.getvalue(op.getarg(1))
+        if v2.is_constant():
+            val = v2.box.getint()
+            r = self.getvalue(op.result)
+            if val < 0:
+                val = -val
+            r.intbound.make_gt(IntBound(-val, -val))
+            r.intbound.make_lt(IntBound(val, val))
+
     def optimize_INT_LSHIFT(self, op):
         v1 = self.getvalue(op.getarg(0))
         v2 = self.getvalue(op.getarg(1))
@@ -274,10 +286,27 @@
 
     def optimize_ARRAYLEN_GC(self, op):
         self.emit_operation(op)
-        v1 = self.getvalue(op.result)
-        v1.intbound.make_ge(IntLowerBound(0))
+        array  = self.getvalue(op.getarg(0))
+        result = self.getvalue(op.result)
+        array.make_len_gt(MODE_ARRAY, op.getdescr(), -1)
+        array.lenbound.bound.intersect(result.intbound)
+        result.intbound = array.lenbound.bound
 
-    optimize_STRLEN = optimize_UNICODELEN = optimize_ARRAYLEN_GC
+    def optimize_STRLEN(self, op):
+        self.emit_operation(op)
+        array  = self.getvalue(op.getarg(0))
+        result = self.getvalue(op.result)
+        array.make_len_gt(MODE_STR, op.getdescr(), -1)
+        array.lenbound.bound.intersect(result.intbound)
+        result.intbound = array.lenbound.bound
+
+    def optimize_UNICODELEN(self, op):
+        self.emit_operation(op)
+        array  = self.getvalue(op.getarg(0))
+        result = self.getvalue(op.result)
+        array.make_len_gt(MODE_UNICODE, op.getdescr(), -1)
+        array.lenbound.bound.intersect(result.intbound)
+        result.intbound = array.lenbound.bound
 
     def optimize_STRGETITEM(self, op):
         self.emit_operation(op)
diff --git a/pypy/jit/metainterp/optimizeopt/intutils.py b/pypy/jit/metainterp/optimizeopt/intutils.py
--- a/pypy/jit/metainterp/optimizeopt/intutils.py
+++ b/pypy/jit/metainterp/optimizeopt/intutils.py
@@ -1,4 +1,9 @@
 from pypy.rlib.rarithmetic import ovfcheck, ovfcheck_lshift, LONG_BIT
+from pypy.jit.metainterp.resoperation import rop, ResOperation
+from pypy.jit.metainterp.history import BoxInt, ConstInt
+import sys
+MAXINT = sys.maxint
+MININT = -sys.maxint - 1
 
 class IntBound(object):
     _attrs_ = ('has_upper', 'has_lower', 'upper', 'lower')
@@ -210,11 +215,11 @@
         
     def __repr__(self):
         if self.has_lower:
-            l = '%4d' % self.lower
+            l = '%d' % self.lower
         else:
             l = '-Inf'
         if self.has_upper:
-            u = '%3d' % self.upper
+            u = '%d' % self.upper
         else:
             u = 'Inf'
         return '%s <= x <= %s' % (l, u)
@@ -224,7 +229,24 @@
         res.has_lower = self.has_lower
         res.has_upper = self.has_upper
         return res
+
+    def make_guards(self, box, guards):
+        if self.has_lower and self.lower > MININT:
+            bound = self.lower
+            res = BoxInt()
+            op = ResOperation(rop.INT_GE, [box, ConstInt(bound)], res)
+            guards.append(op)
+            op = ResOperation(rop.GUARD_TRUE, [res], None)
+            guards.append(op)
+        if self.has_upper and self.upper < MAXINT:
+            bound = self.upper
+            res = BoxInt()
+            op = ResOperation(rop.INT_LE, [box, ConstInt(bound)], res)
+            guards.append(op)
+            op = ResOperation(rop.GUARD_TRUE, [res], None)
+            guards.append(op)
     
+
 class IntUpperBound(IntBound):
     def __init__(self, upper):
         self.has_upper = True
@@ -244,7 +266,23 @@
         self.has_upper = False
         self.has_lower = False
         self.upper = 0
-        self.lower = 0        
+        self.lower = 0
+
+class ImmutableIntUnbounded(IntUnbounded):
+    def _raise(self):
+        raise TypeError('ImmutableIntUnbounded is immutable')
+    def make_le(self, other):
+        self._raise()
+    def make_lt(self, other):
+        self._raise()
+    def make_ge(self, other):
+        self._raise()
+    def make_gt(self, other):
+        self._raise()
+    def make_constant(self, value):
+        self._raise()
+    def intersect(self, other):        
+        self._raise()
 
 def min4(t):
     return min(min(t[0], t[1]), min(t[2], t[3]))
diff --git a/pypy/jit/metainterp/optimizeopt/optimizer.py b/pypy/jit/metainterp/optimizeopt/optimizer.py
--- a/pypy/jit/metainterp/optimizeopt/optimizer.py
+++ b/pypy/jit/metainterp/optimizeopt/optimizer.py
@@ -1,70 +1,128 @@
-from pypy.jit.metainterp.history import Box, BoxInt, LoopToken, BoxFloat,\
-     ConstFloat
-from pypy.jit.metainterp.history import Const, ConstInt, ConstPtr, ConstObj, REF
+from pypy.jit.metainterp import jitprof, resume, compile
+from pypy.jit.metainterp.executor import execute_nonspec
+from pypy.jit.metainterp.history import BoxInt, BoxFloat, Const, ConstInt, REF
+from pypy.jit.metainterp.optimizeopt.intutils import IntBound, IntUnbounded, \
+                                                     ImmutableIntUnbounded, \
+                                                     IntLowerBound, MININT, MAXINT
+from pypy.jit.metainterp.optimizeopt.util import (make_dispatcher_method,
+    args_dict)
 from pypy.jit.metainterp.resoperation import rop, ResOperation
-from pypy.jit.metainterp import jitprof
-from pypy.jit.metainterp.executor import execute_nonspec
-from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method, sort_descrs
-from pypy.jit.metainterp.optimizeopt.util import descrlist_dict, args_dict
-from pypy.jit.metainterp.optimize import InvalidLoop
-from pypy.jit.metainterp import resume, compile
 from pypy.jit.metainterp.typesystem import llhelper, oohelper
-from pypy.rpython.lltypesystem import lltype
-from pypy.jit.metainterp.history import AbstractDescr, make_hashable_int
-from pypy.jit.metainterp.optimizeopt.intutils import IntBound, IntUnbounded
 from pypy.tool.pairtype import extendabletype
+from pypy.rlib.debug import debug_start, debug_stop, debug_print
+from pypy.rlib.objectmodel import specialize
 
 LEVEL_UNKNOWN    = '\x00'
 LEVEL_NONNULL    = '\x01'
 LEVEL_KNOWNCLASS = '\x02'     # might also mean KNOWNARRAYDESCR, for arrays
 LEVEL_CONSTANT   = '\x03'
 
-import sys
-MAXINT = sys.maxint
-MININT = -sys.maxint - 1
+MODE_ARRAY   = '\x00'
+MODE_STR     = '\x01'
+MODE_UNICODE = '\x02'
+class LenBound(object):
+    def __init__(self, mode, descr, bound):
+        self.mode = mode
+        self.descr = descr
+        self.bound = bound
+
+    def clone(self):
+        return LenBound(self.mode, self.descr, self.bound.clone())
 
 class OptValue(object):
     __metaclass__ = extendabletype
-    _attrs_ = ('box', 'known_class', 'last_guard_index', 'level', 'intbound')
+    _attrs_ = ('box', 'known_class', 'last_guard_index', 'level', 'intbound', 'lenbound')
     last_guard_index = -1
 
     level = LEVEL_UNKNOWN
     known_class = None
-    intbound = None
+    intbound = ImmutableIntUnbounded()
+    lenbound = None
 
-    def __init__(self, box):
+    def __init__(self, box, level=None, known_class=None, intbound=None):
         self.box = box
-        self.intbound = IntBound(MININT, MAXINT) #IntUnbounded()
+        if level is not None:
+            self.level = level
+        self.known_class = known_class
+        if intbound:
+            self.intbound = intbound
+        else:
+            if isinstance(box, BoxInt):
+                self.intbound = IntBound(MININT, MAXINT)
+            else:
+                self.intbound = IntUnbounded()
+
         if isinstance(box, Const):
             self.make_constant(box)
         # invariant: box is a Const if and only if level == LEVEL_CONSTANT
 
+    def make_len_gt(self, mode, descr, val):
+        if self.lenbound:
+            assert self.lenbound.mode == mode
+            assert self.lenbound.descr == descr
+            self.lenbound.bound.make_gt(IntBound(val, val))
+        else:
+            self.lenbound = LenBound(mode, descr, IntLowerBound(val + 1))
+
+    def make_guards(self, box):
+        guards = []
+        if self.level == LEVEL_CONSTANT:
+            op = ResOperation(rop.GUARD_VALUE, [box, self.box], None)
+            guards.append(op)
+        elif self.level == LEVEL_KNOWNCLASS:
+            op = ResOperation(rop.GUARD_NONNULL, [box], None)
+            guards.append(op)            
+            op = ResOperation(rop.GUARD_CLASS, [box, self.known_class], None)
+            guards.append(op)
+        else:
+            if self.level == LEVEL_NONNULL:
+                op = ResOperation(rop.GUARD_NONNULL, [box], None)
+                guards.append(op)
+            self.intbound.make_guards(box, guards)
+            if self.lenbound:
+                lenbox = BoxInt()
+                if self.lenbound.mode == MODE_ARRAY:
+                    op = ResOperation(rop.ARRAYLEN_GC, [box], lenbox, self.lenbound.descr)
+                elif self.lenbound.mode == MODE_STR:
+                    op = ResOperation(rop.STRLEN, [box], lenbox, self.lenbound.descr)
+                elif self.lenbound.mode == MODE_UNICODE:
+                    op = ResOperation(rop.UNICODELEN, [box], lenbox, self.lenbound.descr)
+                else:
+                    debug_print("Unknown lenbound mode")
+                    assert False
+                guards.append(op)
+                self.lenbound.bound.make_guards(lenbox, guards)
+        return guards
+
+    def import_from(self, other, optimizer):
+        assert self.level <= LEVEL_NONNULL
+        if other.level == LEVEL_CONSTANT:
+            self.make_constant(other.get_key_box())
+            optimizer.turned_constant(self)
+        elif other.level == LEVEL_KNOWNCLASS:
+            self.make_constant_class(other.known_class, -1)
+        else:
+            if other.level == LEVEL_NONNULL:
+                self.ensure_nonnull()
+            self.intbound.intersect(other.intbound)
+            if other.lenbound:
+                if self.lenbound:
+                    assert other.lenbound.mode == self.lenbound.mode
+                    assert other.lenbound.descr == self.lenbound.descr
+                    self.lenbound.bound.intersect(other.lenbound.bound)
+                else:
+                    self.lenbound = other.lenbound.clone()
+                    
+
     def force_box(self):
         return self.box
 
     def get_key_box(self):
         return self.box
 
-    def enum_forced_boxes(self, boxes, already_seen):
-        key = self.get_key_box()
-        if key not in already_seen:
-            boxes.append(self.force_box())
-            already_seen[self.get_key_box()] = None
-
-    def get_reconstructed(self, optimizer, valuemap):
-        if self in valuemap:
-            return valuemap[self]
-        new = self.reconstruct_for_next_iteration(optimizer)
-        valuemap[self] = new
-        self.reconstruct_childs(new, valuemap)
-        return new
-
-    def reconstruct_for_next_iteration(self, optimizer):
+    def force_at_end_of_preamble(self, already_forced):
         return self
 
-    def reconstruct_childs(self, new, valuemap):
-        pass
-
     def get_args_for_fail(self, modifier):
         pass
 
@@ -88,6 +146,7 @@
         assert isinstance(constbox, Const)
         self.box = constbox
         self.level = LEVEL_CONSTANT
+        
         if isinstance(constbox, ConstInt):
             val = constbox.getint()
             self.intbound = IntBound(val, val)
@@ -228,7 +287,9 @@
 
     def pure(self, opnum, args, result):
         op = ResOperation(opnum, args, result)
-        self.optimizer.pure_operations[self.optimizer.make_args_key(op)] = op
+        key = self.optimizer.make_args_key(op)
+        if key not in self.optimizer.pure_operations:
+            self.optimizer.pure_operations[key] = op
 
     def has_pure_result(self, opnum, args, descr):
         op = ResOperation(opnum, args, None, descr)
@@ -241,39 +302,47 @@
     def setup(self):
         pass
 
+    def turned_constant(self, value):
+        pass
+
     def force_at_end_of_preamble(self):
         pass
 
-    def turned_constant(self, value):
+    # It is too late to force stuff here, it must be done in force_at_end_of_preamble
+    def new(self):
+        raise NotImplementedError
+
+    # Called after last operation has been propagated to flush out any posponed ops
+    def flush(self):
         pass
 
-    def reconstruct_for_next_iteration(self, optimizer=None, valuemap=None):
-        #return self.__class__()
-        raise NotImplementedError
-
+    def produce_potential_short_preamble_ops(self, potential_ops):
+        pass
 
 class Optimizer(Optimization):
 
-    def __init__(self, metainterp_sd, loop, optimizations=None):
+    def __init__(self, metainterp_sd, loop, optimizations=None, bridge=False):
         self.metainterp_sd = metainterp_sd
         self.cpu = metainterp_sd.cpu
         self.loop = loop
+        self.bridge = bridge
         self.values = {}
         self.interned_refs = self.cpu.ts.new_ref_dict()
         self.resumedata_memo = resume.ResumeDataLoopMemo(metainterp_sd)
         self.bool_boxes = {}
-        self.loop_invariant_results = {}
         self.pure_operations = args_dict()
         self.producer = {}
         self.pendingfields = []
         self.posponedop = None
         self.exception_might_have_happened = False
         self.quasi_immutable_deps = None
+        self.opaque_pointers = {}
         self.newoperations = []
         if loop is not None:
             self.call_pure_results = loop.call_pure_results
 
         self.set_optimizations(optimizations)
+        self.setup()
 
     def set_optimizations(self, optimizations):
         if optimizations:
@@ -291,39 +360,27 @@
         self.optimizations  = optimizations
 
     def force_at_end_of_preamble(self):
-        self.resumedata_memo = resume.ResumeDataLoopMemo(self.metainterp_sd)
         for o in self.optimizations:
             o.force_at_end_of_preamble()
 
-    def reconstruct_for_next_iteration(self, optimizer=None, valuemap=None):
-        assert optimizer is None
-        assert valuemap is None
-        valuemap = {}
+    def flush(self):
+        for o in self.optimizations:
+            o.flush()
+        assert self.posponedop is None
+
+    def new(self):
         new = Optimizer(self.metainterp_sd, self.loop)
-        optimizations = [o.reconstruct_for_next_iteration(new, valuemap) for o in
-                         self.optimizations]
+        return self._new(new)
+
+    def _new(self, new):
+        assert self.posponedop is None
+        optimizations = [o.new() for o in self.optimizations]
         new.set_optimizations(optimizations)
-
-        new.values = {}
-        for box, value in self.values.items():
-            new.values[box] = value.get_reconstructed(new, valuemap)
-        new.interned_refs = self.interned_refs
-        new.bool_boxes = {}
-        for value in new.bool_boxes.keys():
-            new.bool_boxes[value.get_reconstructed(new, valuemap)] = None
-
-        # FIXME: Move to rewrite.py
-        new.loop_invariant_results = {}
-        for key, value in self.loop_invariant_results.items():
-            new.loop_invariant_results[key] = \
-                                 value.get_reconstructed(new, valuemap)
-
-        new.pure_operations = self.pure_operations
-        new.producer = self.producer
-        assert self.posponedop is None
         new.quasi_immutable_deps = self.quasi_immutable_deps
-
         return new
+        
+    def produce_potential_short_preamble_ops(self, sb):
+        raise NotImplementedError('This is implemented in unroll.UnrollableOptimizer')
 
     def turned_constant(self, value):
         for o in self.optimizations:
@@ -345,19 +402,26 @@
         else:
             return box
 
+    @specialize.argtype(0)
     def getvalue(self, box):
         box = self.getinterned(box)
         try:
             value = self.values[box]
         except KeyError:
             value = self.values[box] = OptValue(box)
+        self.ensure_imported(value)
         return value
 
+    def ensure_imported(self, value):
+        pass
+
+    @specialize.argtype(0)
     def get_constant_box(self, box):
         if isinstance(box, Const):
             return box
         try:
             value = self.values[box]
+            self.ensure_imported(value)
         except KeyError:
             return None
         if value.is_constant():
@@ -413,9 +477,7 @@
             return CVAL_ZERO
 
     def propagate_all_forward(self):
-        self.exception_might_have_happened = True
-        # ^^^ at least at the start of bridges.  For loops, we could set
-        # it to False, but we probably don't care
+        self.exception_might_have_happened = self.bridge
         self.newoperations = []
         self.first_optimization.propagate_begin_forward()
         self.i = 0
@@ -440,23 +502,27 @@
         return True
 
     def emit_operation(self, op):
-        ###self.heap_op_optimizer.emitting_operation(op)
+        if op.returns_bool_result():
+            self.bool_boxes[self.getvalue(op.result)] = None
         self._emit_operation(op)
-
-    def _emit_operation(self, op):
+        
+    @specialize.argtype(0)
+    def _emit_operation(self, op):        
         for i in range(op.numargs()):
             arg = op.getarg(i)
-            if arg in self.values:
-                box = self.values[arg].force_box()
-                op.setarg(i, box)
+            try:
+                value = self.values[arg]
+            except KeyError:
+                pass
+            else:
+                self.ensure_imported(value)
+                op.setarg(i, value.force_box())
         self.metainterp_sd.profiler.count(jitprof.OPT_OPS)
         if op.is_guard():
             self.metainterp_sd.profiler.count(jitprof.OPT_GUARDS)
             op = self.store_final_boxes_in_guard(op)
         elif op.can_raise():
             self.exception_might_have_happened = True
-        elif op.returns_bool_result():
-            self.bool_boxes[self.getvalue(op.result)] = None
         self.newoperations.append(op)
 
     def store_final_boxes_in_guard(self, op):
@@ -505,6 +571,7 @@
         args[n+1] = op.getdescr()
         return args
 
+    @specialize.argtype(0)
     def optimize_default(self, op):
         canfold = op.is_always_pure()
         if op.is_ovf():
@@ -540,12 +607,16 @@
                 return
             else:
                 self.pure_operations[args] = op
+                self.remember_emitting_pure(op)
 
         # otherwise, the operation remains
         self.emit_operation(op)
         if nextop:
             self.emit_operation(nextop)
 
+    def remember_emitting_pure(self, op):
+        pass
+    
     def constant_fold(self, op):
         argboxes = [self.get_constant_box(op.getarg(i))
                     for i in range(op.numargs())]
@@ -562,6 +633,35 @@
     def optimize_DEBUG_MERGE_POINT(self, op):
         self.emit_operation(op)
 
+    def optimize_CAST_OPAQUE_PTR(self, op):
+        value = self.getvalue(op.getarg(0))
+        self.opaque_pointers[value] = True
+        self.make_equal_to(op.result, value)
+
+    def optimize_GETARRAYITEM_GC_PURE(self, op):
+        indexvalue = self.getvalue(op.getarg(1))
+        if indexvalue.is_constant():
+            arrayvalue = self.getvalue(op.getarg(0))
+            arrayvalue.make_len_gt(MODE_ARRAY, op.getdescr(), indexvalue.box.getint())
+        self.optimize_default(op)
+
+    def optimize_STRGETITEM(self, op):
+        indexvalue = self.getvalue(op.getarg(1))
+        if indexvalue.is_constant():
+            arrayvalue = self.getvalue(op.getarg(0))
+            arrayvalue.make_len_gt(MODE_STR, op.getdescr(), indexvalue.box.getint())
+        self.optimize_default(op)
+
+    def optimize_UNICODEGETITEM(self, op):
+        indexvalue = self.getvalue(op.getarg(1))
+        if indexvalue.is_constant():
+            arrayvalue = self.getvalue(op.getarg(0))
+            arrayvalue.make_len_gt(MODE_UNICODE, op.getdescr(), indexvalue.box.getint())
+        self.optimize_default(op)
+        
+
+    
+
 dispatch_opt = make_dispatcher_method(Optimizer, 'optimize_',
         default=Optimizer.optimize_default)
 
diff --git a/pypy/jit/metainterp/optimizeopt/rewrite.py b/pypy/jit/metainterp/optimizeopt/rewrite.py
--- a/pypy/jit/metainterp/optimizeopt/rewrite.py
+++ b/pypy/jit/metainterp/optimizeopt/rewrite.py
@@ -1,10 +1,11 @@
+from pypy.jit.codewriter.effectinfo import EffectInfo
+from pypy.jit.metainterp.history import ConstInt, make_hashable_int
+from pypy.jit.metainterp.optimize import InvalidLoop
+from pypy.jit.metainterp.optimizeopt.intutils import IntBound
 from pypy.jit.metainterp.optimizeopt.optimizer import *
-from pypy.jit.metainterp.resoperation import opboolinvers, opboolreflex
-from pypy.jit.metainterp.history import ConstInt
 from pypy.jit.metainterp.optimizeopt.util import _findall, make_dispatcher_method
-from pypy.jit.metainterp.resoperation import rop, ResOperation
-from pypy.jit.codewriter.effectinfo import EffectInfo
-from pypy.jit.metainterp.optimizeopt.intutils import IntBound
+from pypy.jit.metainterp.resoperation import (opboolinvers, opboolreflex, rop,
+    ResOperation)
 from pypy.rlib.rarithmetic import highest_bit
 
 
@@ -12,9 +13,16 @@
     """Rewrite operations into equivalent, cheaper operations.
        This includes already executed operations and constants.
     """
+    def __init__(self):
+        self.loop_invariant_results = {}
+        self.loop_invariant_producer = {}
 
-    def reconstruct_for_next_iteration(self, optimizer, valuemap):
-        return self