[pypy-commit] pypy list-strategies: merged default into list-strategies branch

l.diekmann noreply at buildbot.pypy.org
Fri Sep 23 13:15:22 CEST 2011


Author: Lukas Diekmann <lukas.diekmann at uni-duesseldorf.de>
Branch: list-strategies
Changeset: r47541:8f73f060e1cb
Date: 2011-09-08 15:10 +0200
http://bitbucket.org/pypy/pypy/changeset/8f73f060e1cb/

Log:	merged default into list-strategies branch

diff too long, truncating to 10000 out of 57463 lines

diff --git a/.hgtags b/.hgtags
--- a/.hgtags
+++ b/.hgtags
@@ -1,1 +1,2 @@
 b590cf6de4190623aad9aa698694c22e614d67b9 release-1.5
+b48df0bf4e75b81d98f19ce89d4a7dc3e1dab5e5 benchmarked
diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -37,22 +37,22 @@
     Armin Rigo
     Maciej Fijalkowski
     Carl Friedrich Bolz
+    Antonio Cuni
     Amaury Forgeot d'Arc
-    Antonio Cuni
     Samuele Pedroni
     Michael Hudson
     Holger Krekel
+    Benjamin Peterson
     Christian Tismer
-    Benjamin Peterson
+    Hakan Ardo
+    Alex Gaynor
     Eric van Riet Paap
-    Anders Chrigstr&#246;m
-    H&#229;kan Ard&#246;
+    Anders Chrigstrom
+    David Schneider
     Richard Emslie
     Dan Villiom Podlaski Christiansen
     Alexander Schremmer
-    Alex Gaynor
-    David Schneider
-    Aureli&#233;n Campeas
+    Aurelien Campeas
     Anders Lehmann
     Camillo Bruni
     Niklaus Haldimann
@@ -63,16 +63,17 @@
     Bartosz Skowron
     Jakub Gustak
     Guido Wesdorp
+    Daniel Roberts
     Adrien Di Mascio
     Laura Creighton
     Ludovic Aubry
     Niko Matsakis
-    Daniel Roberts
     Jason Creighton
-    Jacob Hall&#233;n
+    Jacob Hallen
     Alex Martelli
     Anders Hammarquist
     Jan de Mooij
+    Wim Lavrijsen
     Stephan Diehl
     Michael Foord
     Stefan Schwarzer
@@ -83,9 +84,13 @@
     Alexandre Fayolle
     Marius Gedminas
     Simon Burton
+    Justin Peel
     Jean-Paul Calderone
     John Witulski
+    Lukas Diekmann
+    holger krekel
     Wim Lavrijsen
+    Dario Bertini
     Andreas St&#252;hrk
     Jean-Philippe St. Pierre
     Guido van Rossum
@@ -97,15 +102,16 @@
     Georg Brandl
     Gerald Klix
     Wanja Saatkamp
+    Ronny Pfannschmidt
     Boris Feigin
     Oscar Nierstrasz
-    Dario Bertini
     David Malcolm
     Eugene Oden
     Henry Mason
+    Sven Hager
     Lukas Renggli
+    Ilya Osadchiy
     Guenter Jantzen
-    Ronny Pfannschmidt
     Bert Freudenberg
     Amit Regmi
     Ben Young
@@ -122,8 +128,8 @@
     Jared Grubb
     Karl Bartel
     Gabriel Lavoie
+    Victor Stinner
     Brian Dorsey
-    Victor Stinner
     Stuart Williams
     Toby Watson
     Antoine Pitrou
@@ -134,19 +140,23 @@
     Jonathan David Riehl
     Elmo M&#228;ntynen
     Anders Qvist
-    Beatrice D&#252;ring
+    Beatrice During
     Alexander Sedov
+    Timo Paulssen
+    Corbin Simpson
     Vincent Legoll
+    Romain Guillebert
     Alan McIntyre
-    Romain Guillebert
     Alex Perry
     Jens-Uwe Mager
+    Simon Cross
     Dan Stromberg
-    Lukas Diekmann
+    Guillebert Romain
     Carl Meyer
     Pieter Zieschang
     Alejandro J. Cura
     Sylvain Thenault
+    Christoph Gerum
     Travis Francis Athougies
     Henrik Vendelbo
     Lutz Paelike
@@ -157,6 +167,7 @@
     Miguel de Val Borro
     Ignas Mikalajunas
     Artur Lisiecki
+    Philip Jenvey
     Joshua Gilbert
     Godefroid Chappelle
     Yusei Tahara
@@ -165,26 +176,31 @@
     Gustavo Niemeyer
     William Leslie
     Akira Li
-    Kristj&#225;n Valur J&#243;nsson
+    Kristjan Valur Jonsson
     Bobby Impollonia
+    Michael Hudson-Doyle
     Andrew Thompson
     Anders Sigfridsson
+    Floris Bruynooghe
     Jacek Generowicz
     Dan Colish
-    Sven Hager
     Zooko Wilcox-O Hearn
+    Dan Villiom Podlaski Christiansen
     Anders Hammarquist
+    Chris Lambacher
     Dinu Gherman
     Dan Colish
+    Brett Cannon
     Daniel Neuh&#228;user
     Michael Chermside
     Konrad Delong
     Anna Ravencroft
     Greg Price
     Armin Ronacher
+    Christian Muirhead
     Jim Baker
-    Philip Jenvey
     Rodrigo Ara&#250;jo
+    Romain Guillebert
 
     Heinrich-Heine University, Germany 
     Open End AB (formerly AB Strakt), Sweden
diff --git a/ctypes_configure/configure.py b/ctypes_configure/configure.py
--- a/ctypes_configure/configure.py
+++ b/ctypes_configure/configure.py
@@ -559,7 +559,9 @@
 C_HEADER = """
 #include <stdio.h>
 #include <stddef.h>   /* for offsetof() */
-#include <stdint.h>   /* FreeBSD: for uint64_t */
+#ifndef _WIN32
+#  include <stdint.h>   /* FreeBSD: for uint64_t */
+#endif
 
 void dump(char* key, int value) {
     printf("%s: %d\\n", key, value);
diff --git a/ctypes_configure/stdoutcapture.py b/ctypes_configure/stdoutcapture.py
--- a/ctypes_configure/stdoutcapture.py
+++ b/ctypes_configure/stdoutcapture.py
@@ -15,6 +15,15 @@
             not hasattr(os, 'fdopen')):
             self.dummy = 1
         else:
+            try:
+                self.tmpout = os.tmpfile()
+                if mixed_out_err:
+                    self.tmperr = self.tmpout
+                else:
+                    self.tmperr = os.tmpfile()
+            except OSError:     # bah?  on at least one Windows box
+                self.dummy = 1
+                return
             self.dummy = 0
             # make new stdout/stderr files if needed
             self.localoutfd = os.dup(1)
@@ -29,11 +38,6 @@
                 sys.stderr = os.fdopen(self.localerrfd, 'w', 0)
             else:
                 self.saved_stderr = None
-            self.tmpout = os.tmpfile()
-            if mixed_out_err:
-                self.tmperr = self.tmpout
-            else:
-                self.tmperr = os.tmpfile()
             os.dup2(self.tmpout.fileno(), 1)
             os.dup2(self.tmperr.fileno(), 2)
 
diff --git a/lib-python/conftest.py b/lib-python/conftest.py
--- a/lib-python/conftest.py
+++ b/lib-python/conftest.py
@@ -154,18 +154,18 @@
     RegrTest('test_cmd.py'),
     RegrTest('test_cmd_line_script.py'),
     RegrTest('test_codeccallbacks.py', core=True),
-    RegrTest('test_codecencodings_cn.py'),
-    RegrTest('test_codecencodings_hk.py'),
-    RegrTest('test_codecencodings_jp.py'),
-    RegrTest('test_codecencodings_kr.py'),
-    RegrTest('test_codecencodings_tw.py'),
+    RegrTest('test_codecencodings_cn.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_hk.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_jp.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_kr.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_tw.py', usemodules='_multibytecodec'),
 
-    RegrTest('test_codecmaps_cn.py'),
-    RegrTest('test_codecmaps_hk.py'),
-    RegrTest('test_codecmaps_jp.py'),
-    RegrTest('test_codecmaps_kr.py'),
-    RegrTest('test_codecmaps_tw.py'),
-    RegrTest('test_codecs.py', core=True),
+    RegrTest('test_codecmaps_cn.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_hk.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_jp.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_kr.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_tw.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecs.py', core=True, usemodules='_multibytecodec'),
     RegrTest('test_codeop.py', core=True),
     RegrTest('test_coercion.py', core=True),
     RegrTest('test_collections.py'),
@@ -314,7 +314,7 @@
     RegrTest('test_mmap.py'),
     RegrTest('test_module.py', core=True),
     RegrTest('test_modulefinder.py'),
-    RegrTest('test_multibytecodec.py'),
+    RegrTest('test_multibytecodec.py', usemodules='_multibytecodec'),
     RegrTest('test_multibytecodec_support.py', skip="not a test"),
     RegrTest('test_multifile.py'),
     RegrTest('test_multiprocessing.py', skip='FIXME leaves subprocesses'),
@@ -359,7 +359,7 @@
     RegrTest('test_property.py', core=True),
     RegrTest('test_pstats.py'),
     RegrTest('test_pty.py', skip="unsupported extension module"),
-    RegrTest('test_pwd.py', skip=skip_win32),
+    RegrTest('test_pwd.py', usemodules="pwd", skip=skip_win32),
     RegrTest('test_py3kwarn.py'),
     RegrTest('test_pyclbr.py'),
     RegrTest('test_pydoc.py'),
diff --git a/lib-python/modified-2.7/ctypes/__init__.py b/lib-python/modified-2.7/ctypes/__init__.py
--- a/lib-python/modified-2.7/ctypes/__init__.py
+++ b/lib-python/modified-2.7/ctypes/__init__.py
@@ -489,9 +489,12 @@
         _flags_ = _FUNCFLAG_CDECL | _FUNCFLAG_PYTHONAPI
     return CFunctionType
 
-_cast = PYFUNCTYPE(py_object, c_void_p, py_object, py_object)(_cast_addr)
 def cast(obj, typ):
-    return _cast(obj, obj, typ)
+    try:
+        c_void_p.from_param(obj)
+    except TypeError, e:
+        raise ArgumentError(str(e))
+    return _cast_addr(obj, obj, typ)
 
 _string_at = PYFUNCTYPE(py_object, c_void_p, c_int)(_string_at_addr)
 def string_at(ptr, size=-1):
diff --git a/lib-python/modified-2.7/ctypes/util.py b/lib-python/modified-2.7/ctypes/util.py
--- a/lib-python/modified-2.7/ctypes/util.py
+++ b/lib-python/modified-2.7/ctypes/util.py
@@ -72,8 +72,8 @@
         return name
 
 if os.name == "posix" and sys.platform == "darwin":
-    from ctypes.macholib.dyld import dyld_find as _dyld_find
     def find_library(name):
+        from ctypes.macholib.dyld import dyld_find as _dyld_find
         possible = ['lib%s.dylib' % name,
                     '%s.dylib' % name,
                     '%s.framework/%s' % (name, name)]
diff --git a/lib-python/modified-2.7/distutils/sysconfig_pypy.py b/lib-python/modified-2.7/distutils/sysconfig_pypy.py
--- a/lib-python/modified-2.7/distutils/sysconfig_pypy.py
+++ b/lib-python/modified-2.7/distutils/sysconfig_pypy.py
@@ -116,6 +116,12 @@
     if compiler.compiler_type == "unix":
         compiler.compiler_so.extend(['-fPIC', '-Wimplicit'])
         compiler.shared_lib_extension = get_config_var('SO')
+        if "CFLAGS" in os.environ:
+            cflags = os.environ["CFLAGS"]
+            compiler.compiler.append(cflags)
+            compiler.compiler_so.append(cflags)
+            compiler.linker_so.append(cflags)
+
 
 from sysconfig_cpython import (
     parse_makefile, _variable_rx, expand_makefile_vars)
diff --git a/lib-python/modified-2.7/distutils/unixccompiler.py b/lib-python/modified-2.7/distutils/unixccompiler.py
--- a/lib-python/modified-2.7/distutils/unixccompiler.py
+++ b/lib-python/modified-2.7/distutils/unixccompiler.py
@@ -324,7 +324,7 @@
             # On OSX users can specify an alternate SDK using
             # '-isysroot', calculate the SDK root if it is specified
             # (and use it further on)
-            cflags = sysconfig.get_config_var('CFLAGS')
+            cflags = sysconfig.get_config_var('CFLAGS') or ''
             m = re.search(r'-isysroot\s+(\S+)', cflags)
             if m is None:
                 sysroot = '/'
diff --git a/lib-python/modified-2.7/gzip.py b/lib-python/modified-2.7/gzip.py
new file mode 100644
--- /dev/null
+++ b/lib-python/modified-2.7/gzip.py
@@ -0,0 +1,514 @@
+"""Functions that read and write gzipped files.
+
+The user of the file doesn't have to worry about the compression,
+but random access is not allowed."""
+
+# based on Andrew Kuchling's minigzip.py distributed with the zlib module
+
+import struct, sys, time, os
+import zlib
+import io
+import __builtin__
+
+__all__ = ["GzipFile","open"]
+
+FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
+
+READ, WRITE = 1, 2
+
+def write32u(output, value):
+    # The L format writes the bit pattern correctly whether signed
+    # or unsigned.
+    output.write(struct.pack("<L", value))
+
+def read32(input):
+    return struct.unpack("<I", input.read(4))[0]
+
+def open(filename, mode="rb", compresslevel=9):
+    """Shorthand for GzipFile(filename, mode, compresslevel).
+
+    The filename argument is required; mode defaults to 'rb'
+    and compresslevel defaults to 9.
+
+    """
+    return GzipFile(filename, mode, compresslevel)
+
+class GzipFile(io.BufferedIOBase):
+    """The GzipFile class simulates most of the methods of a file object with
+    the exception of the readinto() and truncate() methods.
+
+    """
+
+    myfileobj = None
+    max_read_chunk = 10 * 1024 * 1024   # 10Mb
+
+    def __init__(self, filename=None, mode=None,
+                 compresslevel=9, fileobj=None, mtime=None):
+        """Constructor for the GzipFile class.
+
+        At least one of fileobj and filename must be given a
+        non-trivial value.
+
+        The new class instance is based on fileobj, which can be a regular
+        file, a StringIO object, or any other object which simulates a file.
+        It defaults to None, in which case filename is opened to provide
+        a file object.
+
+        When fileobj is not None, the filename argument is only used to be
+        included in the gzip file header, which may includes the original
+        filename of the uncompressed file.  It defaults to the filename of
+        fileobj, if discernible; otherwise, it defaults to the empty string,
+        and in this case the original filename is not included in the header.
+
+        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
+        depending on whether the file will be read or written.  The default
+        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
+        Be aware that only the 'rb', 'ab', and 'wb' values should be used
+        for cross-platform portability.
+
+        The compresslevel argument is an integer from 1 to 9 controlling the
+        level of compression; 1 is fastest and produces the least compression,
+        and 9 is slowest and produces the most compression.  The default is 9.
+
+        The mtime argument is an optional numeric timestamp to be written
+        to the stream when compressing.  All gzip compressed streams
+        are required to contain a timestamp.  If omitted or None, the
+        current time is used.  This module ignores the timestamp when
+        decompressing; however, some programs, such as gunzip, make use
+        of it.  The format of the timestamp is the same as that of the
+        return value of time.time() and of the st_mtime member of the
+        object returned by os.stat().
+
+        """
+
+        # guarantee the file is opened in binary mode on platforms
+        # that care about that sort of thing
+        if mode and 'b' not in mode:
+            mode += 'b'
+        if fileobj is None:
+            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
+        if filename is None:
+            if hasattr(fileobj, 'name'): filename = fileobj.name
+            else: filename = ''
+        if mode is None:
+            if hasattr(fileobj, 'mode'): mode = fileobj.mode
+            else: mode = 'rb'
+
+        if mode[0:1] == 'r':
+            self.mode = READ
+            # Set flag indicating start of a new member
+            self._new_member = True
+            # Buffer data read from gzip file. extrastart is offset in
+            # stream where buffer starts. extrasize is number of
+            # bytes remaining in buffer from current stream position.
+            self.extrabuf = ""
+            self.extrasize = 0
+            self.extrastart = 0
+            self.name = filename
+            # Starts small, scales exponentially
+            self.min_readsize = 100
+
+        elif mode[0:1] == 'w' or mode[0:1] == 'a':
+            self.mode = WRITE
+            self._init_write(filename)
+            self.compress = zlib.compressobj(compresslevel,
+                                             zlib.DEFLATED,
+                                             -zlib.MAX_WBITS,
+                                             zlib.DEF_MEM_LEVEL,
+                                             0)
+        else:
+            raise IOError, "Mode " + mode + " not supported"
+
+        self.fileobj = fileobj
+        self.offset = 0
+        self.mtime = mtime
+
+        if self.mode == WRITE:
+            self._write_gzip_header()
+
+    @property
+    def filename(self):
+        import warnings
+        warnings.warn("use the name attribute", DeprecationWarning, 2)
+        if self.mode == WRITE and self.name[-3:] != ".gz":
+            return self.name + ".gz"
+        return self.name
+
+    def __repr__(self):
+        s = repr(self.fileobj)
+        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
+
+    def _check_closed(self):
+        """Raises a ValueError if the underlying file object has been closed.
+
+        """
+        if self.closed:
+            raise ValueError('I/O operation on closed file.')
+
+    def _init_write(self, filename):
+        self.name = filename
+        self.crc = zlib.crc32("") & 0xffffffffL
+        self.size = 0
+        self.writebuf = []
+        self.bufsize = 0
+
+    def _write_gzip_header(self):
+        self.fileobj.write('\037\213')             # magic header
+        self.fileobj.write('\010')                 # compression method
+        fname = os.path.basename(self.name)
+        if fname.endswith(".gz"):
+            fname = fname[:-3]
+        flags = 0
+        if fname:
+            flags = FNAME
+        self.fileobj.write(chr(flags))
+        mtime = self.mtime
+        if mtime is None:
+            mtime = time.time()
+        write32u(self.fileobj, long(mtime))
+        self.fileobj.write('\002')
+        self.fileobj.write('\377')
+        if fname:
+            self.fileobj.write(fname + '\000')
+
+    def _init_read(self):
+        self.crc = zlib.crc32("") & 0xffffffffL
+        self.size = 0
+
+    def _read_gzip_header(self):
+        magic = self.fileobj.read(2)
+        if magic != '\037\213':
+            raise IOError, 'Not a gzipped file'
+        method = ord( self.fileobj.read(1) )
+        if method != 8:
+            raise IOError, 'Unknown compression method'
+        flag = ord( self.fileobj.read(1) )
+        self.mtime = read32(self.fileobj)
+        # extraflag = self.fileobj.read(1)
+        # os = self.fileobj.read(1)
+        self.fileobj.read(2)
+
+        if flag & FEXTRA:
+            # Read & discard the extra field, if present
+            xlen = ord(self.fileobj.read(1))
+            xlen = xlen + 256*ord(self.fileobj.read(1))
+            self.fileobj.read(xlen)
+        if flag & FNAME:
+            # Read and discard a null-terminated string containing the filename
+            while True:
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
+        if flag & FCOMMENT:
+            # Read and discard a null-terminated string containing a comment
+            while True:
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
+        if flag & FHCRC:
+            self.fileobj.read(2)     # Read & discard the 16-bit header CRC
+
+    def write(self,data):
+        self._check_closed()
+        if self.mode != WRITE:
+            import errno
+            raise IOError(errno.EBADF, "write() on read-only GzipFile object")
+
+        if self.fileobj is None:
+            raise ValueError, "write() on closed GzipFile object"
+
+        # Convert data type if called by io.BufferedWriter.
+        if isinstance(data, memoryview):
+            data = data.tobytes()
+
+        if len(data) > 0:
+            self.size = self.size + len(data)
+            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
+            self.fileobj.write( self.compress.compress(data) )
+            self.offset += len(data)
+
+        return len(data)
+
+    def read(self, size=-1):
+        self._check_closed()
+        if self.mode != READ:
+            import errno
+            raise IOError(errno.EBADF, "read() on write-only GzipFile object")
+
+        if self.extrasize <= 0 and self.fileobj is None:
+            return ''
+
+        readsize = 1024
+        if size < 0:        # get the whole thing
+            try:
+                while True:
+                    self._read(readsize)
+                    readsize = min(self.max_read_chunk, readsize * 2)
+            except EOFError:
+                size = self.extrasize
+        elif size == 0:
+            return ""
+        else:               # just get some more of it
+            try:
+                while size > self.extrasize:
+                    self._read(readsize)
+                    readsize = min(self.max_read_chunk, readsize * 2)
+            except EOFError:
+                if size > self.extrasize:
+                    size = self.extrasize
+
+        offset = self.offset - self.extrastart
+        chunk = self.extrabuf[offset: offset + size]
+        self.extrasize = self.extrasize - size
+
+        self.offset += size
+        return chunk
+
+    def _unread(self, buf):
+        self.extrasize = len(buf) + self.extrasize
+        self.offset -= len(buf)
+
+    def _read(self, size=1024):
+        if self.fileobj is None:
+            raise EOFError, "Reached EOF"
+
+        if self._new_member:
+            # If the _new_member flag is set, we have to
+            # jump to the next member, if there is one.
+            #
+            # First, check if we're at the end of the file;
+            # if so, it's time to stop; no more members to read.
+            pos = self.fileobj.tell()   # Save current position
+            self.fileobj.seek(0, 2)     # Seek to end of file
+            if pos == self.fileobj.tell():
+                raise EOFError, "Reached EOF"
+            else:
+                self.fileobj.seek( pos ) # Return to original position
+
+            self._init_read()
+            self._read_gzip_header()
+            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
+            self._new_member = False
+
+        # Read a chunk of data from the file
+        buf = self.fileobj.read(size)
+
+        # If the EOF has been reached, flush the decompression object
+        # and mark this object as finished.
+
+        if buf == "":
+            uncompress = self.decompress.flush()
+            self._read_eof()
+            self._add_read_data( uncompress )
+            raise EOFError, 'Reached EOF'
+
+        uncompress = self.decompress.decompress(buf)
+        self._add_read_data( uncompress )
+
+        if self.decompress.unused_data != "":
+            # Ending case: we've come to the end of a member in the file,
+            # so seek back to the start of the unused data, finish up
+            # this member, and read a new gzip header.
+            # (The number of bytes to seek back is the length of the unused
+            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
+            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
+
+            # Check the CRC and file size, and set the flag so we read
+            # a new member on the next call
+            self._read_eof()
+            self._new_member = True
+
+    def _add_read_data(self, data):
+        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
+        offset = self.offset - self.extrastart
+        self.extrabuf = self.extrabuf[offset:] + data
+        self.extrasize = self.extrasize + len(data)
+        self.extrastart = self.offset
+        self.size = self.size + len(data)
+
+    def _read_eof(self):
+        # We've read to the end of the file, so we have to rewind in order
+        # to reread the 8 bytes containing the CRC and the file size.
+        # We check the that the computed CRC and size of the
+        # uncompressed data matches the stored values.  Note that the size
+        # stored is the true file size mod 2**32.
+        self.fileobj.seek(-8, 1)
+        crc32 = read32(self.fileobj)
+        isize = read32(self.fileobj)  # may exceed 2GB
+        if crc32 != self.crc:
+            raise IOError("CRC check failed %s != %s" % (hex(crc32),
+                                                         hex(self.crc)))
+        elif isize != (self.size & 0xffffffffL):
+            raise IOError, "Incorrect length of data produced"
+
+        # Gzip files can be padded with zeroes and still have archives.
+        # Consume all zero bytes and set the file position to the first
+        # non-zero byte. See http://www.gzip.org/#faq8
+        c = "\x00"
+        while c == "\x00":
+            c = self.fileobj.read(1)
+        if c:
+            self.fileobj.seek(-1, 1)
+
+    @property
+    def closed(self):
+        return self.fileobj is None
+
+    def close(self):
+        if self.fileobj is None:
+            return
+        if self.mode == WRITE:
+            self.fileobj.write(self.compress.flush())
+            write32u(self.fileobj, self.crc)
+            # self.size may exceed 2GB, or even 4GB
+            write32u(self.fileobj, self.size & 0xffffffffL)
+            self.fileobj = None
+        elif self.mode == READ:
+            self.fileobj = None
+        if self.myfileobj:
+            self.myfileobj.close()
+            self.myfileobj = None
+
+    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
+        self._check_closed()
+        if self.mode == WRITE:
+            # Ensure the compressor's buffer is flushed
+            self.fileobj.write(self.compress.flush(zlib_mode))
+            self.fileobj.flush()
+
+    def fileno(self):
+        """Invoke the underlying file object's fileno() method.
+
+        This will raise AttributeError if the underlying file object
+        doesn't support fileno().
+        """
+        return self.fileobj.fileno()
+
+    def rewind(self):
+        '''Return the uncompressed stream file position indicator to the
+        beginning of the file'''
+        if self.mode != READ:
+            raise IOError("Can't rewind in write mode")
+        self.fileobj.seek(0)
+        self._new_member = True
+        self.extrabuf = ""
+        self.extrasize = 0
+        self.extrastart = 0
+        self.offset = 0
+
+    def readable(self):
+        return self.mode == READ
+
+    def writable(self):
+        return self.mode == WRITE
+
+    def seekable(self):
+        return True
+
+    def seek(self, offset, whence=0):
+        if whence:
+            if whence == 1:
+                offset = self.offset + offset
+            else:
+                raise ValueError('Seek from end not supported')
+        if self.mode == WRITE:
+            if offset < self.offset:
+                raise IOError('Negative seek in write mode')
+            count = offset - self.offset
+            for i in range(count // 1024):
+                self.write(1024 * '\0')
+            self.write((count % 1024) * '\0')
+        elif self.mode == READ:
+            if offset == self.offset:
+                self.read(0) # to make sure that this file is open
+                return self.offset
+            if offset < self.offset:
+                # for negative seek, rewind and do positive seek
+                self.rewind()
+            count = offset - self.offset
+            for i in range(count // 1024):
+                self.read(1024)
+            self.read(count % 1024)
+
+        return self.offset
+
+    def readline(self, size=-1):
+        if size < 0:
+            # Shortcut common case - newline found in buffer.
+            offset = self.offset - self.extrastart
+            i = self.extrabuf.find('\n', offset) + 1
+            if i > 0:
+                self.extrasize -= i - offset
+                self.offset += i - offset
+                return self.extrabuf[offset: i]
+
+            size = sys.maxint
+            readsize = self.min_readsize
+        else:
+            readsize = size
+        bufs = []
+        while size != 0:
+            c = self.read(readsize)
+            i = c.find('\n')
+
+            # We set i=size to break out of the loop under two
+            # conditions: 1) there's no newline, and the chunk is
+            # larger than size, or 2) there is a newline, but the
+            # resulting line would be longer than 'size'.
+            if (size <= i) or (i == -1 and len(c) > size):
+                i = size - 1
+
+            if i >= 0 or c == '':
+                bufs.append(c[:i + 1])    # Add portion of last chunk
+                self._unread(c[i + 1:])   # Push back rest of chunk
+                break
+
+            # Append chunk to list, decrease 'size',
+            bufs.append(c)
+            size = size - len(c)
+            readsize = min(size, readsize * 2)
+        if readsize > self.min_readsize:
+            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
+        return ''.join(bufs) # Return resulting line
+
+
+def _test():
+    # Act like gzip; with -d, act like gunzip.
+    # The input file is not deleted, however, nor are any other gzip
+    # options or features supported.
+    args = sys.argv[1:]
+    decompress = args and args[0] == "-d"
+    if decompress:
+        args = args[1:]
+    if not args:
+        args = ["-"]
+    for arg in args:
+        if decompress:
+            if arg == "-":
+                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
+                g = sys.stdout
+            else:
+                if arg[-3:] != ".gz":
+                    print "filename doesn't end in .gz:", repr(arg)
+                    continue
+                f = open(arg, "rb")
+                g = __builtin__.open(arg[:-3], "wb")
+        else:
+            if arg == "-":
+                f = sys.stdin
+                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
+            else:
+                f = __builtin__.open(arg, "rb")
+                g = open(arg + ".gz", "wb")
+        while True:
+            chunk = f.read(1024)
+            if not chunk:
+                break
+            g.write(chunk)
+        if g is not sys.stdout:
+            g.close()
+        if f is not sys.stdin:
+            f.close()
+
+if __name__ == '__main__':
+    _test()
diff --git a/lib-python/modified-2.7/pickle.py b/lib-python/modified-2.7/pickle.py
--- a/lib-python/modified-2.7/pickle.py
+++ b/lib-python/modified-2.7/pickle.py
@@ -168,7 +168,7 @@
 
 # Pickling machinery
 
-class Pickler:
+class Pickler(object):
 
     def __init__(self, file, protocol=None):
         """This takes a file-like object for writing a pickle data stream.
diff --git a/lib-python/modified-2.7/sqlite3/test/regression.py b/lib-python/modified-2.7/sqlite3/test/regression.py
--- a/lib-python/modified-2.7/sqlite3/test/regression.py
+++ b/lib-python/modified-2.7/sqlite3/test/regression.py
@@ -274,6 +274,18 @@
         cur.execute("UPDATE foo SET id = 3 WHERE id = 1")
         self.assertEqual(cur.description, None)
 
+    def CheckStatementCache(self):
+        cur = self.con.cursor()
+        cur.execute("CREATE TABLE foo (id INTEGER)")
+        values = [(i,) for i in xrange(5)]
+        cur.executemany("INSERT INTO foo (id) VALUES (?)", values)
+
+        cur.execute("SELECT id FROM foo")
+        self.assertEqual(list(cur), values)
+        self.con.commit()
+        cur.execute("SELECT id FROM foo")
+        self.assertEqual(list(cur), values)
+
 def suite():
     regression_suite = unittest.makeSuite(RegressionTests, "Check")
     return unittest.TestSuite((regression_suite,))
diff --git a/lib-python/modified-2.7/tarfile.py b/lib-python/modified-2.7/tarfile.py
--- a/lib-python/modified-2.7/tarfile.py
+++ b/lib-python/modified-2.7/tarfile.py
@@ -252,8 +252,8 @@
        the high bit set. So we calculate two checksums, unsigned and
        signed.
     """
-    unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
-    signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
+    unsigned_chksum = 256 + sum(struct.unpack("148B8x356B", buf[:512]))
+    signed_chksum = 256 + sum(struct.unpack("148b8x356b", buf[:512]))
     return unsigned_chksum, signed_chksum
 
 def copyfileobj(src, dst, length=None):
@@ -265,7 +265,6 @@
     if length is None:
         shutil.copyfileobj(src, dst)
         return
-
     BUFSIZE = 16 * 1024
     blocks, remainder = divmod(length, BUFSIZE)
     for b in xrange(blocks):
@@ -802,19 +801,19 @@
         if self.closed:
             raise ValueError("I/O operation on closed file")
 
-        buf = ""
         if self.buffer:
             if size is None:
-                buf = self.buffer
+                buf = self.buffer + self.fileobj.read()
                 self.buffer = ""
             else:
                 buf = self.buffer[:size]
                 self.buffer = self.buffer[size:]
-
-        if size is None:
-            buf += self.fileobj.read()
+                buf += self.fileobj.read(size - len(buf))
         else:
-            buf += self.fileobj.read(size - len(buf))
+            if size is None:
+                buf = self.fileobj.read()
+            else:
+                buf = self.fileobj.read(size)
 
         self.position += len(buf)
         return buf
diff --git a/lib-python/modified-2.7/test/regrtest.py b/lib-python/modified-2.7/test/regrtest.py
--- a/lib-python/modified-2.7/test/regrtest.py
+++ b/lib-python/modified-2.7/test/regrtest.py
@@ -1403,7 +1403,26 @@
         test_zipimport
         test_zlib
         """,
-    'openbsd3':
+    'openbsd4':
+        """
+        test_ascii_formatd
+        test_bsddb
+        test_bsddb3
+        test_ctypes
+        test_dl
+        test_epoll
+        test_gdbm
+        test_locale
+        test_normalization
+        test_ossaudiodev
+        test_pep277
+        test_tcl
+        test_tk
+        test_ttk_guionly
+        test_ttk_textonly
+        test_multiprocessing
+        """,
+    'openbsd5':
         """
         test_ascii_formatd
         test_bsddb
diff --git a/lib-python/modified-2.7/test/test_bz2.py b/lib-python/modified-2.7/test/test_bz2.py
--- a/lib-python/modified-2.7/test/test_bz2.py
+++ b/lib-python/modified-2.7/test/test_bz2.py
@@ -50,6 +50,7 @@
         self.filename = TESTFN
 
     def tearDown(self):
+        test_support.gc_collect()
         if os.path.isfile(self.filename):
             os.unlink(self.filename)
 
diff --git a/lib-python/modified-2.7/test/test_fcntl.py b/lib-python/modified-2.7/test/test_fcntl.py
new file mode 100644
--- /dev/null
+++ b/lib-python/modified-2.7/test/test_fcntl.py
@@ -0,0 +1,108 @@
+"""Test program for the fcntl C module.
+
+OS/2+EMX doesn't support the file locking operations.
+
+"""
+import os
+import struct
+import sys
+import unittest
+from test.test_support import (verbose, TESTFN, unlink, run_unittest,
+    import_module)
+
+# Skip test if no fnctl module.
+fcntl = import_module('fcntl')
+
+
+# TODO - Write tests for flock() and lockf().
+
+def get_lockdata():
+    if sys.platform.startswith('atheos'):
+        start_len = "qq"
+    else:
+        try:
+            os.O_LARGEFILE
+        except AttributeError:
+            start_len = "ll"
+        else:
+            start_len = "qq"
+
+    if sys.platform in ('netbsd1', 'netbsd2', 'netbsd3',
+                        'Darwin1.2', 'darwin',
+                        'freebsd2', 'freebsd3', 'freebsd4', 'freebsd5',
+                        'freebsd6', 'freebsd7', 'freebsd8',
+                        'bsdos2', 'bsdos3', 'bsdos4',
+                        'openbsd', 'openbsd2', 'openbsd3', 'openbsd4', 'openbsd5'):
+        if struct.calcsize('l') == 8:
+            off_t = 'l'
+            pid_t = 'i'
+        else:
+            off_t = 'lxxxx'
+            pid_t = 'l'
+        lockdata = struct.pack(off_t + off_t + pid_t + 'hh', 0, 0, 0,
+                               fcntl.F_WRLCK, 0)
+    elif sys.platform in ['aix3', 'aix4', 'hp-uxB', 'unixware7']:
+        lockdata = struct.pack('hhlllii', fcntl.F_WRLCK, 0, 0, 0, 0, 0, 0)
+    elif sys.platform in ['os2emx']:
+        lockdata = None
+    else:
+        lockdata = struct.pack('hh'+start_len+'hh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+    if lockdata:
+        if verbose:
+            print 'struct.pack: ', repr(lockdata)
+    return lockdata
+
+lockdata = get_lockdata()
+
+
+class TestFcntl(unittest.TestCase):
+
+    def setUp(self):
+        self.f = None
+
+    def tearDown(self):
+        if self.f and not self.f.closed:
+            self.f.close()
+        unlink(TESTFN)
+
+    def test_fcntl_fileno(self):
+        # the example from the library docs
+        self.f = open(TESTFN, 'w')
+        rv = fcntl.fcntl(self.f.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
+        if verbose:
+            print 'Status from fcntl with O_NONBLOCK: ', rv
+        if sys.platform not in ['os2emx']:
+            rv = fcntl.fcntl(self.f.fileno(), fcntl.F_SETLKW, lockdata)
+            if verbose:
+                print 'String from fcntl with F_SETLKW: ', repr(rv)
+        self.f.close()
+
+    def test_fcntl_file_descriptor(self):
+        # again, but pass the file rather than numeric descriptor
+        self.f = open(TESTFN, 'w')
+        rv = fcntl.fcntl(self.f, fcntl.F_SETFL, os.O_NONBLOCK)
+        if sys.platform not in ['os2emx']:
+            rv = fcntl.fcntl(self.f, fcntl.F_SETLKW, lockdata)
+        self.f.close()
+
+    def test_fcntl_64_bit(self):
+        # Issue #1309352: fcntl shouldn't fail when the third arg fits in a
+        # C 'long' but not in a C 'int'.
+        try:
+            cmd = fcntl.F_NOTIFY
+            # This flag is larger than 2**31 in 64-bit builds
+            flags = fcntl.DN_MULTISHOT
+        except AttributeError:
+            self.skipTest("F_NOTIFY or DN_MULTISHOT unavailable")
+        fd = os.open(os.path.dirname(os.path.abspath(TESTFN)), os.O_RDONLY)
+        try:
+            fcntl.fcntl(fd, cmd, flags)
+        finally:
+            os.close(fd)
+
+
+def test_main():
+    run_unittest(TestFcntl)
+
+if __name__ == '__main__':
+    test_main()
diff --git a/lib-python/modified-2.7/test/test_multibytecodec.py b/lib-python/modified-2.7/test/test_multibytecodec.py
--- a/lib-python/modified-2.7/test/test_multibytecodec.py
+++ b/lib-python/modified-2.7/test/test_multibytecodec.py
@@ -148,7 +148,8 @@
 class Test_StreamReader(unittest.TestCase):
     def test_bug1728403(self):
         try:
-            open(TESTFN, 'w').write('\xa1')
+            with open(TESTFN, 'w') as f:
+                f.write('\xa1')
             f = codecs.open(TESTFN, encoding='cp949')
             self.assertRaises(UnicodeDecodeError, f.read, 2)
         finally:
diff --git a/lib-python/2.7/test/test_sets.py b/lib-python/modified-2.7/test/test_sets.py
copy from lib-python/2.7/test/test_sets.py
copy to lib-python/modified-2.7/test/test_sets.py
--- a/lib-python/2.7/test/test_sets.py
+++ b/lib-python/modified-2.7/test/test_sets.py
@@ -686,7 +686,9 @@
         set_list = sorted(self.set)
         self.assertEqual(len(dup_list), len(set_list))
         for i, el in enumerate(dup_list):
-            self.assertIs(el, set_list[i])
+            # Object identity is not guarnteed for immutable objects, so we
+            # can't use assertIs here.
+            self.assertEqual(el, set_list[i])
 
     def test_deep_copy(self):
         dup = copy.deepcopy(self.set)
diff --git a/lib-python/2.7/test/test_tarfile.py b/lib-python/modified-2.7/test/test_tarfile.py
copy from lib-python/2.7/test/test_tarfile.py
copy to lib-python/modified-2.7/test/test_tarfile.py
--- a/lib-python/2.7/test/test_tarfile.py
+++ b/lib-python/modified-2.7/test/test_tarfile.py
@@ -169,6 +169,7 @@
         except tarfile.ReadError:
             self.fail("tarfile.open() failed on empty archive")
         self.assertListEqual(tar.getmembers(), [])
+        tar.close()
 
     def test_null_tarfile(self):
         # Test for issue6123: Allow opening empty archives.
@@ -207,16 +208,21 @@
         fobj = open(self.tarname, "rb")
         tar = tarfile.open(fileobj=fobj, mode=self.mode)
         self.assertEqual(tar.name, os.path.abspath(fobj.name))
+        tar.close()
 
     def test_no_name_attribute(self):
-        data = open(self.tarname, "rb").read()
+        f = open(self.tarname, "rb")
+        data = f.read()
+        f.close()
         fobj = StringIO.StringIO(data)
         self.assertRaises(AttributeError, getattr, fobj, "name")
         tar = tarfile.open(fileobj=fobj, mode=self.mode)
         self.assertEqual(tar.name, None)
 
     def test_empty_name_attribute(self):
-        data = open(self.tarname, "rb").read()
+        f = open(self.tarname, "rb")
+        data = f.read()
+        f.close()
         fobj = StringIO.StringIO(data)
         fobj.name = ""
         tar = tarfile.open(fileobj=fobj, mode=self.mode)
@@ -515,6 +521,7 @@
         self.tar = tarfile.open(self.tarname, mode=self.mode, encoding="iso8859-1")
         tarinfo = self.tar.getmember("pax/umlauts-&#65533;&#65533;&#65533;&#65533;&#65533;&#65533;&#65533;")
         self._test_member(tarinfo, size=7011, chksum=md5_regtype)
+        self.tar.close()
 
 
 class LongnameTest(ReadTest):
@@ -675,6 +682,7 @@
             tar = tarfile.open(tmpname, self.mode)
             tarinfo = tar.gettarinfo(path)
             self.assertEqual(tarinfo.size, 0)
+            tar.close()
         finally:
             os.rmdir(path)
 
@@ -692,6 +700,7 @@
                 tar.gettarinfo(target)
                 tarinfo = tar.gettarinfo(link)
                 self.assertEqual(tarinfo.size, 0)
+                tar.close()
             finally:
                 os.remove(target)
                 os.remove(link)
@@ -704,6 +713,7 @@
                 tar = tarfile.open(tmpname, self.mode)
                 tarinfo = tar.gettarinfo(path)
                 self.assertEqual(tarinfo.size, 0)
+                tar.close()
             finally:
                 os.remove(path)
 
@@ -722,6 +732,7 @@
         tar.add(dstname)
         os.chdir(cwd)
         self.assertTrue(tar.getnames() == [], "added the archive to itself")
+        tar.close()
 
     def test_exclude(self):
         tempdir = os.path.join(TEMPDIR, "exclude")
@@ -742,6 +753,7 @@
             tar = tarfile.open(tmpname, "r")
             self.assertEqual(len(tar.getmembers()), 1)
             self.assertEqual(tar.getnames()[0], "empty_dir")
+            tar.close()
         finally:
             shutil.rmtree(tempdir)
 
@@ -859,7 +871,9 @@
             fobj.close()
         elif self.mode.endswith("bz2"):
             dec = bz2.BZ2Decompressor()
-            data = open(tmpname, "rb").read()
+            f = open(tmpname, "rb")
+            data = f.read()
+            f.close()
             data = dec.decompress(data)
             self.assertTrue(len(dec.unused_data) == 0,
                     "found trailing data")
@@ -938,6 +952,7 @@
                 "unable to read longname member")
         self.assertEqual(tarinfo.linkname, member.linkname,
                 "unable to read longname member")
+        tar.close()
 
     def test_longname_1023(self):
         self._test(("longnam/" * 127) + "longnam")
@@ -1030,6 +1045,7 @@
         else:
             n = tar.getmembers()[0].name
             self.assertTrue(name == n, "PAX longname creation failed")
+        tar.close()
 
     def test_pax_global_header(self):
         pax_headers = {
@@ -1058,6 +1074,7 @@
                     tarfile.PAX_NUMBER_FIELDS[key](val)
                 except (TypeError, ValueError):
                     self.fail("unable to convert pax header field")
+        tar.close()
 
     def test_pax_extended_header(self):
         # The fields from the pax header have priority over the
@@ -1077,6 +1094,7 @@
         self.assertEqual(t.pax_headers, pax_headers)
         self.assertEqual(t.name, "foo")
         self.assertEqual(t.uid, 123)
+        tar.close()
 
 
 class UstarUnicodeTest(unittest.TestCase):
@@ -1120,6 +1138,7 @@
         tarinfo.name = "foo"
         tarinfo.uname = u"&#65533;&#65533;&#65533;"
         self.assertRaises(UnicodeError, tar.addfile, tarinfo)
+        tar.close()
 
     def test_unicode_argument(self):
         tar = tarfile.open(tarname, "r", encoding="iso8859-1", errors="strict")
@@ -1174,6 +1193,7 @@
             tar = tarfile.open(tmpname, format=self.format, encoding="ascii",
                     errors=handler)
             self.assertEqual(tar.getnames()[0], name)
+            tar.close()
 
         self.assertRaises(UnicodeError, tarfile.open, tmpname,
                 encoding="ascii", errors="strict")
@@ -1186,6 +1206,7 @@
         tar = tarfile.open(tmpname, format=self.format, encoding="iso8859-1",
                 errors="utf-8")
         self.assertEqual(tar.getnames()[0], "&#65533;&#65533;&#65533;/" + u"&#65533;".encode("utf8"))
+        tar.close()
 
 
 class AppendTest(unittest.TestCase):
@@ -1213,6 +1234,7 @@
     def _test(self, names=["bar"], fileobj=None):
         tar = tarfile.open(self.tarname, fileobj=fileobj)
         self.assertEqual(tar.getnames(), names)
+        tar.close()
 
     def test_non_existing(self):
         self._add_testfile()
@@ -1231,7 +1253,9 @@
 
     def test_fileobj(self):
         self._create_testtar()
-        data = open(self.tarname).read()
+        f = open(self.tarname)
+        data = f.read()
+        f.close()
         fobj = StringIO.StringIO(data)
         self._add_testfile(fobj)
         fobj.seek(0)
@@ -1257,7 +1281,9 @@
     # Append mode is supposed to fail if the tarfile to append to
     # does not end with a zero block.
     def _test_error(self, data):
-        open(self.tarname, "wb").write(data)
+        f = open(self.tarname, "wb")
+        f.write(data)
+        f.close()
         self.assertRaises(tarfile.ReadError, self._add_testfile)
 
     def test_null(self):
diff --git a/lib-python/modified-2.7/test/test_tempfile.py b/lib-python/modified-2.7/test/test_tempfile.py
--- a/lib-python/modified-2.7/test/test_tempfile.py
+++ b/lib-python/modified-2.7/test/test_tempfile.py
@@ -23,8 +23,8 @@
 
 # TEST_FILES may need to be tweaked for systems depending on the maximum
 # number of files that can be opened at one time (see ulimit -n)
-if sys.platform in ('openbsd3', 'openbsd4'):
-    TEST_FILES = 48
+if sys.platform.startswith("openbsd"):
+    TEST_FILES = 64 # ulimit -n defaults to 128 for normal users
 else:
     TEST_FILES = 100
 
diff --git a/lib_pypy/_ctypes/basics.py b/lib_pypy/_ctypes/basics.py
--- a/lib_pypy/_ctypes/basics.py
+++ b/lib_pypy/_ctypes/basics.py
@@ -48,12 +48,14 @@
             return self.from_param(as_parameter)
 
     def get_ffi_param(self, value):
-        return self.from_param(value)._to_ffi_param()
+        cdata = self.from_param(value)
+        return cdata, cdata._to_ffi_param()
 
     def get_ffi_argtype(self):
         if self._ffiargtype:
             return self._ffiargtype
-        return _shape_to_ffi_type(self._ffiargshape)
+        self._ffiargtype = _shape_to_ffi_type(self._ffiargshape)
+        return self._ffiargtype
 
     def _CData_output(self, resbuffer, base=None, index=-1):
         #assert isinstance(resbuffer, _rawffi.ArrayInstance)
@@ -165,7 +167,8 @@
     return tp._alignmentofinstances()
 
 def byref(cdata):
-    from ctypes import pointer
+    # "pointer" is imported at the end of this module to avoid circular
+    # imports
     return pointer(cdata)
 
 def cdata_from_address(self, address):
@@ -223,5 +226,9 @@
     'Z' : _ffi.types.void_p,
     'X' : _ffi.types.void_p,
     'v' : _ffi.types.sshort,
+    '?' : _ffi.types.ubyte,
     }
 
+
+# used by "byref"
+from _ctypes.pointer import pointer
diff --git a/lib_pypy/_ctypes/function.py b/lib_pypy/_ctypes/function.py
--- a/lib_pypy/_ctypes/function.py
+++ b/lib_pypy/_ctypes/function.py
@@ -78,8 +78,6 @@
     _com_iid = None
     _is_fastpath = False
 
-    __restype_set = False
-
     def _getargtypes(self):
         return self._argtypes_
 
@@ -93,13 +91,15 @@
                     raise TypeError(
                         "item %d in _argtypes_ has no from_param method" % (
                             i + 1,))
-            #
-            if all([hasattr(argtype, '_ffiargshape') for argtype in argtypes]):
-                fastpath_cls = make_fastpath_subclass(self.__class__)
-                fastpath_cls.enable_fastpath_maybe(self)
             self._argtypes_ = list(argtypes)
+            self._check_argtypes_for_fastpath()
     argtypes = property(_getargtypes, _setargtypes)
 
+    def _check_argtypes_for_fastpath(self):
+        if all([hasattr(argtype, '_ffiargshape') for argtype in self._argtypes_]):
+            fastpath_cls = make_fastpath_subclass(self.__class__)
+            fastpath_cls.enable_fastpath_maybe(self)
+
     def _getparamflags(self):
         return self._paramflags
 
@@ -149,7 +149,6 @@
         return self._restype_
 
     def _setrestype(self, restype):
-        self.__restype_set = True
         self._ptr = None
         if restype is int:
             from ctypes import c_int
@@ -219,6 +218,7 @@
                 import ctypes
                 restype = ctypes.c_int
             self._ptr = self._getfuncptr_fromaddress(self._argtypes_, restype)
+            self._check_argtypes_for_fastpath()
             return
 
         
@@ -296,13 +296,12 @@
                     "This function takes %d argument%s (%s given)"
                     % (len(self._argtypes_), plural, len(args)))
 
-            # check that arguments are convertible
-            ## XXX Not as long as ctypes.cast is a callback function with
-            ## py_object arguments...
-            ## self._convert_args(self._argtypes_, args, {})
-
             try:
-                res = self.callable(*args)
+                newargs = self._convert_args_for_callback(argtypes, args)
+            except (UnicodeError, TypeError, ValueError), e:
+                raise ArgumentError(str(e))
+            try:
+                res = self.callable(*newargs)
             except:
                 exc_info = sys.exc_info()
                 traceback.print_tb(exc_info[2], file=sys.stderr)
@@ -316,10 +315,6 @@
             warnings.warn('C function without declared arguments called',
                           RuntimeWarning, stacklevel=2)
             argtypes = []
-            
-        if not self.__restype_set:
-            warnings.warn('C function without declared return type called',
-                          RuntimeWarning, stacklevel=2)
 
         if self._com_index:
             from ctypes import cast, c_void_p, POINTER
@@ -328,12 +323,14 @@
                     "native COM method call without 'this' parameter"
                     )
             thisarg = cast(args[0], POINTER(POINTER(c_void_p)))
-            newargs, argtypes, outargs = self._convert_args(argtypes, args[1:], kwargs)
+            keepalives, newargs, argtypes, outargs = self._convert_args(argtypes,
+                                                                        args[1:], kwargs)
             newargs.insert(0, args[0].value)
             argtypes.insert(0, c_void_p)
         else:
             thisarg = None
-            newargs, argtypes, outargs = self._convert_args(argtypes, args, kwargs)
+            keepalives, newargs, argtypes, outargs = self._convert_args(argtypes,
+                                                                        args, kwargs)
 
         funcptr = self._getfuncptr(argtypes, self._restype_, thisarg)
         result = self._call_funcptr(funcptr, *newargs)
@@ -364,7 +361,10 @@
             if self._flags_ & _rawffi.FUNCFLAG_USE_LASTERROR:
                 set_last_error(_rawffi.get_last_error())
         #
-        return self._build_result(self._restype_, result, newargs)
+        try:
+            return self._build_result(self._restype_, result, newargs)
+        finally:
+            funcptr.free_temp_buffers()
 
     def _do_errcheck(self, result, args):
         # The 'errcheck' protocol
@@ -437,16 +437,15 @@
     @classmethod
     def _conv_param(cls, argtype, arg):
         if isinstance(argtype, _CDataMeta):
-            #arg = argtype.from_param(arg)
-            arg = argtype.get_ffi_param(arg)
-            return arg, argtype
+            cobj, ffiparam = argtype.get_ffi_param(arg)
+            return cobj, ffiparam, argtype
         
         if argtype is not None:
             arg = argtype.from_param(arg)
         if hasattr(arg, '_as_parameter_'):
             arg = arg._as_parameter_
         if isinstance(arg, _CData):
-            return arg._to_ffi_param(), type(arg)
+            return arg, arg._to_ffi_param(), type(arg)
         #
         # non-usual case: we do the import here to save a lot of code in the
         # jit trace of the normal case
@@ -463,11 +462,25 @@
         else:
             raise TypeError("Don't know how to handle %s" % (arg,))
 
-        return cobj._to_ffi_param(), type(cobj)
+        return cobj, cobj._to_ffi_param(), type(cobj)
+
+    def _convert_args_for_callback(self, argtypes, args):
+        assert len(argtypes) == len(args)
+        newargs = []
+        for argtype, arg in zip(argtypes, args):
+            param = argtype.from_param(arg)
+            _type_ = getattr(argtype, '_type_', None)
+            if _type_ == 'P': # special-case for c_void_p
+                param = param._get_buffer_value()
+            elif self._is_primitive(argtype):
+                param = param.value
+            newargs.append(param)
+        return newargs
 
     def _convert_args(self, argtypes, args, kwargs, marker=object()):
         newargs = []
         outargs = []
+        keepalives = []
         newargtypes = []
         total = len(args)
         paramflags = self._paramflags
@@ -495,7 +508,8 @@
                     val = defval
                     if val is marker:
                         val = 0
-                    newarg, newargtype = self._conv_param(argtype, val)
+                    keepalive, newarg, newargtype = self._conv_param(argtype, val)
+                    keepalives.append(keepalive)
                     newargs.append(newarg)
                     newargtypes.append(newargtype)
                 elif flag in (0, PARAMFLAG_FIN):
@@ -511,28 +525,32 @@
                         raise TypeError("required argument '%s' missing" % name)
                     else:
                         raise TypeError("not enough arguments")
-                    newarg, newargtype = self._conv_param(argtype, val)
+                    keepalive, newarg, newargtype = self._conv_param(argtype, val)
+                    keepalives.append(keepalive)
                     newargs.append(newarg)
                     newargtypes.append(newargtype)
                 elif flag == PARAMFLAG_FOUT:
                     if defval is not marker:
                         outargs.append(defval)
-                        newarg, newargtype = self._conv_param(argtype, defval)
+                        keepalive, newarg, newargtype = self._conv_param(argtype, defval)
                     else:
                         import ctypes
                         val = argtype._type_()
                         outargs.append(val)
+                        keepalive = None
                         newarg = ctypes.byref(val)
                         newargtype = type(newarg)
+                    keepalives.append(keepalive)
                     newargs.append(newarg)
                     newargtypes.append(newargtype)
                 else:
                     raise ValueError("paramflag %d not yet implemented" % flag)
             else:
                 try:
-                    newarg, newargtype = self._conv_param(argtype, args[i])
+                    keepalive, newarg, newargtype = self._conv_param(argtype, args[i])
                 except (UnicodeError, TypeError, ValueError), e:
                     raise ArgumentError(str(e))
+                keepalives.append(keepalive)
                 newargs.append(newarg)
                 newargtypes.append(newargtype)
                 inargs_idx += 1
@@ -541,13 +559,17 @@
             extra = args[len(newargs):]
             for i, arg in enumerate(extra):
                 try:
-                    newarg, newargtype = self._conv_param(None, arg)
+                    keepalive, newarg, newargtype = self._conv_param(None, arg)
                 except (UnicodeError, TypeError, ValueError), e:
                     raise ArgumentError(str(e))
+                keepalives.append(keepalive)
                 newargs.append(newarg)
                 newargtypes.append(newargtype)
-        return newargs, newargtypes, outargs
+        return keepalives, newargs, newargtypes, outargs
 
+    @staticmethod
+    def _is_primitive(argtype):
+        return argtype.__bases__[0] is _SimpleCData
     
     def _wrap_result(self, restype, result):
         """
@@ -556,7 +578,7 @@
         """
         # hack for performance: if restype is a "simple" primitive type, don't
         # allocate the buffer because it's going to be thrown away immediately
-        if restype.__bases__[0] is _SimpleCData and not restype._is_pointer_like():
+        if self._is_primitive(restype) and not restype._is_pointer_like():
             return result
         #
         shape = restype._ffishape
@@ -672,7 +694,7 @@
             try:
                 result = self._call_funcptr(funcptr, *args)
                 result = self._do_errcheck(result, args)
-            except (TypeError, ArgumentError): # XXX, should be FFITypeError
+            except (TypeError, ArgumentError, UnicodeDecodeError):
                 assert self._slowpath_allowed
                 return CFuncPtr.__call__(self, *args)
             return result
diff --git a/lib_pypy/_ctypes/primitive.py b/lib_pypy/_ctypes/primitive.py
--- a/lib_pypy/_ctypes/primitive.py
+++ b/lib_pypy/_ctypes/primitive.py
@@ -10,6 +10,8 @@
 from _ctypes.builtin import ConvMode
 from _ctypes.array import Array
 from _ctypes.pointer import _Pointer, as_ffi_pointer
+#from _ctypes.function import CFuncPtr # this import is moved at the bottom
+                                       # because else it's circular
 
 class NULL(object):
     pass
@@ -86,7 +88,7 @@
         return res
     if isinstance(value, Array):
         return value
-    if isinstance(value, _Pointer):
+    if isinstance(value, (_Pointer, CFuncPtr)):
         return cls.from_address(value._buffer.buffer)
     if isinstance(value, (int, long)):
         return cls(value)
@@ -338,3 +340,5 @@
 
     def __nonzero__(self):
         return self._buffer[0] not in (0, '\x00')
+
+from _ctypes.function import CFuncPtr
diff --git a/lib_pypy/_ctypes/structure.py b/lib_pypy/_ctypes/structure.py
--- a/lib_pypy/_ctypes/structure.py
+++ b/lib_pypy/_ctypes/structure.py
@@ -14,6 +14,15 @@
             raise TypeError("Expected CData subclass, got %s" % (tp,))
         if isinstance(tp, StructOrUnionMeta):
             tp._make_final()
+        if len(f) == 3:
+            if (not hasattr(tp, '_type_')
+                or not isinstance(tp._type_, str)
+                or tp._type_ not in "iIhHbBlL"):
+                #XXX: are those all types?
+                #     we just dont get the type name
+                #     in the interp levle thrown TypeError
+                #     from rawffi if there are more
+                raise TypeError('bit fields not allowed for type ' + tp.__name__)
 
     all_fields = []
     for cls in reversed(inspect.getmro(superclass)):
@@ -34,34 +43,37 @@
     for i, field in enumerate(all_fields):
         name = field[0]
         value = field[1]
+        is_bitfield = (len(field) == 3)
         fields[name] = Field(name,
                              self._ffistruct.fieldoffset(name),
                              self._ffistruct.fieldsize(name),
-                             value, i)
+                             value, i, is_bitfield)
 
     if anonymous_fields:
         resnames = []
         for i, field in enumerate(all_fields):
             name = field[0]
             value = field[1]
+            is_bitfield = (len(field) == 3)
             startpos = self._ffistruct.fieldoffset(name)
             if name in anonymous_fields:
                 for subname in value._names:
                     resnames.append(subname)
-                    relpos = startpos + value._fieldtypes[subname].offset
-                    subvalue = value._fieldtypes[subname].ctype
+                    subfield = getattr(value, subname)
+                    relpos = startpos + subfield.offset
+                    subvalue = subfield.ctype
                     fields[subname] = Field(subname,
                                             relpos, subvalue._sizeofinstances(),
-                                            subvalue, i)
+                                            subvalue, i, is_bitfield)
             else:
                 resnames.append(name)
         names = resnames
     self._names = names
-    self._fieldtypes = fields
+    self.__dict__.update(fields)
 
 class Field(object):
-    def __init__(self, name, offset, size, ctype, num):
-        for k in ('name', 'offset', 'size', 'ctype', 'num'):
+    def __init__(self, name, offset, size, ctype, num, is_bitfield):
+        for k in ('name', 'offset', 'size', 'ctype', 'num', 'is_bitfield'):
             self.__dict__[k] = locals()[k]
 
     def __setattr__(self, name, value):
@@ -71,6 +83,35 @@
         return "<Field '%s' offset=%d size=%d>" % (self.name, self.offset,
                                                    self.size)
 
+    def __get__(self, obj, cls=None):
+        if obj is None:
+            return self
+        if self.is_bitfield:
+            # bitfield member, use direct access
+            return obj._buffer.__getattr__(self.name)
+        else:
+            fieldtype = self.ctype
+            offset = self.num
+            suba = obj._subarray(fieldtype, self.name)
+            return fieldtype._CData_output(suba, obj, offset)
+
+
+    def __set__(self, obj, value):
+        fieldtype = self.ctype
+        cobj = fieldtype.from_param(value)
+        if ensure_objects(cobj) is not None:
+            key = keepalive_key(self.num)
+            store_reference(obj, key, cobj._objects)
+        arg = cobj._get_buffer_value()
+        if fieldtype._fficompositesize is not None:
+            from ctypes import memmove
+            dest = obj._buffer.fieldaddress(self.name)
+            memmove(dest, arg, fieldtype._fficompositesize)
+        else:
+            obj._buffer.__setattr__(self.name, arg)
+
+
+
 # ________________________________________________________________
 
 def _set_shape(tp, rawfields, is_union=False):
@@ -79,17 +120,12 @@
     tp._ffiargshape = tp._ffishape = (tp._ffistruct, 1)
     tp._fficompositesize = tp._ffistruct.size
 
-def struct_getattr(self, name):
-    if name not in ('_fields_', '_fieldtypes'):
-        if hasattr(self, '_fieldtypes') and name in self._fieldtypes:
-            return self._fieldtypes[name]
-    return _CDataMeta.__getattribute__(self, name)
 
 def struct_setattr(self, name, value):
     if name == '_fields_':
         if self.__dict__.get('_fields_', None) is not None:
             raise AttributeError("_fields_ is final")
-        if self in [v for k, v in value]:
+        if self in [f[1] for f in value]:
             raise AttributeError("Structure or union cannot contain itself")
         names_and_fields(
             self,
@@ -127,14 +163,14 @@
         if '_fields_' not in self.__dict__:
             self._fields_ = []
             self._names = []
-            self._fieldtypes = {}
             _set_shape(self, [], self._is_union)
 
-    __getattr__ = struct_getattr
     __setattr__ = struct_setattr
 
     def from_address(self, address):
         instance = StructOrUnion.__new__(self)
+        if isinstance(address, _rawffi.StructureInstance):
+            address = address.buffer
         instance.__dict__['_buffer'] = self._ffistruct.fromaddress(address)
         return instance
 
@@ -200,40 +236,6 @@
         A = _rawffi.Array(fieldtype._ffishape)
         return A.fromaddress(address, 1)
 
-    def __setattr__(self, name, value):
-        try:
-            field = self._fieldtypes[name]
-        except KeyError:
-            return _CData.__setattr__(self, name, value)
-        fieldtype = field.ctype
-        cobj = fieldtype.from_param(value)
-        if ensure_objects(cobj) is not None:
-            key = keepalive_key(field.num)
-            store_reference(self, key, cobj._objects)
-        arg = cobj._get_buffer_value()
-        if fieldtype._fficompositesize is not None:
-            from ctypes import memmove
-            dest = self._buffer.fieldaddress(name)
-            memmove(dest, arg, fieldtype._fficompositesize)
-        else:
-            self._buffer.__setattr__(name, arg)
-
-    def __getattribute__(self, name):
-        if name == '_fieldtypes':
-            return _CData.__getattribute__(self, '_fieldtypes')
-        try:
-            field = self._fieldtypes[name]
-        except KeyError:
-            return _CData.__getattribute__(self, name)
-        if field.size >> 16:
-            # bitfield member, use direct access
-            return self._buffer.__getattr__(name)
-        else:
-            fieldtype = field.ctype
-            offset = field.num
-            suba = self._subarray(fieldtype, name)
-            return fieldtype._CData_output(suba, self, offset)
-
     def _get_buffer_for_param(self):
         return self
 
diff --git a/lib_pypy/_elementtree.py b/lib_pypy/_elementtree.py
new file mode 100644
--- /dev/null
+++ b/lib_pypy/_elementtree.py
@@ -0,0 +1,6 @@
+# Just use ElementTree.
+
+from xml.etree import ElementTree
+
+globals().update(ElementTree.__dict__)
+del __all__
diff --git a/lib_pypy/_pypy_interact.py b/lib_pypy/_pypy_interact.py
--- a/lib_pypy/_pypy_interact.py
+++ b/lib_pypy/_pypy_interact.py
@@ -56,6 +56,10 @@
                 prompt = getattr(sys, 'ps1', '>>> ')
             try:
                 line = raw_input(prompt)
+                # Can be None if sys.stdin was redefined
+                encoding = getattr(sys.stdin, 'encoding', None)
+                if encoding and not isinstance(line, unicode):
+                    line = line.decode(encoding)
             except EOFError:
                 console.write("\n")
                 break
diff --git a/lib_pypy/_sqlite3.py b/lib_pypy/_sqlite3.py
--- a/lib_pypy/_sqlite3.py
+++ b/lib_pypy/_sqlite3.py
@@ -24,6 +24,7 @@
 from ctypes import c_void_p, c_int, c_double, c_int64, c_char_p, cdll
 from ctypes import POINTER, byref, string_at, CFUNCTYPE, cast
 from ctypes import sizeof, c_ssize_t
+from collections import OrderedDict
 import datetime
 import sys
 import time
@@ -274,6 +275,28 @@
 def unicode_text_factory(x):
     return unicode(x, 'utf-8')
 
+
+class StatementCache(object):
+    def __init__(self, connection, maxcount):
+        self.connection = connection
+        self.maxcount = maxcount
+        self.cache = OrderedDict()
+
+    def get(self, sql, cursor, row_factory):
+        try:
+            stat = self.cache[sql]
+        except KeyError:
+            stat = Statement(self.connection, sql)
+            self.cache[sql] = stat
+            if len(self.cache) > self.maxcount:
+                self.cache.popitem(0)
+        #
+        if stat.in_use:
+            stat = Statement(self.connection, sql)
+        stat.set_row_factory(row_factory)
+        return stat
+
+
 class Connection(object):
     def __init__(self, database, timeout=5.0, detect_types=0, isolation_level="",
                  check_same_thread=True, factory=None, cached_statements=100):
@@ -291,6 +314,7 @@
         self.row_factory = None
         self._isolation_level = isolation_level
         self.detect_types = detect_types
+        self.statement_cache = StatementCache(self, cached_statements)
 
         self.cursors = []
 
@@ -399,7 +423,7 @@
         cur = Cursor(self)
         if not isinstance(sql, (str, unicode)):
             raise Warning("SQL is of wrong type. Must be string or unicode.")
-        statement = Statement(cur, sql, self.row_factory)
+        statement = self.statement_cache.get(sql, cur, self.row_factory)
         return statement
 
     def _get_isolation_level(self):
@@ -681,6 +705,8 @@
         from sqlite3.dump import _iterdump
         return _iterdump(self)
 
+DML, DQL, DDL = range(3)
+
 class Cursor(object):
     def __init__(self, con):
         if not isinstance(con, Connection):
@@ -708,12 +734,12 @@
         if type(sql) is unicode:
             sql = sql.encode("utf-8")
         self._check_closed()
-        self.statement = Statement(self, sql, self.row_factory)
+        self.statement = self.connection.statement_cache.get(sql, self, self.row_factory)
 
         if self.connection._isolation_level is not None:
-            if self.statement.kind == "DDL":
+            if self.statement.kind == DDL:
                 self.connection.commit()
-            elif self.statement.kind == "DML":
+            elif self.statement.kind == DML:
                 self.connection._begin()
 
         self.statement.set_params(params)
@@ -724,19 +750,18 @@
             self.statement.reset()
             raise self.connection._get_exception(ret)
 
-        if self.statement.kind == "DQL":
-            if ret == SQLITE_ROW:
-                self.statement._build_row_cast_map()
-                self.statement._readahead()
-            else:
-                self.statement.item = None
-                self.statement.exhausted = True
+        if self.statement.kind == DQL and ret == SQLITE_ROW:
+            self.statement._build_row_cast_map()
+            self.statement._readahead(self)
+        else:
+            self.statement.item = None
+            self.statement.exhausted = True
 
-        if self.statement.kind in ("DML", "DDL"):
+        if self.statement.kind == DML or self.statement.kind == DDL:
             self.statement.reset()
 
         self.rowcount = -1
-        if self.statement.kind == "DML":
+        if self.statement.kind == DML:
             self.rowcount = sqlite.sqlite3_changes(self.connection.db)
 
         return self
@@ -747,8 +772,9 @@
         if type(sql) is unicode:
             sql = sql.encode("utf-8")
         self._check_closed()
-        self.statement = Statement(self, sql, self.row_factory)
-        if self.statement.kind == "DML":
+        self.statement = self.connection.statement_cache.get(sql, self, self.row_factory)
+
+        if self.statement.kind == DML:
             self.connection._begin()
         else:
             raise ProgrammingError, "executemany is only for DML statements"
@@ -800,7 +826,7 @@
         return self
 
     def __iter__(self):
-        return self.statement
+        return iter(self.fetchone, None)
 
     def _check_reset(self):
         if self.reset:
@@ -817,7 +843,7 @@
             return None
 
         try:
-            return self.statement.next()
+            return self.statement.next(self)
         except StopIteration:
             return None
 
@@ -831,7 +857,7 @@
         if size is None:
             size = self.arraysize
         lst = []
-        for row in self.statement:
+        for row in self:
             lst.append(row)
             if len(lst) == size:
                 break
@@ -842,7 +868,7 @@
         self._check_reset()
         if self.statement is None:
             return []
-        return list(self.statement)
+        return list(self)
 
     def _getdescription(self):
         if self._description is None:
@@ -872,39 +898,47 @@
     lastrowid = property(_getlastrowid)
 
 class Statement(object):
-    def __init__(self, cur, sql, row_factory):
+    def __init__(self, connection, sql):
         self.statement = None
         if not isinstance(sql, str):
             raise ValueError, "sql must be a string"
-        self.con = cur.connection
-        self.cur = weakref.ref(cur)
+        self.con = connection
         self.sql = sql # DEBUG ONLY
-        self.row_factory = row_factory
         first_word = self._statement_kind = sql.lstrip().split(" ")[0].upper()
         if first_word in ("INSERT", "UPDATE", "DELETE", "REPLACE"):
-            self.kind = "DML"
+            self.kind = DML
         elif first_word in ("SELECT", "PRAGMA"):
-            self.kind = "DQL"
+            self.kind = DQL
         else:
-            self.kind = "DDL"
+            self.kind = DDL
         self.exhausted = False
+        self.in_use = False
+        #
+        # set by set_row_factory
+        self.row_factory = None
 
         self.statement = c_void_p()
         next_char = c_char_p()
-        ret = sqlite.sqlite3_prepare_v2(self.con.db, sql, -1, byref(self.statement), byref(next_char))
+        sql_char = c_char_p(sql)
+        ret = sqlite.sqlite3_prepare_v2(self.con.db, sql_char, -1, byref(self.statement), byref(next_char))
         if ret == SQLITE_OK and self.statement.value is None:
             # an empty statement, we work around that, as it's the least trouble
             ret = sqlite.sqlite3_prepare_v2(self.con.db, "select 42", -1, byref(self.statement), byref(next_char))
-            self.kind = "DQL"
+            self.kind = DQL
 
         if ret != SQLITE_OK:
             raise self.con._get_exception(ret)
         self.con._remember_statement(self)
         if _check_remaining_sql(next_char.value):
-            raise Warning, "One and only one statement required"
+            raise Warning, "One and only one statement required: %r" % (
+                next_char.value,)
+        # sql_char should remain alive until here
 
         self._build_row_cast_map()
 
+    def set_row_factory(self, row_factory):
+        self.row_factory = row_factory
+
     def _build_row_cast_map(self):
         self.row_cast_map = []
         for i in xrange(sqlite.sqlite3_column_count(self.statement)):
@@ -974,6 +1008,7 @@
         ret = sqlite.sqlite3_reset(self.statement)
         if ret != SQLITE_OK:
             raise self.con._get_exception(ret)
+        self.mark_dirty()
 
         if params is None:
             if sqlite.sqlite3_bind_parameter_count(self.statement) != 0:
@@ -1004,10 +1039,7 @@
                     raise ProgrammingError("missing parameter '%s'" %param)
                 self.set_param(idx, param)
 
-    def __iter__(self):
-        return self
-
-    def next(self):
+    def next(self, cursor):
         self.con._check_closed()
         self.con._check_thread()
         if self.exhausted:
@@ -1023,10 +1055,10 @@
             sqlite.sqlite3_reset(self.statement)
             raise exc
 
-        self._readahead()
+        self._readahead(cursor)
         return item
 
-    def _readahead(self):
+    def _readahead(self, cursor):
         self.column_count = sqlite.sqlite3_column_count(self.statement)
         row = []
         for i in xrange(self.column_count):
@@ -1061,23 +1093,30 @@
 
         row = tuple(row)
         if self.row_factory is not None:
-            row = self.row_factory(self.cur(), row)
+            row = self.row_factory(cursor, row)
         self.item = row
 
     def reset(self):
         self.row_cast_map = None
-        return sqlite.sqlite3_reset(self.statement)
+        ret = sqlite.sqlite3_reset(self.statement)
+        self.in_use = False
+        self.exhausted = False
+        return ret
 
     def finalize(self):
         sqlite.sqlite3_finalize(self.statement)
         self.statement = None
+        self.in_use = False
+
+    def mark_dirty(self):
+        self.in_use = True
 
     def __del__(self):
         sqlite.sqlite3_finalize(self.statement)
         self.statement = None
 
     def _get_description(self):
-        if self.kind == "DML":
+        if self.kind == DML:
             return None
         desc = []
         for i in xrange(sqlite.sqlite3_column_count(self.statement)):
diff --git a/lib_pypy/_subprocess.py b/lib_pypy/_subprocess.py
--- a/lib_pypy/_subprocess.py
+++ b/lib_pypy/_subprocess.py
@@ -35,7 +35,7 @@
 _DuplicateHandle.restype = ctypes.c_int
     
 _WaitForSingleObject = _kernel32.WaitForSingleObject
-_WaitForSingleObject.argtypes = [ctypes.c_int, ctypes.c_int]
+_WaitForSingleObject.argtypes = [ctypes.c_int, ctypes.c_uint]
 _WaitForSingleObject.restype = ctypes.c_int
 
 _GetExitCodeProcess = _kernel32.GetExitCodeProcess
diff --git a/lib_pypy/binascii.py b/lib_pypy/binascii.py
--- a/lib_pypy/binascii.py
+++ b/lib_pypy/binascii.py
@@ -659,7 +659,7 @@
         crc = crc_32_tab[(crc ^ long(ord(c))) & 0xffL] ^ (crc >> 8)
         #/* Note:  (crc >> 8) MUST zero fill on left
 
-        result = crc ^ 0xffffffffL
+    result = crc ^ 0xffffffffL
     
     if result > 2**31:
         result = ((result + 2**31) % 2**32) - 2**31
diff --git a/lib_pypy/cPickle.py b/lib_pypy/cPickle.py
--- a/lib_pypy/cPickle.py
+++ b/lib_pypy/cPickle.py
@@ -27,9 +27,9 @@
             PythonPickler.__init__(self, self.__f, args[0], **kw)
         else:
             PythonPickler.__init__(self, *args, **kw)
-            
+
     def memoize(self, obj):
-        self.memo[None] = None   # cPickle starts counting at one
+        self.memo[id(None)] = None   # cPickle starts counting at one
         return PythonPickler.memoize(self, obj)
 
     def getvalue(self):
diff --git a/lib_pypy/distributed/test/test_distributed.py b/lib_pypy/distributed/test/test_distributed.py
--- a/lib_pypy/distributed/test/test_distributed.py
+++ b/lib_pypy/distributed/test/test_distributed.py
@@ -9,7 +9,7 @@
 class AppTestDistributed(object):
     def setup_class(cls):
         cls.space = gettestobjspace(**{"objspace.std.withtproxy": True,
-            "usemodules":("_stackless",)})
+            "usemodules":("_continuation",)})
 
     def test_init(self):
         import distributed
@@ -91,10 +91,8 @@
 
 class AppTestDistributedTasklets(object):
     spaceconfig = {"objspace.std.withtproxy": True,
-                   "objspace.usemodules._stackless": True}
+                   "objspace.usemodules._continuation": True}
     def setup_class(cls):
-        #cls.space = gettestobjspace(**{"objspace.std.withtproxy": True,
-        #    "usemodules":("_stackless",)})
         cls.w_test_env = cls.space.appexec([], """():
         from distributed import test_env
         return test_env
diff --git a/lib_pypy/distributed/test/test_greensock.py b/lib_pypy/distributed/test/test_greensock.py
--- a/lib_pypy/distributed/test/test_greensock.py
+++ b/lib_pypy/distributed/test/test_greensock.py
@@ -10,7 +10,7 @@
         if not option.runappdirect:
             py.test.skip("Cannot run this on top of py.py because of PopenGateway")
         cls.space = gettestobjspace(**{"objspace.std.withtproxy": True,
-                                       "usemodules":("_stackless",)})
+                                       "usemodules":("_continuation",)})
         cls.w_remote_side_code = cls.space.appexec([], """():
         import sys
         sys.path.insert(0, '%s')
diff --git a/lib_pypy/distributed/test/test_socklayer.py b/lib_pypy/distributed/test/test_socklayer.py
--- a/lib_pypy/distributed/test/test_socklayer.py
+++ b/lib_pypy/distributed/test/test_socklayer.py
@@ -9,7 +9,8 @@
 class AppTestSocklayer:
     def setup_class(cls):
         cls.space = gettestobjspace(**{"objspace.std.withtproxy": True,
-                                       "usemodules":("_stackless","_socket", "select")})
+                                       "usemodules":("_continuation",
+                                                     "_socket", "select")})
     
     def test_socklayer(self):
         class X(object):
diff --git a/lib_pypy/greenlet.py b/lib_pypy/greenlet.py
--- a/lib_pypy/greenlet.py
+++ b/lib_pypy/greenlet.py
@@ -1,1 +1,151 @@
-from _stackless import greenlet
+import _continuation, sys
+
+
+# ____________________________________________________________
+# Exceptions
+
+class GreenletExit(Exception):
+    """This special exception does not propagate to the parent greenlet; it
+can be used to kill a single greenlet."""
+
+error = _continuation.error
+
+# ____________________________________________________________
+# Helper function
+
+def getcurrent():
+    "Returns the current greenlet (i.e. the one which called this function)."
+    try:
+        return _tls.current
+    except AttributeError:
+        # first call in this thread: current == main
+        _green_create_main()
+        return _tls.current
+
+# ____________________________________________________________
+# The 'greenlet' class
+
+_continulet = _continuation.continulet
+
+class greenlet(_continulet):
+    getcurrent = staticmethod(getcurrent)
+    error = error
+    GreenletExit = GreenletExit
+    __main = False
+    __started = False
+
+    def __new__(cls, *args, **kwds):
+        self = _continulet.__new__(cls)
+        self.parent = getcurrent()
+        return self
+
+    def __init__(self, run=None, parent=None):
+        if run is not None:
+            self.run = run
+        if parent is not None:
+            self.parent = parent
+
+    def switch(self, *args):
+        "Switch execution to this greenlet, optionally passing the values "
+        "given as argument(s).  Returns the value passed when switching back."
+        return self.__switch(_continulet.switch, args)
+
+    def throw(self, typ=GreenletExit, val=None, tb=None):
+        "raise exception in greenlet, return value passed when switching back"
+        return self.__switch(_continulet.throw, typ, val, tb)
+
+    def __switch(target, unbound_method, *args):
+        current = getcurrent()
+        #
+        while not target:
+            if not target.__started:
+                if unbound_method != _continulet.throw:
+                    greenlet_func = _greenlet_start
+                else:
+                    greenlet_func = _greenlet_throw
+                _continulet.__init__(target, greenlet_func, *args)
+                unbound_method = _continulet.switch
+                args = ()
+                target.__started = True
+                break
+            # already done, go to the parent instead
+            # (NB. infinite loop possible, but unlikely, unless you mess
+            # up the 'parent' explicitly.  Good enough, because a Ctrl-C
+            # will show that the program is caught in this loop here.)
+            target = target.parent
+        #
+        try:
+            if current.__main:
+                if target.__main:
+                    # switch from main to main
+                    if unbound_method == _continulet.throw:
+                        raise args[0], args[1], args[2]
+                    (args,) = args
+                else:
+                    # enter from main to target
+                    args = unbound_method(target, *args)
+            else:
+                if target.__main:
+                    # leave to go to target=main
+                    args = unbound_method(current, *args)
+                else:
+                    # switch from non-main to non-main
+                    args = unbound_method(current, *args, to=target)
+        except GreenletExit, e:
+            args = (e,)
+        finally:
+            _tls.current = current
+        #
+        if len(args) == 1:
+            return args[0]
+        else:
+            return args
+
+    def __nonzero__(self):
+        return self.__main or _continulet.is_pending(self)
+
+    @property
+    def dead(self):
+        return self.__started and not self
+
+    @property
+    def gr_frame(self):
+        raise NotImplementedError("attribute 'gr_frame' of greenlet objects")
+
+# ____________________________________________________________
+# Internal stuff
+
+try:
+    from thread import _local
+except ImportError:
+    class _local(object):    # assume no threads
+        pass
+
+_tls = _local()
+
+def _green_create_main():
+    # create the main greenlet for this thread
+    _tls.current = None
+    gmain = greenlet.__new__(greenlet)
+    gmain._greenlet__main = True
+    gmain._greenlet__started = True
+    assert gmain.parent is None
+    _tls.main = gmain
+    _tls.current = gmain
+
+def _greenlet_start(greenlet, args):
+    _tls.current = greenlet
+    try:
+        res = greenlet.run(*args)
+    finally:
+        if greenlet.parent is not _tls.main:
+            _continuation.permute(greenlet, greenlet.parent)
+    return (res,)
+
+def _greenlet_throw(greenlet, exc, value, tb):
+    _tls.current = greenlet
+    try:
+        raise exc, value, tb
+    finally:
+        if greenlet.parent is not _tls.main:
+            _continuation.permute(greenlet, greenlet.parent)
diff --git a/lib_pypy/pypy_test/test_coroutine.py b/lib_pypy/pypy_test/test_coroutine.py
--- a/lib_pypy/pypy_test/test_coroutine.py
+++ b/lib_pypy/pypy_test/test_coroutine.py
@@ -2,7 +2,7 @@
 from py.test import skip, raises
 
 try:
-    from lib_pypy.stackless import coroutine, CoroutineExit
+    from stackless import coroutine, CoroutineExit
 except ImportError, e:
     skip('cannot import stackless: %s' % (e,))
 
@@ -20,10 +20,6 @@
         assert not co.is_zombie
 
     def test_is_zombie_del_without_frame(self):
-        try:
-            import _stackless # are we on pypy with a stackless build?
-        except ImportError:
-            skip("only works on pypy-c-stackless")
         import gc
         res = []
         class MyCoroutine(coroutine):
@@ -45,10 +41,6 @@
         assert res[0], "is_zombie was False in __del__"
 
     def test_is_zombie_del_with_frame(self):
-        try:
-            import _stackless # are we on pypy with a stackless build?
-        except ImportError:
-            skip("only works on pypy-c-stackless")
         import gc
         res = []
         class MyCoroutine(coroutine):
diff --git a/lib_pypy/pyrepl/reader.py b/lib_pypy/pyrepl/reader.py
--- a/lib_pypy/pyrepl/reader.py
+++ b/lib_pypy/pyrepl/reader.py
@@ -401,13 +401,19 @@
             return "(arg: %s) "%self.arg
         if "\n" in self.buffer:
             if lineno == 0:
-                return self._ps2
+                res = self.ps2
             elif lineno == self.buffer.count("\n"):
-                return self._ps4
+                res = self.ps4
             else:
-                return self._ps3
+                res = self.ps3
         else:
-            return self._ps1
+            res = self.ps1
+        # Lazily call str() on self.psN, and cache the results using as key
+        # the object on which str() was called.  This ensures that even if the
+        # same object is used e.g. for ps1 and ps2, str() is called only once.
+        if res not in self._pscache:
+            self._pscache[res] = str(res)
+        return self._pscache[res]
 
     def push_input_trans(self, itrans):
         self.input_trans_stack.append(self.input_trans)
@@ -473,8 +479,7 @@
             self.pos = 0
             self.dirty = 1
             self.last_command = None
-            self._ps1, self._ps2, self._ps3, self._ps4 = \
-                           map(str, [self.ps1, self.ps2, self.ps3, self.ps4])
+            self._pscache = {}
         except:
             self.restore()
             raise
@@ -571,7 +576,7 @@
         self.console.push_char(char)
         self.handle1(0)
     
-    def readline(self):
+    def readline(self, returns_unicode=False):
         """Read a line.  The implementation of this method also shows
         how to drive Reader if you want more control over the event
         loop."""
@@ -580,6 +585,8 @@
             self.refresh()
             while not self.finished:
                 self.handle1()
+            if returns_unicode:
+                return self.get_unicode()
             return self.get_buffer()
         finally:
             self.restore()
diff --git a/lib_pypy/pyrepl/readline.py b/lib_pypy/pyrepl/readline.py
--- a/lib_pypy/pyrepl/readline.py
+++ b/lib_pypy/pyrepl/readline.py
@@ -33,7 +33,7 @@
 from pyrepl.unix_console import UnixConsole, _error
 
 
-ENCODING = 'latin1'     # XXX hard-coded
+ENCODING = sys.getfilesystemencoding() or 'latin1'     # XXX review
 
 __all__ = ['add_history',
            'clear_history',
@@ -198,7 +198,7 @@
         reader.ps1 = prompt
         return reader.readline()
 
-    def multiline_input(self, more_lines, ps1, ps2):
+    def multiline_input(self, more_lines, ps1, ps2, returns_unicode=False):
         """Read an input on possibly multiple lines, asking for more
         lines as long as 'more_lines(unicodetext)' returns an object whose
         boolean value is true.
@@ -209,7 +209,7 @@
             reader.more_lines = more_lines
             reader.ps1 = reader.ps2 = ps1
             reader.ps3 = reader.ps4 = ps2
-            return reader.readline()
+            return reader.readline(returns_unicode=returns_unicode)
         finally:
             reader.more_lines = saved
 
diff --git a/lib_pypy/pyrepl/simple_interact.py b/lib_pypy/pyrepl/simple_interact.py
--- a/lib_pypy/pyrepl/simple_interact.py
+++ b/lib_pypy/pyrepl/simple_interact.py
@@ -54,7 +54,8 @@
             ps1 = getattr(sys, 'ps1', '>>> ')
             ps2 = getattr(sys, 'ps2', '... ')
             try:
-                statement = multiline_input(more_lines, ps1, ps2)
+                statement = multiline_input(more_lines, ps1, ps2,
+                                            returns_unicode=True)
             except EOFError:
                 break
             more = console.push(statement)
diff --git a/lib_pypy/pyrepl/unix_console.py b/lib_pypy/pyrepl/unix_console.py
--- a/lib_pypy/pyrepl/unix_console.py
+++ b/lib_pypy/pyrepl/unix_console.py
@@ -384,15 +384,19 @@
 
         self.__maybe_write_code(self._smkx)
 
-        self.old_sigwinch = signal.signal(
-            signal.SIGWINCH, self.__sigwinch)
+        try:
+            self.old_sigwinch = signal.signal(
+                signal.SIGWINCH, self.__sigwinch)
+        except ValueError:
+            pass
 
     def restore(self):
         self.__maybe_write_code(self._rmkx)
         self.flushoutput()
         tcsetattr(self.input_fd, termios.TCSADRAIN, self.__svtermstate)
 
-        signal.signal(signal.SIGWINCH, self.old_sigwinch)
+        if hasattr(self, 'old_sigwinch'):
+            signal.signal(signal.SIGWINCH, self.old_sigwinch)
 
     def __sigwinch(self, signum, frame):
         self.height, self.width = self.getheightwidth()
diff --git a/lib_pypy/stackless.py b/lib_pypy/stackless.py
--- a/lib_pypy/stackless.py
+++ b/lib_pypy/stackless.py
@@ -4,121 +4,124 @@
 Please refer to their documentation.
 """
 
-DEBUG = True
-
-def dprint(*args):
-    for arg in args:
-        print arg,
-    print
 
 import traceback
-import sys
+import _continuation
+from functools import partial
+
+class TaskletExit(Exception):
+    pass
+
+CoroutineExit = TaskletExit
+
+class GWrap(_continuation.continulet):
+    """This is just a wrapper around continulet to allow
+       to stick additional attributes to a continulet.
+       To be more concrete, we need a backreference to
+       the coroutine object"""
+
+
+class coroutine(object):
+    "we can't have continulet as a base, because continulets can't be rebound"
+
+    def __init__(self):
+        self._frame = None
+        self.is_zombie = False
+
+    def __getattr__(self, attr):
+        return getattr(self._frame, attr)
+
+    def __del__(self):
+        self.is_zombie = True
+        del self._frame
+        self._frame = None
+
+    def bind(self, func, *argl, **argd):
+        """coro.bind(f, *argl, **argd) -> None.
+           binds function f to coro. f will be called with
+           arguments *argl, **argd
+        """
+        if self._frame is None or not self._frame.is_pending():
+
+            def _func(c, *args, **kwargs):
+                return func(*args, **kwargs)
+            
+            run = partial(_func, *argl, **argd)
+            self._frame = frame = GWrap(run)
+        else:
+            raise ValueError("cannot bind a bound coroutine")
+
+    def switch(self):
+        """coro.switch() -> returnvalue
+           switches to coroutine coro. If the bound function
+           f finishes, the returnvalue is that of f, otherwise
+           None is returned
+        """
+        current = _getcurrent()
+        current._jump_to(self)
+
+    def _jump_to(self, coroutine):
+        _tls.current_coroutine = coroutine
+        self._frame.switch(to=coroutine._frame)
+
+    def kill(self):
+        """coro.kill() : kill coroutine coro"""
+        _tls.current_coroutine = self
+        self._frame.throw(CoroutineExit)
+
+    def _is_alive(self):
+        if self._frame is None:
+            return False
+        return not self._frame.is_pending()
+    is_alive = property(_is_alive)
+    del _is_alive
+
+    def getcurrent():
+        """coroutine.getcurrent() -> the currently running coroutine"""
+        try:
+            return _getcurrent()
+        except AttributeError:
+            return _maincoro
+    getcurrent = staticmethod(getcurrent)
+
+    def __reduce__(self):
+        raise TypeError, 'pickling is not possible based upon continulets'
+
+
+def _getcurrent():
+    "Returns the current coroutine (i.e. the one which called this function)."
+    try:
+        return _tls.current_coroutine
+    except AttributeError:
+        # first call in this thread: current == main
+        _coroutine_create_main()
+        return _tls.current_coroutine
+
 try:
-    # If _stackless can be imported then TaskletExit and CoroutineExit are 
-    # automatically added to the builtins.
-    from _stackless import coroutine, greenlet
-except ImportError: # we are running from CPython
-    from greenlet import greenlet, GreenletExit
-    TaskletExit = CoroutineExit = GreenletExit
-    del GreenletExit
-    try:
-        from functools import partial
-    except ImportError: # we are not running python 2.5
-        class partial(object):
-            # just enough of 'partial' to be usefull
-            def __init__(self, func, *argl, **argd):
-                self.func = func
-                self.argl = argl
-                self.argd = argd
+    from thread import _local
+except ImportError:
+    class _local(object):    # assume no threads
+        pass
 
-            def __call__(self):
-                return self.func(*self.argl, **self.argd)
+_tls = _local()
 
-    class GWrap(greenlet):
-        """This is just a wrapper around greenlets to allow
-           to stick additional attributes to a greenlet.
-           To be more concrete, we need a backreference to
-           the coroutine object"""
+def _coroutine_create_main():
+    # create the main coroutine for this thread
+    _tls.current_coroutine = None
+    main_coroutine = coroutine()
+    main_coroutine.bind(lambda x:x)
+    _tls.main_coroutine = main_coroutine
+    _tls.current_coroutine = main_coroutine
+    return main_coroutine
 
-    class MWrap(object):
-        def __init__(self,something):
-            self.something = something
 
-        def __getattr__(self, attr):
-            return getattr(self.something, attr)
+_maincoro = _coroutine_create_main()
 
-    class coroutine(object):
-        "we can't have greenlet as a base, because greenlets can't be rebound"
-
-        def __init__(self):
-            self._frame = None
-            self.is_zombie = False
-
-        def __getattr__(self, attr):
-            return getattr(self._frame, attr)
-
-        def __del__(self):
-            self.is_zombie = True
-            del self._frame
-            self._frame = None
-
-        def bind(self, func, *argl, **argd):
-            """coro.bind(f, *argl, **argd) -> None.
-               binds function f to coro. f will be called with
-               arguments *argl, **argd
-            """
-            if self._frame is None or self._frame.dead:
-                self._frame = frame = GWrap()
-                frame.coro = self
-            if hasattr(self._frame, 'run') and self._frame.run:
-                raise ValueError("cannot bind a bound coroutine")
-            self._frame.run = partial(func, *argl, **argd)
-
-        def switch(self):
-            """coro.switch() -> returnvalue
-               switches to coroutine coro. If the bound function
-               f finishes, the returnvalue is that of f, otherwise
-               None is returned
-            """
-            try:
-                return greenlet.switch(self._frame)
-            except TypeError, exp: # self._frame is the main coroutine
-                return greenlet.switch(self._frame.something)
-
-        def kill(self):
-            """coro.kill() : kill coroutine coro"""
-            self._frame.throw()
-
-        def _is_alive(self):
-            if self._frame is None:
-                return False
-            return not self._frame.dead
-        is_alive = property(_is_alive)
-        del _is_alive
-
-        def getcurrent():
-            """coroutine.getcurrent() -> the currently running coroutine"""
-            try:
-                return greenlet.getcurrent().coro
-            except AttributeError:
-                return _maincoro
-        getcurrent = staticmethod(getcurrent)
-
-        def __reduce__(self):
-            raise TypeError, 'pickling is not possible based upon greenlets'
-
-    _maincoro = coroutine()
-    maingreenlet = greenlet.getcurrent()
-    _maincoro._frame = frame = MWrap(maingreenlet)
-    frame.coro = _maincoro
-    del frame
-    del maingreenlet
 
 from collections import deque
 
 import operator
-__all__ = 'run getcurrent getmain schedule tasklet channel coroutine \
-                greenlet'.split()
+__all__ = 'run getcurrent getmain schedule tasklet channel coroutine'.split()
 
 _global_task_id = 0
 _squeue = None
@@ -131,7 +134,8 @@
 def _scheduler_remove(value):
     try:
         del _squeue[operator.indexOf(_squeue, value)]
-    except ValueError:pass
+    except ValueError:
+        pass
 
 def _scheduler_append(value, normal=True):
     if normal:
diff --git a/pypy/annotation/builtin.py b/pypy/annotation/builtin.py
--- a/pypy/annotation/builtin.py
+++ b/pypy/annotation/builtin.py
@@ -308,9 +308,6 @@
             clsdef = clsdef.commonbase(cdef)
     return SomeInstance(clsdef)
 
-def robjmodel_we_are_translated():
-    return immutablevalue(True)
-
 def robjmodel_r_dict(s_eqfn, s_hashfn, s_force_non_null=None):
     if s_force_non_null is None:
         force_non_null = False
@@ -376,8 +373,6 @@
 
 BUILTIN_ANALYZERS[pypy.rlib.rarithmetic.intmask] = rarith_intmask
 BUILTIN_ANALYZERS[pypy.rlib.objectmodel.instantiate] = robjmodel_instantiate
-BUILTIN_ANALYZERS[pypy.rlib.objectmodel.we_are_translated] = (
-    robjmodel_we_are_translated)
 BUILTIN_ANALYZERS[pypy.rlib.objectmodel.r_dict] = robjmodel_r_dict
 BUILTIN_ANALYZERS[pypy.rlib.objectmodel.hlinvoke] = robjmodel_hlinvoke
 BUILTIN_ANALYZERS[pypy.rlib.objectmodel.keepalive_until_here] = robjmodel_keepalive_until_here
@@ -416,7 +411,8 @@
 from pypy.annotation.model import SomePtr
 from pypy.rpython.lltypesystem import lltype
 
-def malloc(s_T, s_n=None, s_flavor=None, s_zero=None, s_track_allocation=None):
+def malloc(s_T, s_n=None, s_flavor=None, s_zero=None, s_track_allocation=None,
+           s_add_memory_pressure=None):
     assert (s_n is None or s_n.knowntype == int
             or issubclass(s_n.knowntype, pypy.rlib.rarithmetic.base_int))
     assert s_T.is_constant()
@@ -432,6 +428,8 @@
     else:
         assert s_flavor.is_constant()
         assert s_track_allocation is None or s_track_allocation.is_constant()
+        assert (s_add_memory_pressure is None or
+                s_add_memory_pressure.is_constant())
         # not sure how to call malloc() for the example 'p' in the
         # presence of s_extraargs
         r = SomePtr(lltype.Ptr(s_T.const))
diff --git a/pypy/annotation/description.py b/pypy/annotation/description.py
--- a/pypy/annotation/description.py
+++ b/pypy/annotation/description.py
@@ -399,9 +399,7 @@
                 if b1 is object:
                     continue
                 if b1.__dict__.get('_mixin_', False):
-                    assert b1.__bases__ == () or b1.__bases__ == (object,), (
-                        "mixin class %r should have no base" % (b1,))
-                    self.add_sources_for_class(b1, mixin=True)
+                    self.add_mixin(b1)
                 else:
                     assert base is object, ("multiple inheritance only supported "
                                             "with _mixin_: %r" % (cls,))
@@ -469,6 +467,15 @@
                 return
         self.classdict[name] = Constant(value)
 
+    def add_mixin(self, base):
+        for subbase in base.__bases__:
+            if subbase is object:
+                continue
+            assert subbase.__dict__.get("_mixin_", False), ("Mixin class %r has non"
+                "mixin base class %r" % (base, subbase))
+            self.add_mixin(subbase)
+        self.add_sources_for_class(base, mixin=True)
+
     def add_sources_for_class(self, cls, mixin=False):
         for name, value in cls.__dict__.items():
             self.add_source_attribute(name, value, mixin)
diff --git a/pypy/config/config.py b/pypy/config/config.py
--- a/pypy/config/config.py
+++ b/pypy/config/config.py
@@ -81,6 +81,12 @@
                                  (self.__class__, name))
         return self._cfgimpl_values[name]
 
+    def __dir__(self):
+        from_type = dir(type(self))
+        from_dict = list(self.__dict__)
+        extras = list(self._cfgimpl_values)
+        return sorted(set(extras + from_type + from_dict))
+
     def __delattr__(self, name):
         # XXX if you use delattr you are responsible for all bad things
         # happening
diff --git a/pypy/config/makerestdoc.py b/pypy/config/makerestdoc.py
--- a/pypy/config/makerestdoc.py
+++ b/pypy/config/makerestdoc.py
@@ -134,7 +134,7 @@
         for child in self._children:
             subpath = fullpath + "." + child._name
             toctree.append(subpath)
-        content.add(Directive("toctree", *toctree, maxdepth=4))
+        content.add(Directive("toctree", *toctree, **{'maxdepth': 4}))
         content.join(
             ListItem(Strong("name:"), self._name),
             ListItem(Strong("description:"), self.doc))
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -27,13 +27,14 @@
 # --allworkingmodules
 working_modules = default_modules.copy()
 working_modules.update(dict.fromkeys(
-    ["_socket", "unicodedata", "mmap", "fcntl", "_locale",
+    ["_socket", "unicodedata", "mmap", "fcntl", "_locale", "pwd",
      "rctime" , "select", "zipimport", "_lsprof",
      "crypt", "signal", "_rawffi", "termios", "zlib", "bz2",
      "struct", "_hashlib", "_md5", "_sha", "_minimal_curses", "cStringIO",
      "thread", "itertools", "pyexpat", "_ssl", "cpyext", "array",
      "_bisect", "binascii", "_multiprocessing", '_warnings',
-     "_collections", "_multibytecodec", "micronumpy", "_ffi"]
+     "_collections", "_multibytecodec", "micronumpy", "_ffi",
+     "_continuation"]
 ))
 
 translation_modules = default_modules.copy()
@@ -57,6 +58,7 @@
     # unix only modules
     del working_modules["crypt"]
     del working_modules["fcntl"]
+    del working_modules["pwd"]
     del working_modules["termios"]
     del working_modules["_minimal_curses"]
 
@@ -99,6 +101,7 @@
     "_ssl"      : ["pypy.module._ssl.interp_ssl"],
     "_hashlib"  : ["pypy.module._ssl.interp_ssl"],
     "_minimal_curses": ["pypy.module._minimal_curses.fficurses"],
+    "_continuation": ["pypy.rlib.rstacklet"],
     }
 
 def get_module_validator(modname):
@@ -327,6 +330,9 @@
         BoolOption("mutable_builtintypes",
                    "Allow the changing of builtin types", default=False,
                    requires=[("objspace.std.builtinshortcut", True)]),
+        BoolOption("withidentitydict",
+                   "track types that override __hash__, __eq__ or __cmp__ and use a special dict strategy for those which do not",
+                   default=True),
      ]),
 ])
 
diff --git a/pypy/config/support.py b/pypy/config/support.py
--- a/pypy/config/support.py
+++ b/pypy/config/support.py
@@ -9,7 +9,7 @@
         return 1    # don't override MAKEFLAGS.  This will call 'make' without any '-j' option
     if sys.platform == 'darwin':
         return darwin_get_cpu_count()
-    elif sys.platform != 'linux2':
+    elif not sys.platform.startswith('linux'):
         return 1    # implement me
     try:
         if isinstance(filename_or_file, str):
diff --git a/pypy/config/test/test_config.py b/pypy/config/test/test_config.py
--- a/pypy/config/test/test_config.py
+++ b/pypy/config/test/test_config.py
@@ -1,5 +1,5 @@
 from pypy.config.config import *
-import py
+import py, sys
 
 def make_description():
     gcoption = ChoiceOption('name', 'GC name', ['ref', 'framework'], 'ref')
@@ -63,6 +63,22 @@
     py.test.raises(ConfigError, 'config.gc.name = "ref"')
     config.gc.name = "framework"
 
+def test___dir__():
+    descr = make_description()
+    config = Config(descr, bool=False)
+    attrs = dir(config)
+    assert '__repr__' in attrs        # from the type
+    assert '_cfgimpl_values' in attrs # from self
+    if sys.version_info >= (2, 6):
+        assert 'gc' in attrs              # custom attribute
+        assert 'objspace' in attrs        # custom attribute
+    #
+    attrs = dir(config.gc)
+    if sys.version_info >= (2, 6):
+        assert 'name' in attrs
+        assert 'dummy' in attrs
+        assert 'float' in attrs
+
 def test_arbitrary_option():
     descr = OptionDescription("top", "", [
         ArbitraryOption("a", "no help", default=None)
@@ -265,11 +281,11 @@
 
 def test_underscore_in_option_name():
     descr = OptionDescription("opt", "", [
-        BoolOption("_stackless", "", default=False),
+        BoolOption("_foobar", "", default=False),
     ])
     config = Config(descr)
     parser = to_optparse(config)
-    assert parser.has_option("--_stackless")
+    assert parser.has_option("--_foobar")
 
 def test_none():
     dummy1 = BoolOption('dummy1', 'doc dummy', default=False, cmdline=None)
diff --git a/pypy/config/test/test_support.py b/pypy/config/test/test_support.py
--- a/pypy/config/test/test_support.py
+++ b/pypy/config/test/test_support.py
@@ -40,7 +40,7 @@
         return self._value
 
 def test_cpuinfo_linux():
-    if sys.platform != 'linux2':
+    if not sys.platform.startswith('linux'):
         py.test.skip("linux only")
     saved = os.environ
     try:
diff --git a/pypy/config/translationoption.py b/pypy/config/translationoption.py
--- a/pypy/config/translationoption.py
+++ b/pypy/config/translationoption.py
@@ -13,6 +13,10 @@
 DEFL_LOW_INLINE_THRESHOLD = DEFL_INLINE_THRESHOLD / 2.0
 
 DEFL_GC = "minimark"
+if sys.platform.startswith("linux"):
+    DEFL_ROOTFINDER_WITHJIT = "asmgcc"
+else:
+    DEFL_ROOTFINDER_WITHJIT = "shadowstack"
 
 IS_64_BITS = sys.maxint > 2147483647
 
@@ -24,10 +28,9 @@
 
 translation_optiondescription = OptionDescription(
         "translation", "Translation Options", [
-    BoolOption("stackless", "enable stackless features during compilation",
-               default=False, cmdline="--stackless",
-               requires=[("translation.type_system", "lltype"),
-                         ("translation.gcremovetypeptr", False)]),  # XXX?
+    BoolOption("continuation", "enable single-shot continuations",
+               default=False, cmdline="--continuation",
+               requires=[("translation.type_system", "lltype")]),
     ChoiceOption("type_system", "Type system to use when RTyping",
                  ["lltype", "ootype"], cmdline=None, default="lltype",
                  requires={
@@ -66,7 +69,8 @@
                      "statistics": [("translation.gctransformer", "framework")],
                      "generation": [("translation.gctransformer", "framework")],
                      "hybrid": [("translation.gctransformer", "framework")],
-                     "boehm": [("translation.gctransformer", "boehm")],
+                     "boehm": [("translation.gctransformer", "boehm"),
+                               ("translation.continuation", False)],  # breaks
                      "markcompact": [("translation.gctransformer", "framework")],
                      "minimark": [("translation.gctransformer", "framework")],
                      },
@@ -109,7 +113,7 @@
     BoolOption("jit", "generate a JIT",
                default=False,
                suggests=[("translation.gc", DEFL_GC),
-                         ("translation.gcrootfinder", "asmgcc"),
+                         ("translation.gcrootfinder", DEFL_ROOTFINDER_WITHJIT),
                          ("translation.list_comprehension_operations", True)]),
     ChoiceOption("jit_backend", "choose the backend for the JIT",
                  ["auto", "x86", "x86-without-sse2", "llvm"],
@@ -140,7 +144,10 @@
                  ["annotate", "rtype", "backendopt", "database", "source",
                   "pyjitpl"],
                  default=None, cmdline="--fork-before"),
-
+    BoolOption("dont_write_c_files",
+               "Make the C backend write everyting to /dev/null. " +
+               "Useful for benchmarking, so you don't actually involve the disk",
+               default=False, cmdline="--dont-write-c-files"),
     ArbitraryOption("instrumentctl", "internal",
                default=None),
     StrOption("output", "Output file name", cmdline="--output"),
@@ -382,8 +389,6 @@
             config.translation.suggest(withsmallfuncsets=5)
         elif word == 'jit':
             config.translation.suggest(jit=True)
-            if config.translation.stackless:
-                raise NotImplementedError("JIT conflicts with stackless for now")
         elif word == 'removetypeptr':
             config.translation.suggest(gcremovetypeptr=True)
         else:
diff --git a/pypy/doc/_ref.txt b/pypy/doc/_ref.txt
--- a/pypy/doc/_ref.txt
+++ b/pypy/doc/_ref.txt
@@ -1,11 +1,10 @@
 .. _`ctypes_configure/doc/sample.py`: https://bitbucket.org/pypy/pypy/src/default/ctypes_configure/doc/sample.py
 .. _`demo/`: https://bitbucket.org/pypy/pypy/src/default/demo/
-.. _`demo/pickle_coroutine.py`: https://bitbucket.org/pypy/pypy/src/default/demo/pickle_coroutine.py
 .. _`lib-python/`: https://bitbucket.org/pypy/pypy/src/default/lib-python/
 .. _`lib-python/2.7/dis.py`: https://bitbucket.org/pypy/pypy/src/default/lib-python/2.7/dis.py
 .. _`lib_pypy/`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/
+.. _`lib_pypy/greenlet.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/greenlet.py
 .. _`lib_pypy/pypy_test/`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/pypy_test/
-.. _`lib_pypy/stackless.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/stackless.py
 .. _`lib_pypy/tputil.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/tputil.py
 .. _`pypy/annotation`:
 .. _`pypy/annotation/`: https://bitbucket.org/pypy/pypy/src/default/pypy/annotation/
@@ -55,7 +54,6 @@
 .. _`pypy/module`:
 .. _`pypy/module/`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/
 .. _`pypy/module/__builtin__/__init__.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/__builtin__/__init__.py
-.. _`pypy/module/_stackless/test/test_composable_coroutine.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/_stackless/test/test_composable_coroutine.py
 .. _`pypy/objspace`:
 .. _`pypy/objspace/`: https://bitbucket.org/pypy/pypy/src/default/pypy/objspace/
 .. _`pypy/objspace/dump.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/objspace/dump.py
@@ -117,6 +115,7 @@
 .. _`pypy/translator/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/
 .. _`pypy/translator/backendopt/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/backendopt/
 .. _`pypy/translator/c/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/c/
+.. _`pypy/translator/c/src/stacklet/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/c/src/stacklet/
 .. _`pypy/translator/cli/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/cli/
 .. _`pypy/translator/goal/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/goal/
 .. _`pypy/translator/jvm/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/jvm/
diff --git a/pypy/doc/architecture.rst b/pypy/doc/architecture.rst
--- a/pypy/doc/architecture.rst
+++ b/pypy/doc/architecture.rst
@@ -153,7 +153,7 @@
 
 * Optionally, `various transformations`_ can then be applied which, for
   example, perform optimizations such as inlining, add capabilities
-  such as stackless_-style concurrency, or insert code for the
+  such as stackless-style concurrency (deprecated), or insert code for the
   `garbage collector`_.
 
 * Then, the graphs are converted to source code for the target platform
@@ -255,7 +255,6 @@
 
 .. _Python: http://docs.python.org/reference/
 .. _Psyco: http://psyco.sourceforge.net
-.. _stackless: stackless.html
 .. _`generate Just-In-Time Compilers`: jit/index.html
 .. _`JIT Generation in PyPy`: jit/index.html
 .. _`implement your own interpreter`: http://morepypy.blogspot.com/2011/04/tutorial-writing-interpreter-with-pypy.html
diff --git a/pypy/doc/coding-guide.rst b/pypy/doc/coding-guide.rst
--- a/pypy/doc/coding-guide.rst
+++ b/pypy/doc/coding-guide.rst
@@ -929,6 +929,19 @@
 located in the ``py/bin/`` directory.  For switches to
 modify test execution pass the ``-h`` option.
 
+Coverage reports
+----------------
+
+In order to get coverage reports the `pytest-cov`_ plugin is included.
+it adds some extra requirements ( coverage_ and `cov-core`_ )
+and can once they are installed coverage testing can be invoked via::
+
+  python test_all.py --cov file_or_direcory_to_cover file_or_directory
+
+.. _`pytest-cov`: http://pypi.python.org/pypi/pytest-cov
+.. _`coverage`: http://pypi.python.org/pypi/coverage
+.. _`cov-core`: http://pypi.python.org/pypi/cov-core
+
 Test conventions
 ----------------
 
diff --git a/pypy/doc/conf.py b/pypy/doc/conf.py
--- a/pypy/doc/conf.py
+++ b/pypy/doc/conf.py
@@ -45,9 +45,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '1.5'
+version = '1.6'
 # The full version, including alpha/beta/rc tags.
-release = '1.5'
+release = '1.6'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/pypy/doc/config/objspace.std.withidentitydict.txt b/pypy/doc/config/objspace.std.withidentitydict.txt
new file mode 100644
--- /dev/null
+++ b/pypy/doc/config/objspace.std.withidentitydict.txt
@@ -0,0 +1,21 @@
+=============================
+objspace.std.withidentitydict
+=============================
+
+* **name:** withidentitydict
+
+* **description:** enable a dictionary strategy for "by identity" comparisons
+
+* **command-line:** --objspace-std-withidentitydict
+
+* **command-line for negation:** --no-objspace-std-withidentitydict
+
+* **option type:** boolean option
+
+* **default:** True
+
+
+Enable a dictionary strategy specialized for instances of classes which
+compares "by identity", which is the default unless you override ``__hash__``,
+``__eq__`` or ``__cmp__``.  This strategy will be used only with new-style
+classes.
diff --git a/pypy/doc/config/objspace.usemodules._stackless.txt b/pypy/doc/config/objspace.usemodules._continuation.txt
rename from pypy/doc/config/objspace.usemodules._stackless.txt
rename to pypy/doc/config/objspace.usemodules._continuation.txt
--- a/pypy/doc/config/objspace.usemodules._stackless.txt
+++ b/pypy/doc/config/objspace.usemodules._continuation.txt
@@ -1,6 +1,4 @@
-Use the '_stackless' module. 
+Use the '_continuation' module. 
 
-Exposes the `stackless` primitives, and also implies a stackless build. 
-See also :config:`translation.stackless`.
-
-.. _`stackless`: ../stackless.html
+Exposes the `continulet` app-level primitives.
+See also :config:`translation.continuation`.
diff --git a/pypy/doc/config/objspace.usemodules.pwd.txt b/pypy/doc/config/objspace.usemodules.pwd.txt
new file mode 100644
--- /dev/null
+++ b/pypy/doc/config/objspace.usemodules.pwd.txt
@@ -0,0 +1,2 @@
+Use the 'pwd' module. 
+This module is expected to be fully working.
diff --git a/pypy/doc/config/translation.stackless.txt b/pypy/doc/config/translation.continuation.txt
rename from pypy/doc/config/translation.stackless.txt
rename to pypy/doc/config/translation.continuation.txt
--- a/pypy/doc/config/translation.stackless.txt
+++ b/pypy/doc/config/translation.continuation.txt
@@ -1,5 +1,2 @@
-Run the `stackless transform`_ on each generated graph, which enables the use
-of coroutines at RPython level and the "stackless" module when translating
-PyPy.
-
-.. _`stackless transform`: ../stackless.html
+Enable the use of a stackless-like primitive called "stacklet".
+In PyPy, this is exposed at app-level by the "_continuation" module.
diff --git a/pypy/doc/config/translation.dont_write_c_files.txt b/pypy/doc/config/translation.dont_write_c_files.txt
new file mode 100644
--- /dev/null
+++ b/pypy/doc/config/translation.dont_write_c_files.txt
@@ -0,0 +1,4 @@
+write the generated C files to ``/dev/null`` instead of to the disk. Useful if
+you want to use translate.py as a benchmark and don't want to access the disk.
+
+.. _`translation documentation`: ../translation.html
diff --git a/pypy/doc/config/translation.gc.txt b/pypy/doc/config/translation.gc.txt
--- a/pypy/doc/config/translation.gc.txt
+++ b/pypy/doc/config/translation.gc.txt
@@ -1,4 +1,6 @@
-Choose the Garbage Collector used by the translated program:
+Choose the Garbage Collector used by the translated program.
+The good performing collectors are "hybrid" and "minimark".
+The default is "minimark".
 
   - "ref": reference counting. Takes very long to translate and the result is
     slow.
@@ -11,3 +13,12 @@
     older generation.
 
   - "boehm": use the Boehm conservative GC.
+
+  - "hybrid": a hybrid collector of "generation" together with a
+    mark-n-sweep old space
+
+  - "markcompact": a slow, but memory-efficient collector,
+    influenced e.g. by Smalltalk systems.
+
+  - "minimark": a generational mark-n-sweep collector with good
+    performance.  Includes page marking for large arrays.
diff --git a/pypy/doc/contributor.rst b/pypy/doc/contributor.rst
--- a/pypy/doc/contributor.rst
+++ b/pypy/doc/contributor.rst
@@ -9,22 +9,22 @@
     Armin Rigo
     Maciej Fijalkowski
     Carl Friedrich Bolz
+    Antonio Cuni
     Amaury Forgeot d'Arc
-    Antonio Cuni
     Samuele Pedroni
     Michael Hudson
     Holger Krekel
+    Benjamin Peterson
     Christian Tismer
-    Benjamin Peterson
+    Hakan Ardo
+    Alex Gaynor
     Eric van Riet Paap
-    Anders Chrigstr&#246;m
-    H&#229;kan Ard&#246;
+    Anders Chrigstrom
+    David Schneider
     Richard Emslie
     Dan Villiom Podlaski Christiansen
     Alexander Schremmer
-    Alex Gaynor
-    David Schneider
-    Aureli&#233;n Campeas
+    Aurelien Campeas
     Anders Lehmann
     Camillo Bruni
     Niklaus Haldimann
@@ -35,16 +35,17 @@
     Bartosz Skowron
     Jakub Gustak
     Guido Wesdorp
+    Daniel Roberts
     Adrien Di Mascio
     Laura Creighton
     Ludovic Aubry
     Niko Matsakis
-    Daniel Roberts
     Jason Creighton
-    Jacob Hall&#233;n
+    Jacob Hallen
     Alex Martelli
     Anders Hammarquist
     Jan de Mooij
+    Wim Lavrijsen
     Stephan Diehl
     Michael Foord
     Stefan Schwarzer
@@ -55,9 +56,13 @@
     Alexandre Fayolle
     Marius Gedminas
     Simon Burton
+    Justin Peel
     Jean-Paul Calderone
     John Witulski
+    Lukas Diekmann
+    holger krekel
     Wim Lavrijsen
+    Dario Bertini
     Andreas St&#252;hrk
     Jean-Philippe St. Pierre
     Guido van Rossum
@@ -69,15 +74,16 @@
     Georg Brandl
     Gerald Klix
     Wanja Saatkamp
+    Ronny Pfannschmidt
     Boris Feigin
     Oscar Nierstrasz
-    Dario Bertini
     David Malcolm
     Eugene Oden
     Henry Mason
+    Sven Hager
     Lukas Renggli
+    Ilya Osadchiy
     Guenter Jantzen
-    Ronny Pfannschmidt
     Bert Freudenberg
     Amit Regmi
     Ben Young
@@ -94,8 +100,8 @@
     Jared Grubb
     Karl Bartel
     Gabriel Lavoie
+    Victor Stinner
     Brian Dorsey
-    Victor Stinner
     Stuart Williams
     Toby Watson
     Antoine Pitrou
@@ -106,19 +112,23 @@
     Jonathan David Riehl
     Elmo M&#228;ntynen
     Anders Qvist
-    Beatrice D&#252;ring
+    Beatrice During
     Alexander Sedov
+    Timo Paulssen
+    Corbin Simpson
     Vincent Legoll
+    Romain Guillebert
     Alan McIntyre
-    Romain Guillebert
     Alex Perry
     Jens-Uwe Mager
+    Simon Cross
     Dan Stromberg
-    Lukas Diekmann
+    Guillebert Romain
     Carl Meyer
     Pieter Zieschang
     Alejandro J. Cura
     Sylvain Thenault
+    Christoph Gerum
     Travis Francis Athougies
     Henrik Vendelbo
     Lutz Paelike
@@ -129,6 +139,7 @@
     Miguel de Val Borro
     Ignas Mikalajunas
     Artur Lisiecki
+    Philip Jenvey
     Joshua Gilbert
     Godefroid Chappelle
     Yusei Tahara
@@ -137,24 +148,29 @@
     Gustavo Niemeyer
     William Leslie
     Akira Li
-    Kristj&#225;n Valur J&#243;nsson
+    Kristjan Valur Jonsson
     Bobby Impollonia
+    Michael Hudson-Doyle
     Andrew Thompson
     Anders Sigfridsson
+    Floris Bruynooghe
     Jacek Generowicz
     Dan Colish
-    Sven Hager
     Zooko Wilcox-O Hearn
+    Dan Villiom Podlaski Christiansen
     Anders Hammarquist
+    Chris Lambacher
     Dinu Gherman
     Dan Colish
+    Brett Cannon
     Daniel Neuh&#228;user
     Michael Chermside
     Konrad Delong
     Anna Ravencroft
     Greg Price
     Armin Ronacher
+    Christian Muirhead
     Jim Baker
-    Philip Jenvey
     Rodrigo Ara&#250;jo
+    Romain Guillebert
 
diff --git a/pypy/doc/cpython_differences.rst b/pypy/doc/cpython_differences.rst
--- a/pypy/doc/cpython_differences.rst
+++ b/pypy/doc/cpython_differences.rst
@@ -24,6 +24,7 @@
     _bisect
     _codecs
     _collections
+    `_continuation`_
     `_ffi`_
     _hashlib
     _io
@@ -84,9 +85,12 @@
 
     _winreg
 
-  Extra module with Stackless_ only:
-
-    _stackless
+  Note that only some of these modules are built-in in a typical
+  CPython installation, and the rest is from non built-in extension
+  modules.  This means that e.g. ``import parser`` will, on CPython,
+  find a local file ``parser.py``, while ``import sys`` will not find a
+  local file ``sys.py``.  In PyPy the difference does not exist: all
+  these modules are built-in.
 
 * Supported by being rewritten in pure Python (possibly using ``ctypes``):
   see the `lib_pypy/`_ directory.  Examples of modules that we
@@ -101,11 +105,11 @@
 
 .. the nonstandard modules are listed below...
 .. _`__pypy__`: __pypy__-module.html
+.. _`_continuation`: stackless.html
 .. _`_ffi`: ctypes-implementation.html
 .. _`_rawffi`: ctypes-implementation.html
 .. _`_minimal_curses`: config/objspace.usemodules._minimal_curses.html
 .. _`cpyext`: http://morepypy.blogspot.com/2010/04/using-cpython-extension-modules-with.html
-.. _Stackless: stackless.html
 
 
 Differences related to garbage collection strategies
@@ -211,6 +215,38 @@
     >>>> print d1['a']
     42
 
+Mutating classes of objects which are already used as dictionary keys
+---------------------------------------------------------------------
+
+Consider the following snippet of code::
+
+    class X(object):
+        pass
+
+    def __evil_eq__(self, other):
+        print 'hello world'
+        return False
+
+    def evil(y):
+        d = {x(): 1}
+        X.__eq__ = __evil_eq__
+        d[y] # might trigger a call to __eq__?
+
+In CPython, __evil_eq__ **might** be called, although there is no way to write
+a test which reliably calls it.  It happens if ``y is not x`` and ``hash(y) ==
+hash(x)``, where ``hash(x)`` is computed when ``x`` is inserted into the
+dictionary.  If **by chance** the condition is satisfied, then ``__evil_eq__``
+is called.
+
+PyPy uses a special strategy to optimize dictionaries whose keys are instances
+of user-defined classes which do not override the default ``__hash__``,
+``__eq__`` and ``__cmp__``: when using this strategy, ``__eq__`` and
+``__cmp__`` are never called, but instead the lookup is done by identity, so
+in the case above it is guaranteed that ``__eq__`` won't be called.
+
+Note that in all other cases (e.g., if you have a custom ``__hash__`` and
+``__eq__`` in ``y``) the behavior is exactly the same as CPython.
+
 
 Ignored exceptions
 -----------------------
@@ -248,7 +284,14 @@
   never a dictionary as it sometimes is in CPython. Assigning to
   ``__builtins__`` has no effect.
 
-* object identity of immutable keys in dictionaries is not necessarily preserved.
-  Never compare immutable objects with ``is``.
+* Do not compare immutable objects with ``is``.  For example on CPython
+  it is true that ``x is 0`` works, i.e. does the same as ``type(x) is
+  int and x == 0``, but it is so by accident.  If you do instead
+  ``x is 1000``, then it stops working, because 1000 is too large and
+  doesn't come from the internal cache.  In PyPy it fails to work in
+  both cases, because we have no need for a cache at all.
+
+* Also, object identity of immutable keys in dictionaries is not necessarily
+  preserved.
 
 .. include:: _ref.txt
diff --git a/pypy/doc/extending.rst b/pypy/doc/extending.rst
--- a/pypy/doc/extending.rst
+++ b/pypy/doc/extending.rst
@@ -19,12 +19,12 @@
   section
 
 * Write them in pure python and use direct libffi low-level bindings, See
-  \_rawffi_ module description.
+  \_ffi_ module description.
 
 * Write them in RPython as mixedmodule_, using *rffi* as bindings.
 
 .. _ctypes: #CTypes
-.. _\_rawffi: #LibFFI
+.. _\_ffi: #LibFFI
 .. _mixedmodule: #Mixed Modules
 
 CTypes
@@ -42,41 +42,50 @@
 platform-dependent details (compiling small snippets of C code and running
 them), so it'll benefit not pypy-related ctypes-based modules as well.
 
+ctypes call are optimized by the JIT and the resulting machine code contains a
+direct call to the target C function.  However, due to the very dynamic nature
+of ctypes, some overhead over a bare C call is still present, in particular to
+check/convert the types of the parameters.  Moreover, even if most calls are
+optimized, some cannot and thus need to follow the slow path, not optimized by
+the JIT.
+
 .. _`ctypes-configure`: ctypes-implementation.html#ctypes-configure
+.. _`CPython ctypes`: http://docs.python.org/library/ctypes.html
 
 Pros
 ----
 
-Stable, CPython-compatible API
+Stable, CPython-compatible API.  Most calls are fast, optimized by JIT.
 
 Cons
 ----
 
-Only pure-python code (slow), problems with platform-dependency (although
-we partially solve those). PyPy implementation is now very slow.
+Problems with platform-dependency (although we partially solve
+those). Although the JIT optimizes ctypes calls, some overhead is still
+present.  The slow-path is very slow.
 
-_`CPython ctypes`: http://python.net/crew/theller/ctypes/
 
 LibFFI
 ======
 
 Mostly in order to be able to write a ctypes module, we developed a very
-low-level libffi bindings. (libffi is a C-level library for dynamic calling,
+low-level libffi bindings called ``_ffi``. (libffi is a C-level library for dynamic calling,
 which is used by CPython ctypes). This library provides stable and usable API,
 although it's API is a very low-level one. It does not contain any
-magic.
+magic.  It is also optimized by the JIT, but has much less overhead than ctypes.
 
 Pros
 ----
 
-Works. Combines disadvantages of using ctypes with disadvantages of
-using mixed modules. Probably more suitable for a delicate code
-where ctypes magic goes in a way.
+It Works. Probably more suitable for a delicate code where ctypes magic goes
+in a way.  All calls are optimized by the JIT, there is no slow path as in
+ctypes.
 
 Cons
 ----
 
-Slow. CPython-incompatible API, very rough and low-level
+It combines disadvantages of using ctypes with disadvantages of using mixed
+modules. CPython-incompatible API, very rough and low-level.
 
 Mixed Modules
 =============
@@ -87,15 +96,15 @@
 * a mixed module needs to be written in RPython, which is far more
   complicated than Python (XXX link)
 
-* due to lack of separate compilation (as of April 2008), each
+* due to lack of separate compilation (as of July 2011), each
   compilation-check requires to recompile whole PyPy python interpreter,
   which takes 0.5-1h. We plan to solve this at some point in near future.
 
 * although rpython is a garbage-collected language, the border between
   C and RPython needs to be managed by hand (each object that goes into the
-  C level must be explicitly freed) XXX we try to solve this
+  C level must be explicitly freed).
 
-Some document is available `here`_
+Some documentation is available `here`_
 
 .. _`here`: rffi.html
 
diff --git a/pypy/doc/faq.rst b/pypy/doc/faq.rst
--- a/pypy/doc/faq.rst
+++ b/pypy/doc/faq.rst
@@ -315,6 +315,28 @@
 
 .. _`Andrew Brown's tutorial`: http://morepypy.blogspot.com/2011/04/tutorial-writing-interpreter-with-pypy.html
 
+---------------------------------------------------------
+Can RPython modules for PyPy be translated independently?
+---------------------------------------------------------
+
+No, you have to rebuild the entire interpreter.  This means two things:
+
+* It is imperative to use test-driven development.  You have to test
+  exhaustively your module in pure Python, before even attempting to
+  translate it.  Once you translate it, you should have only a few typing
+  issues left to fix, but otherwise the result should work out of the box.
+
+* Second, and perhaps most important: do you have a really good reason
+  for writing the module in RPython in the first place?  Nowadays you
+  should really look at alternatives, like writing it in pure Python,
+  using ctypes if it needs to call C code.  Other alternatives are being
+  developed too (as of summer 2011), like a Cython binding.
+
+In this context it is not that important to be able to translate
+RPython modules independently of translating the complete interpreter.
+(It could be done given enough efforts, but it's a really serious
+undertaking.  Consider it as quite unlikely for now.)
+
 ----------------------------------------------------------
 Why does PyPy draw a Mandelbrot fractal while translating?
 ----------------------------------------------------------
diff --git a/pypy/doc/garbage_collection.rst b/pypy/doc/garbage_collection.rst
--- a/pypy/doc/garbage_collection.rst
+++ b/pypy/doc/garbage_collection.rst
@@ -147,7 +147,7 @@
 You can read more about them at the start of
 `pypy/rpython/memory/gc/minimark.py`_.
 
-In more details:
+In more detail:
 
 - The small newly malloced objects are allocated in the nursery (case 1).
   All objects living in the nursery are "young".
diff --git a/pypy/doc/getting-started-python.rst b/pypy/doc/getting-started-python.rst
--- a/pypy/doc/getting-started-python.rst
+++ b/pypy/doc/getting-started-python.rst
@@ -32,7 +32,10 @@
 .. _`windows document`: windows.html
 
 You can translate the whole of PyPy's Python interpreter to low level C code,
-or `CLI code`_.
+or `CLI code`_.  If you intend to build using gcc, check to make sure that
+the version you have is not 4.2 or you will run into `this bug`_.
+
+.. _`this bug`: https://bugs.launchpad.net/ubuntu/+source/gcc-4.2/+bug/187391
 
 1. First `download a pre-built PyPy`_ for your architecture which you will
    use to translate your Python interpreter.  It is, of course, possible to
@@ -64,7 +67,6 @@
    * ``libssl-dev`` (for the optional ``_ssl`` module)
    * ``libgc-dev`` (for the Boehm garbage collector: only needed when translating with `--opt=0, 1` or `size`)
    * ``python-sphinx`` (for the optional documentation build.  You need version 1.0.7 or later)
-   * ``python-greenlet`` (for the optional stackless support in interpreted mode/testing)
 
 
 3. Translation is time-consuming -- 45 minutes on a very fast machine --
@@ -102,7 +104,7 @@
 
     $ ./pypy-c
     Python 2.7.0 (61ef2a11b56a, Mar 02 2011, 03:00:11)
-    [PyPy 1.5.0-alpha0 with GCC 4.4.3] on linux2
+    [PyPy 1.6.0 with GCC 4.4.3] on linux2
     Type "help", "copyright", "credits" or "license" for more information.
     And now for something completely different: ``this sentence is false''
     >>>> 46 - 4
@@ -117,19 +119,8 @@
 Installation_ below.
 
 The ``translate.py`` script takes a very large number of options controlling
-what to translate and how.  See ``translate.py -h``. Some of the more
-interesting options (but for now incompatible with the JIT) are:
-
-   * ``--stackless``: this produces a pypy-c that includes features
-     inspired by `Stackless Python <http://www.stackless.com>`__.
-
-   * ``--gc=boehm|ref|marknsweep|semispace|generation|hybrid|minimark``:
-     choose between using
-     the `Boehm-Demers-Weiser garbage collector`_, our reference
-     counting implementation or one of own collector implementations
-     (the default depends on the optimization level but is usually
-     ``minimark``).
-
+what to translate and how.  See ``translate.py -h``. The default options
+should be suitable for mostly everybody by now.
 Find a more detailed description of the various options in our `configuration
 sections`_.
 
@@ -162,7 +153,7 @@
 
     $ ./pypy-cli
     Python 2.7.0 (61ef2a11b56a, Mar 02 2011, 03:00:11)
-    [PyPy 1.5.0-alpha0] on linux2
+    [PyPy 1.6.0] on linux2
     Type "help", "copyright", "credits" or "license" for more information.
     And now for something completely different: ``distopian and utopian chairs''
     >>>> 
@@ -199,7 +190,7 @@
 
         $ ./pypy-jvm 
         Python 2.7.0 (61ef2a11b56a, Mar 02 2011, 03:00:11)
-        [PyPy 1.5.0-alpha0] on linux2
+        [PyPy 1.6.0] on linux2
         Type "help", "copyright", "credits" or "license" for more information.
         And now for something completely different: ``# assert did not crash''
         >>>> 
@@ -238,7 +229,7 @@
 the ``bin/pypy`` executable.
 
 To install PyPy system wide on unix-like systems, it is recommended to put the
-whole hierarchy alone (e.g. in ``/opt/pypy1.5``) and put a symlink to the
+whole hierarchy alone (e.g. in ``/opt/pypy1.6``) and put a symlink to the
 ``pypy`` executable into ``/usr/bin`` or ``/usr/local/bin``
 
 If the executable fails to find suitable libraries, it will report
diff --git a/pypy/doc/getting-started.rst b/pypy/doc/getting-started.rst
--- a/pypy/doc/getting-started.rst
+++ b/pypy/doc/getting-started.rst
@@ -53,11 +53,11 @@
 PyPy is ready to be executed as soon as you unpack the tarball or the zip
 file, with no need to install it in any specific location::
 
-    $ tar xf pypy-1.5-linux.tar.bz2
+    $ tar xf pypy-1.6-linux.tar.bz2
 
-    $ ./pypy-1.5-linux/bin/pypy
+    $ ./pypy-1.6/bin/pypy
     Python 2.7.1 (?, Apr 27 2011, 12:44:21)
-    [PyPy 1.5.0-alpha0 with GCC 4.4.3] on linux2
+    [PyPy 1.6.0 with GCC 4.4.3] on linux2
     Type "help", "copyright", "credits" or "license" for more information.
     And now for something completely different: ``implementing LOGO in LOGO:
     "turtles all the way down"''
@@ -73,16 +73,16 @@
 
     $ curl -O http://python-distribute.org/distribute_setup.py
 
-    $ curl -O https://github.com/pypa/pip/raw/master/contrib/get-pip.py
+    $ curl -O https://raw.github.com/pypa/pip/master/contrib/get-pip.py
 
-    $ ./pypy-1.5-linux/bin/pypy distribute_setup.py
+    $ ./pypy-1.6/bin/pypy distribute_setup.py
 
-    $ ./pypy-1.5-linux/bin/pypy get-pip.py
+    $ ./pypy-1.6/bin/pypy get-pip.py
 
-    $ ./pypy-1.5-linux/bin/pip install pygments  # for example
+    $ ./pypy-1.6/bin/pip install pygments  # for example
 
-3rd party libraries will be installed in ``pypy-1.5-linux/site-packages``, and
-the scripts in ``pypy-1.5-linux/bin``.
+3rd party libraries will be installed in ``pypy-1.6/site-packages``, and
+the scripts in ``pypy-1.6/bin``.
 
 Installing using virtualenv
 ---------------------------
diff --git a/pypy/doc/how-to-release.rst b/pypy/doc/how-to-release.rst
--- a/pypy/doc/how-to-release.rst
+++ b/pypy/doc/how-to-release.rst
@@ -21,8 +21,8 @@
 Release Steps
 ----------------
 
-* at code freeze make a release branch under
-  http://codepeak.net/svn/pypy/release/x.y(.z). IMPORTANT: bump the
+* at code freeze make a release branch using release-x.x.x in mercurial
+  IMPORTANT: bump the
   pypy version number in module/sys/version.py and in
   module/cpyext/include/patchlevel.h, notice that the branch
   will capture the revision number of this change for the release;
@@ -42,18 +42,11 @@
     JIT: windows, linux, os/x
     no JIT: windows, linux, os/x
     sandbox: linux, os/x
-    stackless: windows, linux, os/x
 
 * write release announcement pypy/doc/release-x.y(.z).txt
   the release announcement should contain a direct link to the download page
 * update pypy.org (under extradoc/pypy.org), rebuild and commit
 
-* update http://codespeak.net/pypy/trunk:
-   code0> + chmod -R yourname:users /www/codespeak.net/htdocs/pypy/trunk
-   local> cd ..../pypy/doc && py.test
-   local> cd ..../pypy
-   local> rsync -az doc codespeak.net:/www/codespeak.net/htdocs/pypy/trunk/pypy/
-
 * post announcement on morepypy.blogspot.com
 * send announcements to pypy-dev, python-list,
   python-announce, python-dev ...
diff --git a/pypy/doc/index-of-release-notes.rst b/pypy/doc/index-of-release-notes.rst
--- a/pypy/doc/index-of-release-notes.rst
+++ b/pypy/doc/index-of-release-notes.rst
@@ -16,3 +16,4 @@
    release-1.4.0beta.rst
    release-1.4.1.rst
    release-1.5.0.rst
+   release-1.6.0.rst
diff --git a/pypy/doc/index.rst b/pypy/doc/index.rst
--- a/pypy/doc/index.rst
+++ b/pypy/doc/index.rst
@@ -15,7 +15,7 @@
 
 * `FAQ`_: some frequently asked questions.
 
-* `Release 1.5`_: the latest official release
+* `Release 1.6`_: the latest official release
 
 * `PyPy Blog`_: news and status info about PyPy 
 
@@ -35,7 +35,7 @@
 
   * `Differences between PyPy and CPython`_
   * `What PyPy can do for your objects`_
-  * `Stackless and coroutines`_
+  * `Continulets and greenlets`_
   * `JIT Generation in PyPy`_ 
   * `Sandboxing Python code`_
 
@@ -77,7 +77,7 @@
 .. _`Getting Started`: getting-started.html
 .. _`Papers`: extradoc.html
 .. _`Videos`: video-index.html
-.. _`Release 1.5`: http://pypy.org/download.html
+.. _`Release 1.6`: http://pypy.org/download.html
 .. _`speed.pypy.org`: http://speed.pypy.org
 .. _`RPython toolchain`: translation.html
 .. _`potential project ideas`: project-ideas.html
@@ -122,9 +122,9 @@
 Windows, on top of .NET, and on top of Java.
 To dig into PyPy it is recommended to try out the current
 Mercurial default branch, which is always working or mostly working,
-instead of the latest release, which is `1.5`__.
+instead of the latest release, which is `1.6`__.
 
-.. __: release-1.5.0.html
+.. __: release-1.6.0.html
 
 PyPy is mainly developed on Linux and Mac OS X.  Windows is supported,
 but platform-specific bugs tend to take longer before we notice and fix
@@ -292,8 +292,6 @@
 
 `pypy/translator/jvm/`_            the Java backend
 
-`pypy/translator/stackless/`_      the `Stackless Transform`_
-
 `pypy/translator/tool/`_           helper tools for translation, including the Pygame
                                    `graph viewer`_
 
@@ -318,7 +316,7 @@
 .. _`transparent proxies`: objspace-proxies.html#tproxy
 .. _`Differences between PyPy and CPython`: cpython_differences.html
 .. _`What PyPy can do for your objects`: objspace-proxies.html
-.. _`Stackless and coroutines`: stackless.html
+.. _`Continulets and greenlets`: stackless.html
 .. _StdObjSpace: objspace.html#the-standard-object-space 
 .. _`abstract interpretation`: http://en.wikipedia.org/wiki/Abstract_interpretation
 .. _`rpython`: coding-guide.html#rpython 
@@ -337,7 +335,6 @@
 .. _`low-level type system`: rtyper.html#low-level-type
 .. _`object-oriented type system`: rtyper.html#oo-type
 .. _`garbage collector`: garbage_collection.html
-.. _`Stackless Transform`: translation.html#the-stackless-transform
 .. _`main PyPy-translation scripts`: getting-started-python.html#translating-the-pypy-python-interpreter
 .. _`.NET`: http://www.microsoft.com/net/
 .. _Mono: http://www.mono-project.com/
diff --git a/pypy/doc/interpreter-optimizations.rst b/pypy/doc/interpreter-optimizations.rst
--- a/pypy/doc/interpreter-optimizations.rst
+++ b/pypy/doc/interpreter-optimizations.rst
@@ -263,34 +263,6 @@
 You can enable this feature with the :config:`objspace.opcodes.CALL_METHOD`
 option.
 
-.. _`call likely builtin`:
-
-CALL_LIKELY_BUILTIN
-+++++++++++++++++++
-
-A often heard "tip" for speeding up Python programs is to give an often used
-builtin a local name, since local lookups are faster than lookups of builtins,
-which involve doing two dictionary lookups: one in the globals dictionary and
-one in the the builtins dictionary. PyPy approaches this problem at the
-implementation level, with the introduction of the new ``CALL_LIKELY_BUILTIN``
-bytecode. This bytecode is produced by the compiler for a call whose target is
-the name of a builtin.  Since such a syntactic construct is very often actually
-invoking the expected builtin at run-time, this information can be used to make
-the call to the builtin directly, without going through any dictionary lookup.
-
-However, it can occur that the name is shadowed by a global name from the
-current module.  To catch this case, a special dictionary implementation for
-multidicts is introduced, which is used for the dictionaries of modules. This
-implementation keeps track which builtin name is shadowed by it.  The
-``CALL_LIKELY_BUILTIN`` bytecode asks the dictionary whether it is shadowing the
-builtin that is about to be called and asks the dictionary of ``__builtin__``
-whether the original builtin was changed.  These two checks are cheaper than
-full lookups.  In the common case, neither of these cases is true, so the
-builtin can be directly invoked.
-
-You can enable this feature with the
-:config:`objspace.opcodes.CALL_LIKELY_BUILTIN` option.
-
 .. more here?
 
 Overall Effects
diff --git a/pypy/doc/jit/pyjitpl5.rst b/pypy/doc/jit/pyjitpl5.rst
--- a/pypy/doc/jit/pyjitpl5.rst
+++ b/pypy/doc/jit/pyjitpl5.rst
@@ -103,7 +103,7 @@
 
 The meta-interpreter starts interpreting the JIT bytecode.  Each operation is
 executed and then recorded in a list of operations, called the trace.
-Operations can have a list of boxes that operate on, arguments.  Some operations
+Operations can have a list of boxes they operate on, arguments.  Some operations
 (like GETFIELD and GETARRAYITEM) also have special objects that describe how
 their arguments are laid out in memory.  All possible operations generated by
 tracing are listed in metainterp/resoperation.py.  When a (interpreter-level)
diff --git a/pypy/doc/project-ideas.rst b/pypy/doc/project-ideas.rst
--- a/pypy/doc/project-ideas.rst
+++ b/pypy/doc/project-ideas.rst
@@ -48,12 +48,6 @@
 
 .. image:: image/jitviewer.png
 
-We would like to add one level to this hierarchy, by showing the generated
-machine code for each jit operation.  The necessary information is already in
-the log file produced by the JIT, so it is "only" a matter of teaching the
-jitviewer to display it.  Ideally, the machine code should be hidden by
-default and viewable on request.
-
 The jitviewer is a web application based on flask and jinja2 (and jQuery on
 the client): if you have great web developing skills and want to help PyPy,
 this is an ideal task to get started, because it does not require any deep
diff --git a/pypy/doc/release-1.6.0.rst b/pypy/doc/release-1.6.0.rst
new file mode 100644
--- /dev/null
+++ b/pypy/doc/release-1.6.0.rst
@@ -0,0 +1,95 @@
+========================
+PyPy 1.6 - kickass panda
+========================
+
+We're pleased to announce the 1.6 release of PyPy. This release brings a lot
+of bugfixes and performance improvements over 1.5, and improves support for
+Windows 32bit and OS X 64bit. This version fully implements Python 2.7.1 and
+has beta level support for loading CPython C extensions.  You can download it
+here:
+
+    http://pypy.org/download.html
+
+What is PyPy?
+=============
+
+PyPy is a very compliant Python interpreter, almost a drop-in replacement for
+CPython 2.7.1. It's fast (`pypy 1.5 and cpython 2.6.2`_ performance comparison)
+due to its integrated tracing JIT compiler.
+
+This release supports x86 machines running Linux 32/64 or Mac OS X.  Windows 32
+is beta (it roughly works but a lot of small issues have not been fixed so
+far).  Windows 64 is not yet supported.
+
+The main topics of this release are speed and stability: on average on
+our benchmark suite, PyPy 1.6 is between **20% and 30%** faster than PyPy 1.5,
+which was already much faster than CPython on our set of benchmarks.
+
+The speed improvements have been made possible by optimizing many of the
+layers which compose PyPy.  In particular, we improved: the Garbage Collector,
+the JIT warmup time, the optimizations performed by the JIT, the quality of
+the generated machine code and the implementation of our Python interpreter.
+
+.. _`pypy 1.5 and cpython 2.6.2`: http://speed.pypy.org
+
+
+Highlights
+==========
+
+* Numerous performance improvements, overall giving considerable speedups:
+
+  - better GC behavior when dealing with very large objects and arrays
+
+  - **fast ctypes:** now calls to ctypes functions are seen and optimized
+    by the JIT, and they are up to 60 times faster than PyPy 1.5 and 10 times
+    faster than CPython
+
+  - improved generators(1): simple generators now are inlined into the caller
+    loop, making performance up to 3.5 times faster than PyPy 1.5.
+
+  - improved generators(2): thanks to other optimizations, even generators
+    that are not inlined are between 10% and 20% faster than PyPy 1.5.
+
+  - faster warmup time for the JIT
+
+  - JIT support for single floats (e.g., for ``array('f')``)
+
+  - optimized dictionaries: the internal representation of dictionaries is now
+    dynamically selected depending on the type of stored objects, resulting in
+    faster code and smaller memory footprint.  For example, dictionaries whose
+    keys are all strings, or all integers. Other dictionaries are also smaller
+    due to bugfixes.
+
+* JitViewer: this is the first official release which includes the JitViewer,
+  a web-based tool which helps you to see which parts of your Python code have
+  been compiled by the JIT, down until the assembler. The `jitviewer`_ 0.1 has
+  already been release and works well with PyPy 1.6.
+
+* The CPython extension module API has been improved and now supports many
+  more extensions. For information on which one are supported, please refer to
+  our `compatibility wiki`_.
+
+* Multibyte encoding support: this was of of the last areas in which we were
+  still behind CPython, but now we fully support them.
+
+* Preliminary support for NumPy: this release includes a preview of a very
+  fast NumPy module integrated with the PyPy JIT.  Unfortunately, this does
+  not mean that you can expect to take an existing NumPy program and run it on
+  PyPy, because the module is still unfinished and supports only some of the
+  numpy API. However, barring some details, what works should be
+  blazingly fast :-)
+
+* Bugfixes: since the 1.5 release we fixed 53 bugs in our `bug tracker`_, not
+  counting the numerous bugs that were found and reported through other
+  channels than the bug tracker.
+
+Cheers,
+
+Hakan Ardo, Carl Friedrich Bolz, Laura Creighton, Antonio Cuni,
+Maciej Fijalkowski, Amaury Forgeot d'Arc, Alex Gaynor,
+Armin Rigo and the PyPy team
+
+.. _`jitviewer`: http://morepypy.blogspot.com/2011/08/visualization-of-jitted-code.html
+.. _`bug tracker`: https://bugs.pypy.org
+.. _`compatibility wiki`: https://bitbucket.org/pypy/compatibility/wiki/Home
+
diff --git a/pypy/doc/rlib.rst b/pypy/doc/rlib.rst
--- a/pypy/doc/rlib.rst
+++ b/pypy/doc/rlib.rst
@@ -134,69 +134,6 @@
 a hierarchy of Address classes, in a typical static-OO-programming style.
 
 
-``rstack``
-==========
-
-The `pypy/rlib/rstack.py`_ module allows an RPython program to control its own execution stack.
-This is only useful if the program is translated using stackless. An old
-description of the exposed functions is below.
-
-We introduce an RPython type ``frame_stack_top`` and a built-in function
-``yield_current_frame_to_caller()`` that work as follows (see example below):
-
-* The built-in function ``yield_current_frame_to_caller()`` causes the current
-  function's state to be captured in a new ``frame_stack_top`` object that is
-  returned to the parent.  Only one frame, the current one, is captured this
-  way.  The current frame is suspended and the caller continues to run.  Note
-  that the caller is only resumed once: when
-  ``yield_current_frame_to_caller()`` is called.  See below.
-
-* A ``frame_stack_top`` object can be jumped to by calling its ``switch()``
-  method with no argument.
-
-* ``yield_current_frame_to_caller()`` and ``switch()`` themselves return a new
-  ``frame_stack_top`` object: the freshly captured state of the caller of the
-  source ``switch()`` that was just executed, or None in the case described
-  below.
-
-* the function that called ``yield_current_frame_to_caller()`` also has a
-  normal return statement, like all functions.  This statement must return
-  another ``frame_stack_top`` object.  The latter is *not* returned to the
-  original caller; there is no way to return several times to the caller.
-  Instead, it designates the place to which the execution must jump, as if by
-  a ``switch()``.  The place to which we jump this way will see a None as the
-  source frame stack top.
-
-* every frame stack top must be resumed once and only once.  Not resuming
-  it at all causes a leak.  Resuming it several times causes a crash.
-
-* a function that called ``yield_current_frame_to_caller()`` should not raise.
-  It would have no implicit parent frame to propagate the exception to.  That
-  would be a crashingly bad idea.
-
-The following example would print the numbers from 1 to 7 in order::
-
-    def g():
-        print 2
-        frametop_before_5 = yield_current_frame_to_caller()
-        print 4
-        frametop_before_7 = frametop_before_5.switch()
-        print 6
-        return frametop_before_7
-
-    def f():
-        print 1
-        frametop_before_4 = g()
-        print 3
-        frametop_before_6 = frametop_before_4.switch()
-        print 5
-        frametop_after_return = frametop_before_6.switch()
-        print 7
-        assert frametop_after_return is None
-
-    f()
-
-
 ``streamio``
 ============
 
diff --git a/pypy/doc/stackless.rst b/pypy/doc/stackless.rst
--- a/pypy/doc/stackless.rst
+++ b/pypy/doc/stackless.rst
@@ -8,446 +8,299 @@
 ================
 
 PyPy can expose to its user language features similar to the ones
-present in `Stackless Python`_: **no recursion depth limit**, and the
-ability to write code in a **massively concurrent style**.  It actually
-exposes three different paradigms to choose from:
+present in `Stackless Python`_: the ability to write code in a
+**massively concurrent style**.  (It does not (any more) offer the
+ability to run with no `recursion depth limit`_, but the same effect
+can be achieved indirectly.)
 
-* `Tasklets and channels`_;
+This feature is based on a custom primitive called a continulet_.
+Continulets can be directly used by application code, or it is possible
+to write (entirely at app-level) more user-friendly interfaces.
 
-* Greenlets_;
+Currently PyPy implements greenlets_ on top of continulets.  It would be
+easy to implement tasklets and channels as well, emulating the model
+of `Stackless Python`_.
 
-* Plain coroutines_.
+Continulets are extremely light-weight, which means that PyPy should be
+able to handle programs containing large amounts of them.  However, due
+to an implementation restriction, a PyPy compiled with
+``--gcrootfinder=shadowstack`` consumes at least one page of physical
+memory (4KB) per live continulet, and half a megabyte of virtual memory
+on 32-bit or a complete megabyte on 64-bit.  Moreover, the feature is
+only available (so far) on x86 and x86-64 CPUs; for other CPUs you need
+to add a short page of custom assembler to
+`pypy/translator/c/src/stacklet/`_.
 
-All of them are extremely light-weight, which means that PyPy should be
-able to handle programs containing large amounts of coroutines, tasklets
-and greenlets.
 
+Theory
+======
 
-Requirements
-++++++++++++++++
+The fundamental idea is that, at any point in time, the program happens
+to run one stack of frames (or one per thread, in case of
+multi-threading).  To see the stack, start at the top frame and follow
+the chain of ``f_back`` until you reach the bottom frame.  From the
+point of view of one of these frames, it has a ``f_back`` pointing to
+another frame (unless it is the bottom frame), and it is itself being
+pointed to by another frame (unless it is the top frame).
 
-If you are running py.py on top of CPython, then you need to enable
-the _stackless module by running it as follows::
+The theory behind continulets is to literally take the previous sentence
+as definition of "an O.K. situation".  The trick is that there are
+O.K. situations that are more complex than just one stack: you will
+always have one stack, but you can also have in addition one or more
+detached *cycles* of frames, such that by following the ``f_back`` chain
+you run in a circle.  But note that these cycles are indeed completely
+detached: the top frame (the currently running one) is always the one
+which is not the ``f_back`` of anybody else, and it is always the top of
+a stack that ends with the bottom frame, never a part of these extra
+cycles.
 
-    py.py --withmod-_stackless
+How do you create such cycles?  The fundamental operation to do so is to
+take two frames and *permute* their ``f_back`` --- i.e. exchange them.
+You can permute any two ``f_back`` without breaking the rule of "an O.K.
+situation".  Say for example that ``f`` is some frame halfway down the
+stack, and you permute its ``f_back`` with the ``f_back`` of the top
+frame.  Then you have removed from the normal stack all intermediate
+frames, and turned them into one stand-alone cycle.  By doing the same
+permutation again you restore the original situation.
 
-This is implemented internally using greenlets, so it only works on a
-platform where `greenlets`_ are supported.  A few features do
-not work this way, though, and really require a translated
-``pypy-c``.
+In practice, in PyPy, you cannot change the ``f_back`` of an abitrary
+frame, but only of frames stored in ``continulets``.
 
-To obtain a translated version of ``pypy-c`` that includes Stackless
-support, run translate.py as follows::
-
-    cd pypy/translator/goal
-    python translate.py --stackless
+Continulets are internally implemented using stacklets.  Stacklets are a
+bit more primitive (they are really one-shot continuations), but that
+idea only works in C, not in Python.  The basic idea of continulets is
+to have at any point in time a complete valid stack; this is important
+e.g. to correctly propagate exceptions (and it seems to give meaningful
+tracebacks too).
 
 
 Application level interface
 =============================
 
-A stackless PyPy contains a module called ``stackless``.  The interface
-exposed by this module have not been refined much, so it should be
-considered in-flux (as of 2007).
 
-So far, PyPy does not provide support for ``stackless`` in a threaded
-environment.  This limitation is not fundamental, as previous experience
-has shown, so supporting this would probably be reasonably easy.
+.. _continulet:
 
-An interesting point is that the same ``stackless`` module can provide
-a number of different concurrency paradigms at the same time.  From a
-theoretical point of view, none of above-mentioned existing three
-paradigms considered on its own is new: two of them are from previous
-Python work, and the third one is a variant of the classical coroutine.
-The new part is that the PyPy implementation manages to provide all of
-them and let the user implement more.  Moreover - and this might be an
-important theoretical contribution of this work - we manage to provide
-these concurrency concepts in a "composable" way.  In other words, it
-is possible to naturally mix in a single application multiple
-concurrency paradigms, and multiple unrelated usages of the same
-paradigm.  This is discussed in the Composability_ section below.
+Continulets
++++++++++++
 
+A translated PyPy contains by default a module called ``_continuation``
+exporting the type ``continulet``.  A ``continulet`` object from this
+module is a container that stores a "one-shot continuation".  It plays
+the role of an extra frame you can insert in the stack, and whose
+``f_back`` can be changed.
 
-Infinite recursion
-++++++++++++++++++
+To make a continulet object, call ``continulet()`` with a callable and
+optional extra arguments.
 
-Any stackless PyPy executable natively supports recursion that is only
-limited by the available memory.  As in normal Python, though, there is
-an initial recursion limit (which is 5000 in all pypy-c's, and 1000 in
-CPython).  It can be changed with ``sys.setrecursionlimit()``.  With a
-stackless PyPy, any value is acceptable - use ``sys.maxint`` for
-unlimited.
+Later, the first time you ``switch()`` to the continulet, the callable
+is invoked with the same continulet object as the extra first argument.
+At that point, the one-shot continuation stored in the continulet points
+to the caller of ``switch()``.  In other words you have a perfectly
+normal-looking stack of frames.  But when ``switch()`` is called again,
+this stored one-shot continuation is exchanged with the current one; it
+means that the caller of ``switch()`` is suspended with its continuation
+stored in the container, and the old continuation from the continulet
+object is resumed.
 
-In some cases, you can write Python code that causes interpreter-level
-infinite recursion -- i.e. infinite recursion without going via
-application-level function calls.  It is possible to limit that too,
-with ``_stackless.set_stack_depth_limit()``, or to unlimit it completely
-by setting it to ``sys.maxint``.
+The most primitive API is actually 'permute()', which just permutes the
+one-shot continuation stored in two (or more) continulets.
 
+In more details:
 
-Coroutines
-++++++++++
+* ``continulet(callable, *args, **kwds)``: make a new continulet.
+  Like a generator, this only creates it; the ``callable`` is only
+  actually called the first time it is switched to.  It will be
+  called as follows::
 
-A Coroutine is similar to a very small thread, with no preemptive scheduling.
-Within a family of coroutines, the flow of execution is explicitly
-transferred from one to another by the programmer.  When execution is
-transferred to a coroutine, it begins to execute some Python code.  When
-it transfers execution away from itself it is temporarily suspended, and
-when execution returns to it it resumes its execution from the
-point where it was suspended.  Conceptually, only one coroutine is
-actively running at any given time (but see Composability_ below).
+      callable(cont, *args, **kwds)
 
-The ``stackless.coroutine`` class is instantiated with no argument.
-It provides the following methods and attributes:
+  where ``cont`` is the same continulet object.
 
-* ``stackless.coroutine.getcurrent()``
+  Note that it is actually ``cont.__init__()`` that binds
+  the continulet.  It is also possible to create a not-bound-yet
+  continulet by calling explicitly ``continulet.__new__()``, and
+  only bind it later by calling explicitly ``cont.__init__()``.
 
-    Static method returning the currently running coroutine.  There is a
-    so-called "main" coroutine object that represents the "outer"
-    execution context, where your main program started and where it runs
-    as long as it does not switch to another coroutine.
+* ``cont.switch(value=None, to=None)``: start the continulet if
+  it was not started yet.  Otherwise, store the current continuation
+  in ``cont``, and activate the target continuation, which is the
+  one that was previously stored in ``cont``.  Note that the target
+  continuation was itself previously suspended by another call to
+  ``switch()``; this older ``switch()`` will now appear to return.
+  The ``value`` argument is any object that is carried to the target
+  and returned by the target's ``switch()``.
 
-* ``coro.bind(callable, *args, **kwds)``
+  If ``to`` is given, it must be another continulet object.  In
+  that case, performs a "double switch": it switches as described
+  above to ``cont``, and then immediately switches again to ``to``.
+  This is different from switching directly to ``to``: the current
+  continuation gets stored in ``cont``, the old continuation from
+  ``cont`` gets stored in ``to``, and only then we resume the
+  execution from the old continuation out of ``to``.
 
-    Bind the coroutine so that it will execute ``callable(*args,
-    **kwds)``.  The call is not performed immediately, but only the
-    first time we call the ``coro.switch()`` method.  A coroutine must
-    be bound before it is switched to.  When the coroutine finishes
-    (because the call to the callable returns), the coroutine exits and
-    implicitly switches back to another coroutine (its "parent"); after
-    this point, it is possible to bind it again and switch to it again.
-    (Which coroutine is the parent of which is not documented, as it is
-    likely to change when the interface is refined.)
+* ``cont.throw(type, value=None, tb=None, to=None)``: similar to
+  ``switch()``, except that immediately after the switch is done, raise
+  the given exception in the target.
 
-* ``coro.switch()``
+* ``cont.is_pending()``: return True if the continulet is pending.
+  This is False when it is not initialized (because we called
+  ``__new__`` and not ``__init__``) or when it is finished (because
+  the ``callable()`` returned).  When it is False, the continulet
+  object is empty and cannot be ``switch()``-ed to.
 
-    Suspend the current (caller) coroutine, and resume execution in the
-    target coroutine ``coro``.
+* ``permute(*continulets)``: a global function that permutes the
+  continuations stored in the given continulets arguments.  Mostly
+  theoretical.  In practice, using ``cont.switch()`` is easier and
+  more efficient than using ``permute()``; the latter does not on
+  its own change the currently running frame.
 
-* ``coro.kill()``
 
-    Kill ``coro`` by sending a CoroutineExit exception and switching
-    execution immediately to it. This exception can be caught in the 
-    coroutine itself and can be raised from any call to ``coro.switch()``. 
-    This exception isn't propagated to the parent coroutine.
+Genlets
++++++++
 
-* ``coro.throw(type, value)``
+The ``_continuation`` module also exposes the ``generator`` decorator::
 
-    Insert an exception in ``coro`` an resume switches execution
-    immediately to it. In the coroutine itself, this exception
-    will come from any call to ``coro.switch()`` and can be caught. If the
-    exception isn't caught, it will be propagated to the parent coroutine.
+    @generator
+    def f(cont, a, b):
+        cont.switch(a + b)
+        cont.switch(a + b + 1)
 
-When a coroutine is garbage-collected, it gets the ``.kill()`` method sent to
-it. This happens at the point the next ``.switch`` method is called, so the
-target coroutine of this call will be executed only after the ``.kill`` has
-finished.
+    for i in f(10, 20):
+        print i
 
-Example
-~~~~~~~
+This example prints 30 and 31.  The only advantage over using regular
+generators is that the generator itself is not limited to ``yield``
+statements that must all occur syntactically in the same function.
+Instead, we can pass around ``cont``, e.g. to nested sub-functions, and
+call ``cont.switch(x)`` from there.
 
-Here is a classical producer/consumer example: an algorithm computes a
-sequence of values, while another consumes them.  For our purposes we
-assume that the producer can generate several values at once, and the
-consumer can process up to 3 values in a batch - it can also process
-batches with fewer than 3 values without waiting for the producer (which
-would be messy to express with a classical Python generator). ::
+The ``generator`` decorator can also be applied to methods::
 
-    def producer(lst):
-        while True:
-            ...compute some more values...
-            lst.extend(new_values)
-            coro_consumer.switch()
-
-    def consumer(lst):
-        while True:
-            # First ask the producer for more values if needed
-            while len(lst) == 0:
-                coro_producer.switch()
-            # Process the available values in a batch, but at most 3
-            batch = lst[:3]
-            del lst[:3]
-            ...process batch...
-
-    # Initialize two coroutines with a shared list as argument
-    exchangelst = []
-    coro_producer = coroutine()
-    coro_producer.bind(producer, exchangelst)
-    coro_consumer = coroutine()
-    coro_consumer.bind(consumer, exchangelst)
-
-    # Start running the consumer coroutine
-    coro_consumer.switch()
-
-
-Tasklets and channels
-+++++++++++++++++++++
-
-The ``stackless`` module also provides an interface that is roughly
-compatible with the interface of the ``stackless`` module in `Stackless
-Python`_: it contains ``stackless.tasklet`` and ``stackless.channel``
-classes.  Tasklets are also similar to microthreads, but (like coroutines)
-they don't actually run in parallel with other microthreads; instead,
-they synchronize and exchange data with each other over Channels, and
-these exchanges determine which Tasklet runs next.
-
-For usage reference, see the documentation on the `Stackless Python`_
-website.
-
-Note that Tasklets and Channels are implemented at application-level in
-`lib_pypy/stackless.py`_ on top of coroutines_.  You can refer to this
-module for more details and API documentation.
-
-The stackless.py code tries to resemble the stackless C code as much
-as possible. This makes the code somewhat unpythonic.
-
-Bird's eye view of tasklets and channels
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Tasklets are a bit like threads: they encapsulate a function in such a way that
-they can be suspended/restarted any time. Unlike threads, they won't
-run concurrently, but must be cooperative. When using stackless
-features, it is vitally important that no action is performed that blocks
-everything else.  In particular, blocking input/output should be centralized
-to a single tasklet.
-
-Communication between tasklets is done via channels. 
-There are three ways for a tasklet to give up control:
-
-1. call ``stackless.schedule()``
-2. send something over a channel
-3. receive something from a channel
-
-A (live) tasklet can either be running, waiting to get scheduled, or be
-blocked by a channel.
-
-Scheduling is done in strictly round-robin manner. A blocked tasklet
-is removed from the scheduling queue and will be reinserted when it
-becomes unblocked.
-
-Example
-~~~~~~~
-
-Here is a many-producers many-consumers example, where any consumer can
-process the result of any producer.  For this situation we set up a
-single channel where all producer send, and on which all consumers
-wait::
-
-    def producer(chan):
-        while True:
-            chan.send(...next value...)
-
-    def consumer(chan):
-        while True:
-            x = chan.receive()
-            ...do something with x...
-
-    # Set up the N producer and M consumer tasklets
-    common_channel = stackless.channel()
-    for i in range(N):
-        stackless.tasklet(producer, common_channel)()
-    for i in range(M):
-        stackless.tasklet(consumer, common_channel)()
-
-    # Run it all
-    stackless.run()
-
-Each item sent over the channel is received by one of the waiting
-consumers; which one is not specified.  The producers block until their
-item is consumed: the channel is not a queue, but rather a meeting point
-which causes tasklets to block until both a consumer and a producer are
-ready.  In practice, the reason for having several consumers receiving
-on a single channel is that some of the consumers can be busy in other
-ways part of the time.  For example, each consumer might receive a
-database request, process it, and send the result to a further channel
-before it asks for the next request.  In this situation, further
-requests can still be received by other consumers.
+    class X:
+        @generator
+        def f(self, cont, a, b):
+            ...
 
 
 Greenlets
 +++++++++
 
-A Greenlet is a kind of primitive Tasklet with a lower-level interface
-and with exact control over the execution order.  Greenlets are similar
-to Coroutines, with a slightly different interface: greenlets put more
-emphasis on a tree structure.  The various greenlets of a program form a
-precise tree, which fully determines their order of execution.
+Greenlets are implemented on top of continulets in `lib_pypy/greenlet.py`_.
+See the official `documentation of the greenlets`_.
 
-For usage reference, see the `documentation of the greenlets`_.
-The PyPy interface is identical.  You should use ``greenlet.greenlet``
-instead of ``stackless.greenlet`` directly, because the greenlet library
-can give you the latter when you ask for the former on top of PyPy.
+Note that unlike the CPython greenlets, this version does not suffer
+from GC issues: if the program "forgets" an unfinished greenlet, it will
+always be collected at the next garbage collection.
 
-PyPy's greenlets do not suffer from the cyclic GC limitation that the
-CPython greenlets have: greenlets referencing each other via local
-variables tend to leak on top of CPython (where it is mostly impossible
-to do the right thing).  It works correctly on top of PyPy.
 
+Unimplemented features
+++++++++++++++++++++++
 
-Coroutine Pickling
-++++++++++++++++++
+The following features (present in some past Stackless version of PyPy)
+are for the time being not supported any more:
 
-Coroutines and tasklets can be pickled and unpickled, i.e. serialized to
-a string of bytes for the purpose of storage or transmission.  This
-allows "live" coroutines or tasklets to be made persistent, moved to
-other machines, or cloned in any way.  The standard ``pickle`` module
-works with coroutines and tasklets (at least in a translated ``pypy-c``;
-unpickling live coroutines or tasklets cannot be easily implemented on
-top of CPython).
+* Tasklets and channels (currently ``stackless.py`` seems to import,
+  but you have tasklets on top of coroutines on top of greenlets on
+  top of continulets on top of stacklets, and it's probably not too
+  hard to cut two of these levels by adapting ``stackless.py`` to
+  use directly continulets)
 
-To be able to achieve this result, we have to consider many objects that
-are not normally pickleable in CPython.  Here again, the `Stackless
-Python`_ implementation has paved the way, and we follow the same
-general design decisions: simple internal objects like bound method
-objects and various kinds of iterators are supported; frame objects can
-be fully pickled and unpickled
-(by serializing a reference to the bytecode they are
-running in addition to all the local variables).  References to globals
-and modules are pickled by name, similarly to references to functions
-and classes in the traditional CPython ``pickle``.
+* Coroutines (could be rewritten at app-level)
 
-The "magic" part of this process is the implementation of the unpickling
-of a chain of frames.  The Python interpreter of PyPy uses
-interpreter-level recursion to represent application-level calls.  The
-reason for this is that it tremendously simplifies the implementation of
-the interpreter itself.  Indeed, in Python, almost any operation can
-potentially result in a non-tail-recursive call to another Python
-function.  This makes writing a non-recursive interpreter extremely
-tedious; instead, we rely on lower-level transformations during the
-translation process to control this recursion.  This is the `Stackless
-Transform`_, which is at the heart of PyPy's support for stackless-style
-concurrency.
+* Pickling and unpickling continulets (*)
 
-At any point in time, a chain of Python-level frames corresponds to a
-chain of interpreter-level frames (e.g. C frames in pypy-c), where each
-single Python-level frame corresponds to one or a few interpreter-level
-frames - depending on the length of the interpreter-level call chain
-from one bytecode evaluation loop to the next (recursively invoked) one.
+* Continuing execution of a continulet in a different thread (*)
 
-This means that it is not sufficient to simply create a chain of Python
-frame objects in the heap of a process before we can resume execution of
-these newly built frames.  We must recreate a corresponding chain of
-interpreter-level frames.  To this end, we have inserted a few *named
-resume points* (see 3.2.4, in `D07.1 Massive Parallelism and Translation Aspects`_) in the Python interpreter of PyPy.  This is the
-motivation for implementing the interpreter-level primitives
-``resume_state_create()`` and ``resume_state_invoke()``, the powerful
-interface that allows an RPython program to artificially rebuild a chain
-of calls in a reflective way, completely from scratch, and jump to it.
+* Automatic unlimited stack (must be emulated__ so far)
 
-.. _`D07.1 Massive Parallelism and Translation Aspects`: http://codespeak.net/pypy/extradoc/eu-report/D07.1_Massive_Parallelism_and_Translation_Aspects-2007-02-28.pdf
+* Support for other CPUs than x86 and x86-64
 
-Example
-~~~~~~~
+* The app-level ``f_back`` field of frames crossing continulet boundaries
+  is None for now, unlike what I explain in the theoretical overview
+  above.  It mostly means that in a ``pdb.set_trace()`` you cannot go
+  ``up`` past countinulet boundaries.  This could be fixed.
 
-(See `demo/pickle_coroutine.py`_ for the complete source of this demo.)
+.. __: `recursion depth limit`_
 
-Consider a program which contains a part performing a long-running
-computation::
+(*) Pickling, as well as changing threads, could be implemented by using
+a "soft" stack switching mode again.  We would get either "hard" or
+"soft" switches, similarly to Stackless Python 3rd version: you get a
+"hard" switch (like now) when the C stack contains non-trivial C frames
+to save, and a "soft" switch (like previously) when it contains only
+simple calls from Python to Python.  Soft-switched continulets would
+also consume a bit less RAM, and the switch might be a bit faster too
+(unsure about that; what is the Stackless Python experience?).
 
-    def ackermann(x, y):
-        if x == 0:
-            return y + 1
-        if y == 0:
-            return ackermann(x - 1, 1)
-        return ackermann(x - 1, ackermann(x, y - 1))
 
-By using pickling, we can save the state of the computation while it is
-running, for the purpose of restoring it later and continuing the
-computation at another time or on a different machine.  However,
-pickling does not produce a whole-program dump: it can only pickle
-individual coroutines.  This means that the computation should be
-started in its own coroutine::
+Recursion depth limit
++++++++++++++++++++++
 
-    # Make a coroutine that will run 'ackermann(3, 8)'
-    coro = coroutine()
-    coro.bind(ackermann, 3, 8)
+You can use continulets to emulate the infinite recursion depth present
+in Stackless Python and in stackless-enabled older versions of PyPy.
 
-    # Now start running the coroutine
-    result = coro.switch()
+The trick is to start a continulet "early", i.e. when the recursion
+depth is very low, and switch to it "later", i.e. when the recursion
+depth is high.  Example::
 
-The coroutine itself must switch back to the main program when it needs
-to be interrupted (we can only pickle suspended coroutines).  Due to
-current limitations this requires an explicit check in the
-``ackermann()`` function::
+    from _continuation import continulet
 
-    def ackermann(x, y):
-        if interrupt_flag:      # test a global flag
-            main.switch()       # and switch back to 'main' if it is set
-        if x == 0:
-            return y + 1
-        if y == 0:
-            return ackermann(x - 1, 1)
-        return ackermann(x - 1, ackermann(x, y - 1))
+    def invoke(_, callable, arg):
+        return callable(arg)
 
-The global ``interrupt_flag`` would be set for example by a timeout, or
-by a signal handler reacting to Ctrl-C, etc.  It causes the coroutine to
-transfer control back to the main program.  The execution comes back
-just after the line ``coro.switch()``, where we can pickle the coroutine
-if necessary::
+    def bootstrap(c):
+        # this loop runs forever, at a very low recursion depth
+        callable, arg = c.switch()
+        while True:
+            # start a new continulet from here, and switch to
+            # it using an "exchange", i.e. a switch with to=.
+            to = continulet(invoke, callable, arg)
+            callable, arg = c.switch(to=to)
 
-    if not coro.is_alive:
-        print "finished; the result is:", result
-    else:
-        # save the state of the suspended coroutine
-        f = open('demo.pickle', 'w')
-        pickle.dump(coro, f)
-        f.close()
+    c = continulet(bootstrap)
+    c.switch()
 
-The process can then stop.  At any later time, or on another machine,
-we can reload the file and restart the coroutine with::
 
-    f = open('demo.pickle', 'r')
-    coro = pickle.load(f)
-    f.close()
-    result = coro.switch()
+    def recursive(n):
+        if n == 0:
+            return ("ok", n)
+        if n % 200 == 0:
+            prev = c.switch((recursive, n - 1))
+        else:
+            prev = recursive(n - 1)
+        return (prev[0], prev[1] + 1)
 
-Limitations
-~~~~~~~~~~~
+    print recursive(999999)     # prints ('ok', 999999)
 
-Coroutine pickling is subject to some limitations.  First of all, it is
-not a whole-program "memory dump".  It means that only the "local" state
-of a coroutine is saved.  The local state is defined to include the
-chain of calls and the local variables, but not for example the value of
-any global variable.
+Note that if you press Ctrl-C while running this example, the traceback
+will be built with *all* recursive() calls so far, even if this is more
+than the number that can possibly fit in the C stack.  These frames are
+"overlapping" each other in the sense of the C stack; more precisely,
+they are copied out of and into the C stack as needed.
 
-As in normal Python, the pickle will not include any function object's
-code, any class definition, etc., but only references to functions and
-classes.  Unlike normal Python, the pickle contains frames.  A pickled
-frame stores a bytecode index, representing the current execution
-position.  This means that the user program cannot be modified *at all*
-between pickling and unpickling!
+(The example above also makes use of the following general "guideline"
+to help newcomers write continulets: in ``bootstrap(c)``, only call
+methods on ``c``, not on another continulet object.  That's why we wrote
+``c.switch(to=to)`` and not ``to.switch()``, which would mess up the
+state.  This is however just a guideline; in general we would recommend
+to use other interfaces like genlets and greenlets.)
 
-On the other hand, the pickled data is fairly independent from the
-platform and from the PyPy version.
 
-Pickling/unpickling fails if the coroutine is suspended in a state that
-involves Python frames which were *indirectly* called.  To define this
-more precisely, a Python function can issue a regular function or method
-call to invoke another Python function - this is a *direct* call and can
-be pickled and unpickled.  But there are many ways to invoke a Python
-function indirectly.  For example, most operators can invoke a special
-method ``__xyz__()`` on a class, various built-in functions can call
-back Python functions, signals can invoke signal handlers, and so on.
-These cases are not supported yet.
-
-
-Composability
-+++++++++++++
+Theory of composability
++++++++++++++++++++++++
 
 Although the concept of coroutines is far from new, they have not been
 generally integrated into mainstream languages, or only in limited form
 (like generators in Python and iterators in C#).  We can argue that a
 possible reason for that is that they do not scale well when a program's
 complexity increases: they look attractive in small examples, but the
-models that require explicit switching, by naming the target coroutine,
-do not compose naturally.  This means that a program that uses
-coroutines for two unrelated purposes may run into conflicts caused by
-unexpected interactions.
+models that require explicit switching, for example by naming the target
+coroutine, do not compose naturally.  This means that a program that
+uses coroutines for two unrelated purposes may run into conflicts caused
+by unexpected interactions.
 
 To illustrate the problem, consider the following example (simplified
-code; see the full source in
-`pypy/module/_stackless/test/test_composable_coroutine.py`_).  First, a
-simple usage of coroutine::
+code using a theorical ``coroutine`` class).  First, a simple usage of
+coroutine::
 
     main_coro = coroutine.getcurrent()    # the main (outer) coroutine
     data = []
@@ -530,74 +383,35 @@
 main coroutine, which confuses the ``generator_iterator.next()`` method
 (it gets resumed, but not as a result of a call to ``Yield()``).
 
-As part of trying to combine multiple different paradigms into a single
-application-level module, we have built a way to solve this problem.
-The idea is to avoid the notion of a single, global "main" coroutine (or
-a single main greenlet, or a single main tasklet).  Instead, each
-conceptually separated user of one of these concurrency interfaces can
-create its own "view" on what the main coroutine/greenlet/tasklet is,
-which other coroutine/greenlet/tasklets there are, and which of these is
-the currently running one.  Each "view" is orthogonal to the others.  In
-particular, each view has one (and exactly one) "current"
-coroutine/greenlet/tasklet at any point in time.  When the user switches
-to a coroutine/greenlet/tasklet, it implicitly means that he wants to
-switch away from the current coroutine/greenlet/tasklet *that belongs to
-the same view as the target*.
+Thus the notion of coroutine is *not composable*.  By opposition, the
+primitive notion of continulets is composable: if you build two
+different interfaces on top of it, or have a program that uses twice the
+same interface in two parts, then assuming that both parts independently
+work, the composition of the two parts still works.
 
-The precise application-level interface has not been fixed yet; so far,
-"views" in the above sense are objects of the type
-``stackless.usercostate``.  The above two examples can be rewritten in
-the following way::
+A full proof of that claim would require careful definitions, but let us
+just claim that this fact is true because of the following observation:
+the API of continulets is such that, when doing a ``switch()``, it
+requires the program to have some continulet to explicitly operate on.
+It shuffles the current continuation with the continuation stored in
+that continulet, but has no effect outside.  So if a part of a program
+has a continulet object, and does not expose it as a global, then the
+rest of the program cannot accidentally influence the continuation
+stored in that continulet object.
 
-    producer_view = stackless.usercostate()   # a local view
-    main_coro = producer_view.getcurrent()    # the main (outer) coroutine
-    ...
-    producer_coro = producer_view.newcoroutine()
-    ...
-
-and::
-
-    generators_view = stackless.usercostate()
-
-    def generator(f):
-        def wrappedfunc(*args, **kwds):
-            g = generators_view.newcoroutine(generator_iterator)
-            ...
-
-            ...generators_view.getcurrent()...
-
-Then the composition ``grab_values()`` works as expected, because the
-two views are independent.  The coroutine captured as ``self.caller`` in
-the ``generator_iterator.next()`` method is the main coroutine of the
-``generators_view``.  It is no longer the same object as the main
-coroutine of the ``producer_view``, so when ``data_producer()`` issues
-the following command::
-
-    main_coro.switch()
-
-the control flow cannot accidentally jump back to
-``generator_iterator.next()``.  In other words, from the point of view
-of ``producer_view``, the function ``grab_next_value()`` always runs in
-its main coroutine ``main_coro`` and the function ``data_producer`` in
-its coroutine ``producer_coro``.  This is the case independently of
-which ``generators_view``-based coroutine is the current one when
-``grab_next_value()`` is called.
-
-Only code that has explicit access to the ``producer_view`` or its
-coroutine objects can perform switches that are relevant for the
-generator code.  If the view object and the coroutine objects that share
-this view are all properly encapsulated inside the generator logic, no
-external code can accidentally temper with the expected control flow any
-longer.
-
-In conclusion: we will probably change the app-level interface of PyPy's
-stackless module in the future to not expose coroutines and greenlets at
-all, but only views.  They are not much more difficult to use, and they
-scale automatically to larger programs.
+In other words, if we regard the continulet object as being essentially
+a modifiable ``f_back``, then it is just a link between the frame of
+``callable()`` and the parent frame --- and it cannot be arbitrarily
+changed by unrelated code, as long as they don't explicitly manipulate
+the continulet object.  Typically, both the frame of ``callable()``
+(commonly a local function) and its parent frame (which is the frame
+that switched to it) belong to the same class or module; so from that
+point of view the continulet is a purely local link between two local
+frames.  It doesn't make sense to have a concept that allows this link
+to be manipulated from outside.
 
 
 .. _`Stackless Python`: http://www.stackless.com
 .. _`documentation of the greenlets`: http://packages.python.org/greenlet/
-.. _`Stackless Transform`: translation.html#the-stackless-transform
 
 .. include:: _ref.txt
diff --git a/pypy/doc/translation.rst b/pypy/doc/translation.rst
--- a/pypy/doc/translation.rst
+++ b/pypy/doc/translation.rst
@@ -552,14 +552,15 @@
 
 The stackless transform converts functions into a form that knows how
 to save the execution point and active variables into a heap structure
-and resume execution at that point.  This is used to implement
+and resume execution at that point.  This was used to implement
 coroutines as an RPython-level feature, which in turn are used to
-implement `coroutines, greenlets and tasklets`_ as an application
+implement coroutines, greenlets and tasklets as an application
 level feature for the Standard Interpreter.
 
-Enable the stackless transformation with :config:`translation.stackless`.
+The stackless transformation has been deprecated and is no longer
+available in trunk.  It has been replaced with continulets_.
 
-.. _`coroutines, greenlets and tasklets`: stackless.html
+.. _continulets: stackless.html
 
 .. _`preparing the graphs for source generation`:
 
diff --git a/pypy/doc/windows.rst b/pypy/doc/windows.rst
--- a/pypy/doc/windows.rst
+++ b/pypy/doc/windows.rst
@@ -32,6 +32,24 @@
 modules that relies on third-party libraries.  See below how to get
 and build them.
 
+Preping Windows for the Large Build
+-----------------------------------
+
+Normally 32bit programs are limited to 2GB of memory on Windows. It is
+possible to raise this limit, to 3GB on Windows 32bit, and almost 4GB
+on Windows 64bit.
+
+On Windows 32bit, it is necessary to modify the system: follow
+http://usa.autodesk.com/adsk/servlet/ps/dl/item?siteID=123112&id=9583842&linkID=9240617
+to enable the "3GB" feature, and reboot. This step is not necessary on
+Windows 64bit.
+
+Then you need to execute::
+
+    editbin /largeaddressaware pypy.exe
+
+on the pypy.exe file you compiled.
+
 Installing external packages
 ----------------------------
 
diff --git a/pypy/interpreter/astcompiler/ast.py b/pypy/interpreter/astcompiler/ast.py
--- a/pypy/interpreter/astcompiler/ast.py
+++ b/pypy/interpreter/astcompiler/ast.py
@@ -2541,8 +2541,9 @@
 class ASTVisitor(object):
 
     def visit_sequence(self, seq):
-        for node in seq:
-            node.walkabout(self)
+        if seq is not None:
+            for node in seq:
+                node.walkabout(self)
 
     def default_visitor(self, node):
         raise NodeVisitorNotImplemented
@@ -2673,46 +2674,36 @@
 class GenericASTVisitor(ASTVisitor):
 
     def visit_Module(self, node):
-        if node.body:
-            self.visit_sequence(node.body)
+        self.visit_sequence(node.body)
 
     def visit_Interactive(self, node):
-        if node.body:
-            self.visit_sequence(node.body)
+        self.visit_sequence(node.body)
 
     def visit_Expression(self, node):
         node.body.walkabout(self)
 
     def visit_Suite(self, node):
-        if node.body:
-            self.visit_sequence(node.body)
+        self.visit_sequence(node.body)
 
     def visit_FunctionDef(self, node):
         node.args.walkabout(self)
-        if node.body:
-            self.visit_sequence(node.body)
-        if node.decorator_list:
-            self.visit_sequence(node.decorator_list)
+        self.visit_sequence(node.body)
+        self.visit_sequence(node.decorator_list)
 
     def visit_ClassDef(self, node):
-        if node.bases:
-            self.visit_sequence(node.bases)
-        if node.body:
-            self.visit_sequence(node.body)
-        if node.decorator_list:
-            self.visit_sequence(node.decorator_list)
+        self.visit_sequence(node.bases)
+        self.visit_sequence(node.body)
+        self.visit_sequence(node.decorator_list)
 
     def visit_Return(self, node):
         if node.value:
             node.value.walkabout(self)
 
     def visit_Delete(self, node):
-        if node.targets:
-            self.visit_sequence(node.targets)
+        self.visit_sequence(node.targets)
 
     def visit_Assign(self, node):
-        if node.targets:
-            self.visit_sequence(node.targets)
+        self.visit_sequence(node.targets)
         node.value.walkabout(self)
 
     def visit_AugAssign(self, node):
@@ -2722,37 +2713,29 @@
     def visit_Print(self, node):
         if node.dest:
             node.dest.walkabout(self)
-        if node.values:
-            self.visit_sequence(node.values)
+        self.visit_sequence(node.values)
 
     def visit_For(self, node):
         node.target.walkabout(self)
         node.iter.walkabout(self)
-        if node.body:
-            self.visit_sequence(node.body)
-        if node.orelse:
-            self.visit_sequence(node.orelse)
+        self.visit_sequence(node.body)
+        self.visit_sequence(node.orelse)
 
     def visit_While(self, node):
         node.test.walkabout(self)
-        if node.body:
-            self.visit_sequence(node.body)
-        if node.orelse:
-            self.visit_sequence(node.orelse)
+        self.visit_sequence(node.body)
+        self.visit_sequence(node.orelse)
 
     def visit_If(self, node):
         node.test.walkabout(self)
-        if node.body:
-            self.visit_sequence(node.body)
-        if node.orelse:
-            self.visit_sequence(node.orelse)
+        self.visit_sequence(node.body)
+        self.visit_sequence(node.orelse)
 
     def visit_With(self, node):
         node.context_expr.walkabout(self)
         if node.optional_vars:
             node.optional_vars.walkabout(self)
-        if node.body:
-            self.visit_sequence(node.body)
+        self.visit_sequence(node.body)
 
     def visit_Raise(self, node):
         if node.type:
@@ -2763,18 +2746,13 @@
             node.tback.walkabout(self)
 
     def visit_TryExcept(self, node):
-        if node.body:
-            self.visit_sequence(node.body)
-        if node.handlers:
-            self.visit_sequence(node.handlers)
-        if node.orelse:
-            self.visit_sequence(node.orelse)
+        self.visit_sequence(node.body)
+        self.visit_sequence(node.handlers)
+        self.visit_sequence(node.orelse)
 
     def visit_TryFinally(self, node):
-        if node.body:
-            self.visit_sequence(node.body)
-        if node.finalbody:
-            self.visit_sequence(node.finalbody)
+        self.visit_sequence(node.body)
+        self.visit_sequence(node.finalbody)
 
     def visit_Assert(self, node):
         node.test.walkabout(self)
@@ -2782,12 +2760,10 @@
             node.msg.walkabout(self)
 
     def visit_Import(self, node):
-        if node.names:
-            self.visit_sequence(node.names)
+        self.visit_sequence(node.names)
 
     def visit_ImportFrom(self, node):
-        if node.names:
-            self.visit_sequence(node.names)
+        self.visit_sequence(node.names)
 
     def visit_Exec(self, node):
         node.body.walkabout(self)
@@ -2812,8 +2788,7 @@
         pass
 
     def visit_BoolOp(self, node):
-        if node.values:
-            self.visit_sequence(node.values)
+        self.visit_sequence(node.values)
 
     def visit_BinOp(self, node):
         node.left.walkabout(self)
@@ -2832,35 +2807,28 @@
         node.orelse.walkabout(self)
 
     def visit_Dict(self, node):
-        if node.keys:
-            self.visit_sequence(node.keys)
-        if node.values:
-            self.visit_sequence(node.values)
+        self.visit_sequence(node.keys)
+        self.visit_sequence(node.values)
 
     def visit_Set(self, node):
-        if node.elts:
-            self.visit_sequence(node.elts)
+        self.visit_sequence(node.elts)
 
     def visit_ListComp(self, node):
         node.elt.walkabout(self)
-        if node.generators:
-            self.visit_sequence(node.generators)
+        self.visit_sequence(node.generators)
 
     def visit_SetComp(self, node):
         node.elt.walkabout(self)
-        if node.generators:
-            self.visit_sequence(node.generators)
+        self.visit_sequence(node.generators)
 
     def visit_DictComp(self, node):
         node.key.walkabout(self)
         node.value.walkabout(self)
-        if node.generators:
-            self.visit_sequence(node.generators)
+        self.visit_sequence(node.generators)
 
     def visit_GeneratorExp(self, node):
         node.elt.walkabout(self)
-        if node.generators:
-            self.visit_sequence(node.generators)
+        self.visit_sequence(node.generators)
 
     def visit_Yield(self, node):
         if node.value:
@@ -2868,15 +2836,12 @@
 
     def visit_Compare(self, node):
         node.left.walkabout(self)
-        if node.comparators:
-            self.visit_sequence(node.comparators)
+        self.visit_sequence(node.comparators)
 
     def visit_Call(self, node):
         node.func.walkabout(self)
-        if node.args:
-            self.visit_sequence(node.args)
-        if node.keywords:
-            self.visit_sequence(node.keywords)
+        self.visit_sequence(node.args)
+        self.visit_sequence(node.keywords)
         if node.starargs:
             node.starargs.walkabout(self)
         if node.kwargs:
@@ -2902,12 +2867,10 @@
         pass
 
     def visit_List(self, node):
-        if node.elts:
-            self.visit_sequence(node.elts)
+        self.visit_sequence(node.elts)
 
     def visit_Tuple(self, node):
-        if node.elts:
-            self.visit_sequence(node.elts)
+        self.visit_sequence(node.elts)
 
     def visit_Const(self, node):
         pass
@@ -2924,8 +2887,7 @@
             node.step.walkabout(self)
 
     def visit_ExtSlice(self, node):
-        if node.dims:
-            self.visit_sequence(node.dims)
+        self.visit_sequence(node.dims)
 
     def visit_Index(self, node):
         node.value.walkabout(self)
@@ -2933,22 +2895,18 @@
     def visit_comprehension(self, node):
         node.target.walkabout(self)
         node.iter.walkabout(self)
-        if node.ifs:
-            self.visit_sequence(node.ifs)
+        self.visit_sequence(node.ifs)
 
     def visit_ExceptHandler(self, node):
         if node.type:
             node.type.walkabout(self)
         if node.name:
             node.name.walkabout(self)
-        if node.body:
-            self.visit_sequence(node.body)
+        self.visit_sequence(node.body)
 
     def visit_arguments(self, node):
-        if node.args:
-            self.visit_sequence(node.args)
-        if node.defaults:
-            self.visit_sequence(node.defaults)
+        self.visit_sequence(node.args)
+        self.visit_sequence(node.defaults)
 
     def visit_keyword(self, node):
         node.value.walkabout(self)
@@ -3069,6 +3027,7 @@
             raise
         w_self.setdictvalue(space, 'body', w_new_value)
         return
+    w_self.deldictvalue(space, 'body')
     w_self.initialization_state |= 1
 
 _Expression_field_unroller = unrolling_iterable(['body'])
@@ -3157,6 +3116,7 @@
             raise
         w_self.setdictvalue(space, 'lineno', w_new_value)
         return
+    w_self.deldictvalue(space, 'lineno')
     w_self.initialization_state |= w_self._lineno_mask
 
 def stmt_get_col_offset(space, w_self):
@@ -3178,6 +3138,7 @@
             raise
         w_self.setdictvalue(space, 'col_offset', w_new_value)
         return
+    w_self.deldictvalue(space, 'col_offset')
     w_self.initialization_state |= w_self._col_offset_mask
 
 stmt.typedef = typedef.TypeDef("stmt",
@@ -3208,6 +3169,7 @@
             raise
         w_self.setdictvalue(space, 'name', w_new_value)
         return
+    w_self.deldictvalue(space, 'name')
     w_self.initialization_state |= 1
 
 def FunctionDef_get_args(space, w_self):
@@ -3229,6 +3191,7 @@
             raise
         w_self.setdictvalue(space, 'args', w_new_value)
         return
+    w_self.deldictvalue(space, 'args')
     w_self.initialization_state |= 2
 
 def FunctionDef_get_body(space, w_self):
@@ -3315,6 +3278,7 @@
             raise
         w_self.setdictvalue(space, 'name', w_new_value)
         return
+    w_self.deldictvalue(space, 'name')
     w_self.initialization_state |= 1
 
 def ClassDef_get_bases(space, w_self):
@@ -3420,6 +3384,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 1
 
 _Return_field_unroller = unrolling_iterable(['value'])
@@ -3526,6 +3491,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 2
 
 _Assign_field_unroller = unrolling_iterable(['targets', 'value'])
@@ -3573,6 +3539,7 @@
             raise
         w_self.setdictvalue(space, 'target', w_new_value)
         return
+    w_self.deldictvalue(space, 'target')
     w_self.initialization_state |= 1
 
 def AugAssign_get_op(space, w_self):
@@ -3590,13 +3557,13 @@
     try:
         obj = space.interp_w(operator, w_new_value)
         w_self.op = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'op', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'op', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'op', w_new_value)
     w_self.initialization_state |= 2
 
 def AugAssign_get_value(space, w_self):
@@ -3618,6 +3585,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 4
 
 _AugAssign_field_unroller = unrolling_iterable(['target', 'op', 'value'])
@@ -3665,6 +3633,7 @@
             raise
         w_self.setdictvalue(space, 'dest', w_new_value)
         return
+    w_self.deldictvalue(space, 'dest')
     w_self.initialization_state |= 1
 
 def Print_get_values(space, w_self):
@@ -3704,6 +3673,7 @@
             raise
         w_self.setdictvalue(space, 'nl', w_new_value)
         return
+    w_self.deldictvalue(space, 'nl')
     w_self.initialization_state |= 4
 
 _Print_field_unroller = unrolling_iterable(['dest', 'values', 'nl'])
@@ -3752,6 +3722,7 @@
             raise
         w_self.setdictvalue(space, 'target', w_new_value)
         return
+    w_self.deldictvalue(space, 'target')
     w_self.initialization_state |= 1
 
 def For_get_iter(space, w_self):
@@ -3773,6 +3744,7 @@
             raise
         w_self.setdictvalue(space, 'iter', w_new_value)
         return
+    w_self.deldictvalue(space, 'iter')
     w_self.initialization_state |= 2
 
 def For_get_body(space, w_self):
@@ -3859,6 +3831,7 @@
             raise
         w_self.setdictvalue(space, 'test', w_new_value)
         return
+    w_self.deldictvalue(space, 'test')
     w_self.initialization_state |= 1
 
 def While_get_body(space, w_self):
@@ -3944,6 +3917,7 @@
             raise
         w_self.setdictvalue(space, 'test', w_new_value)
         return
+    w_self.deldictvalue(space, 'test')
     w_self.initialization_state |= 1
 
 def If_get_body(space, w_self):
@@ -4029,6 +4003,7 @@
             raise
         w_self.setdictvalue(space, 'context_expr', w_new_value)
         return
+    w_self.deldictvalue(space, 'context_expr')
     w_self.initialization_state |= 1
 
 def With_get_optional_vars(space, w_self):
@@ -4050,6 +4025,7 @@
             raise
         w_self.setdictvalue(space, 'optional_vars', w_new_value)
         return
+    w_self.deldictvalue(space, 'optional_vars')
     w_self.initialization_state |= 2
 
 def With_get_body(space, w_self):
@@ -4116,6 +4092,7 @@
             raise
         w_self.setdictvalue(space, 'type', w_new_value)
         return
+    w_self.deldictvalue(space, 'type')
     w_self.initialization_state |= 1
 
 def Raise_get_inst(space, w_self):
@@ -4137,6 +4114,7 @@
             raise
         w_self.setdictvalue(space, 'inst', w_new_value)
         return
+    w_self.deldictvalue(space, 'inst')
     w_self.initialization_state |= 2
 
 def Raise_get_tback(space, w_self):
@@ -4158,6 +4136,7 @@
             raise
         w_self.setdictvalue(space, 'tback', w_new_value)
         return
+    w_self.deldictvalue(space, 'tback')
     w_self.initialization_state |= 4
 
 _Raise_field_unroller = unrolling_iterable(['type', 'inst', 'tback'])
@@ -4351,6 +4330,7 @@
             raise
         w_self.setdictvalue(space, 'test', w_new_value)
         return
+    w_self.deldictvalue(space, 'test')
     w_self.initialization_state |= 1
 
 def Assert_get_msg(space, w_self):
@@ -4372,6 +4352,7 @@
             raise
         w_self.setdictvalue(space, 'msg', w_new_value)
         return
+    w_self.deldictvalue(space, 'msg')
     w_self.initialization_state |= 2
 
 _Assert_field_unroller = unrolling_iterable(['test', 'msg'])
@@ -4464,6 +4445,7 @@
             raise
         w_self.setdictvalue(space, 'module', w_new_value)
         return
+    w_self.deldictvalue(space, 'module')
     w_self.initialization_state |= 1
 
 def ImportFrom_get_names(space, w_self):
@@ -4503,6 +4485,7 @@
             raise
         w_self.setdictvalue(space, 'level', w_new_value)
         return
+    w_self.deldictvalue(space, 'level')
     w_self.initialization_state |= 4
 
 _ImportFrom_field_unroller = unrolling_iterable(['module', 'names', 'level'])
@@ -4551,6 +4534,7 @@
             raise
         w_self.setdictvalue(space, 'body', w_new_value)
         return
+    w_self.deldictvalue(space, 'body')
     w_self.initialization_state |= 1
 
 def Exec_get_globals(space, w_self):
@@ -4572,6 +4556,7 @@
             raise
         w_self.setdictvalue(space, 'globals', w_new_value)
         return
+    w_self.deldictvalue(space, 'globals')
     w_self.initialization_state |= 2
 
 def Exec_get_locals(space, w_self):
@@ -4593,6 +4578,7 @@
             raise
         w_self.setdictvalue(space, 'locals', w_new_value)
         return
+    w_self.deldictvalue(space, 'locals')
     w_self.initialization_state |= 4
 
 _Exec_field_unroller = unrolling_iterable(['body', 'globals', 'locals'])
@@ -4683,6 +4669,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 1
 
 _Expr_field_unroller = unrolling_iterable(['value'])
@@ -4779,6 +4766,7 @@
             raise
         w_self.setdictvalue(space, 'lineno', w_new_value)
         return
+    w_self.deldictvalue(space, 'lineno')
     w_self.initialization_state |= w_self._lineno_mask
 
 def expr_get_col_offset(space, w_self):
@@ -4800,6 +4788,7 @@
             raise
         w_self.setdictvalue(space, 'col_offset', w_new_value)
         return
+    w_self.deldictvalue(space, 'col_offset')
     w_self.initialization_state |= w_self._col_offset_mask
 
 expr.typedef = typedef.TypeDef("expr",
@@ -4826,13 +4815,13 @@
     try:
         obj = space.interp_w(boolop, w_new_value)
         w_self.op = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'op', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'op', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'op', w_new_value)
     w_self.initialization_state |= 1
 
 def BoolOp_get_values(space, w_self):
@@ -4898,6 +4887,7 @@
             raise
         w_self.setdictvalue(space, 'left', w_new_value)
         return
+    w_self.deldictvalue(space, 'left')
     w_self.initialization_state |= 1
 
 def BinOp_get_op(space, w_self):
@@ -4915,13 +4905,13 @@
     try:
         obj = space.interp_w(operator, w_new_value)
         w_self.op = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'op', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'op', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'op', w_new_value)
     w_self.initialization_state |= 2
 
 def BinOp_get_right(space, w_self):
@@ -4943,6 +4933,7 @@
             raise
         w_self.setdictvalue(space, 'right', w_new_value)
         return
+    w_self.deldictvalue(space, 'right')
     w_self.initialization_state |= 4
 
 _BinOp_field_unroller = unrolling_iterable(['left', 'op', 'right'])
@@ -4986,13 +4977,13 @@
     try:
         obj = space.interp_w(unaryop, w_new_value)
         w_self.op = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'op', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'op', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'op', w_new_value)
     w_self.initialization_state |= 1
 
 def UnaryOp_get_operand(space, w_self):
@@ -5014,6 +5005,7 @@
             raise
         w_self.setdictvalue(space, 'operand', w_new_value)
         return
+    w_self.deldictvalue(space, 'operand')
     w_self.initialization_state |= 2
 
 _UnaryOp_field_unroller = unrolling_iterable(['op', 'operand'])
@@ -5060,6 +5052,7 @@
             raise
         w_self.setdictvalue(space, 'args', w_new_value)
         return
+    w_self.deldictvalue(space, 'args')
     w_self.initialization_state |= 1
 
 def Lambda_get_body(space, w_self):
@@ -5081,6 +5074,7 @@
             raise
         w_self.setdictvalue(space, 'body', w_new_value)
         return
+    w_self.deldictvalue(space, 'body')
     w_self.initialization_state |= 2
 
 _Lambda_field_unroller = unrolling_iterable(['args', 'body'])
@@ -5127,6 +5121,7 @@
             raise
         w_self.setdictvalue(space, 'test', w_new_value)
         return
+    w_self.deldictvalue(space, 'test')
     w_self.initialization_state |= 1
 
 def IfExp_get_body(space, w_self):
@@ -5148,6 +5143,7 @@
             raise
         w_self.setdictvalue(space, 'body', w_new_value)
         return
+    w_self.deldictvalue(space, 'body')
     w_self.initialization_state |= 2
 
 def IfExp_get_orelse(space, w_self):
@@ -5169,6 +5165,7 @@
             raise
         w_self.setdictvalue(space, 'orelse', w_new_value)
         return
+    w_self.deldictvalue(space, 'orelse')
     w_self.initialization_state |= 4
 
 _IfExp_field_unroller = unrolling_iterable(['test', 'body', 'orelse'])
@@ -5322,6 +5319,7 @@
             raise
         w_self.setdictvalue(space, 'elt', w_new_value)
         return
+    w_self.deldictvalue(space, 'elt')
     w_self.initialization_state |= 1
 
 def ListComp_get_generators(space, w_self):
@@ -5387,6 +5385,7 @@
             raise
         w_self.setdictvalue(space, 'elt', w_new_value)
         return
+    w_self.deldictvalue(space, 'elt')
     w_self.initialization_state |= 1
 
 def SetComp_get_generators(space, w_self):
@@ -5452,6 +5451,7 @@
             raise
         w_self.setdictvalue(space, 'key', w_new_value)
         return
+    w_self.deldictvalue(space, 'key')
     w_self.initialization_state |= 1
 
 def DictComp_get_value(space, w_self):
@@ -5473,6 +5473,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 2
 
 def DictComp_get_generators(space, w_self):
@@ -5539,6 +5540,7 @@
             raise
         w_self.setdictvalue(space, 'elt', w_new_value)
         return
+    w_self.deldictvalue(space, 'elt')
     w_self.initialization_state |= 1
 
 def GeneratorExp_get_generators(space, w_self):
@@ -5604,6 +5606,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 1
 
 _Yield_field_unroller = unrolling_iterable(['value'])
@@ -5649,6 +5652,7 @@
             raise
         w_self.setdictvalue(space, 'left', w_new_value)
         return
+    w_self.deldictvalue(space, 'left')
     w_self.initialization_state |= 1
 
 def Compare_get_ops(space, w_self):
@@ -5734,6 +5738,7 @@
             raise
         w_self.setdictvalue(space, 'func', w_new_value)
         return
+    w_self.deldictvalue(space, 'func')
     w_self.initialization_state |= 1
 
 def Call_get_args(space, w_self):
@@ -5791,6 +5796,7 @@
             raise
         w_self.setdictvalue(space, 'starargs', w_new_value)
         return
+    w_self.deldictvalue(space, 'starargs')
     w_self.initialization_state |= 8
 
 def Call_get_kwargs(space, w_self):
@@ -5812,6 +5818,7 @@
             raise
         w_self.setdictvalue(space, 'kwargs', w_new_value)
         return
+    w_self.deldictvalue(space, 'kwargs')
     w_self.initialization_state |= 16
 
 _Call_field_unroller = unrolling_iterable(['func', 'args', 'keywords', 'starargs', 'kwargs'])
@@ -5863,6 +5870,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 1
 
 _Repr_field_unroller = unrolling_iterable(['value'])
@@ -5908,6 +5916,7 @@
             raise
         w_self.setdictvalue(space, 'n', w_new_value)
         return
+    w_self.deldictvalue(space, 'n')
     w_self.initialization_state |= 1
 
 _Num_field_unroller = unrolling_iterable(['n'])
@@ -5953,6 +5962,7 @@
             raise
         w_self.setdictvalue(space, 's', w_new_value)
         return
+    w_self.deldictvalue(space, 's')
     w_self.initialization_state |= 1
 
 _Str_field_unroller = unrolling_iterable(['s'])
@@ -5998,6 +6008,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 1
 
 def Attribute_get_attr(space, w_self):
@@ -6019,6 +6030,7 @@
             raise
         w_self.setdictvalue(space, 'attr', w_new_value)
         return
+    w_self.deldictvalue(space, 'attr')
     w_self.initialization_state |= 2
 
 def Attribute_get_ctx(space, w_self):
@@ -6036,13 +6048,13 @@
     try:
         obj = space.interp_w(expr_context, w_new_value)
         w_self.ctx = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'ctx', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'ctx', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'ctx', w_new_value)
     w_self.initialization_state |= 4
 
 _Attribute_field_unroller = unrolling_iterable(['value', 'attr', 'ctx'])
@@ -6090,6 +6102,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 1
 
 def Subscript_get_slice(space, w_self):
@@ -6111,6 +6124,7 @@
             raise
         w_self.setdictvalue(space, 'slice', w_new_value)
         return
+    w_self.deldictvalue(space, 'slice')
     w_self.initialization_state |= 2
 
 def Subscript_get_ctx(space, w_self):
@@ -6128,13 +6142,13 @@
     try:
         obj = space.interp_w(expr_context, w_new_value)
         w_self.ctx = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'ctx', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'ctx', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'ctx', w_new_value)
     w_self.initialization_state |= 4
 
 _Subscript_field_unroller = unrolling_iterable(['value', 'slice', 'ctx'])
@@ -6182,6 +6196,7 @@
             raise
         w_self.setdictvalue(space, 'id', w_new_value)
         return
+    w_self.deldictvalue(space, 'id')
     w_self.initialization_state |= 1
 
 def Name_get_ctx(space, w_self):
@@ -6199,13 +6214,13 @@
     try:
         obj = space.interp_w(expr_context, w_new_value)
         w_self.ctx = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'ctx', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'ctx', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'ctx', w_new_value)
     w_self.initialization_state |= 2
 
 _Name_field_unroller = unrolling_iterable(['id', 'ctx'])
@@ -6266,13 +6281,13 @@
     try:
         obj = space.interp_w(expr_context, w_new_value)
         w_self.ctx = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'ctx', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'ctx', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'ctx', w_new_value)
     w_self.initialization_state |= 2
 
 _List_field_unroller = unrolling_iterable(['elts', 'ctx'])
@@ -6334,13 +6349,13 @@
     try:
         obj = space.interp_w(expr_context, w_new_value)
         w_self.ctx = obj.to_simple_int(space)
-        # need to save the original object too
-        w_self.setdictvalue(space, 'ctx', w_new_value)
     except OperationError, e:
         if not e.match(space, space.w_TypeError):
             raise
         w_self.setdictvalue(space, 'ctx', w_new_value)
         return
+    # need to save the original object too
+    w_self.setdictvalue(space, 'ctx', w_new_value)
     w_self.initialization_state |= 2
 
 _Tuple_field_unroller = unrolling_iterable(['elts', 'ctx'])
@@ -6388,6 +6403,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 1
 
 _Const_field_unroller = unrolling_iterable(['value'])
@@ -6506,6 +6522,7 @@
             raise
         w_self.setdictvalue(space, 'lower', w_new_value)
         return
+    w_self.deldictvalue(space, 'lower')
     w_self.initialization_state |= 1
 
 def Slice_get_upper(space, w_self):
@@ -6527,6 +6544,7 @@
             raise
         w_self.setdictvalue(space, 'upper', w_new_value)
         return
+    w_self.deldictvalue(space, 'upper')
     w_self.initialization_state |= 2
 
 def Slice_get_step(space, w_self):
@@ -6548,6 +6566,7 @@
             raise
         w_self.setdictvalue(space, 'step', w_new_value)
         return
+    w_self.deldictvalue(space, 'step')
     w_self.initialization_state |= 4
 
 _Slice_field_unroller = unrolling_iterable(['lower', 'upper', 'step'])
@@ -6638,6 +6657,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 1
 
 _Index_field_unroller = unrolling_iterable(['value'])
@@ -6907,6 +6927,7 @@
             raise
         w_self.setdictvalue(space, 'target', w_new_value)
         return
+    w_self.deldictvalue(space, 'target')
     w_self.initialization_state |= 1
 
 def comprehension_get_iter(space, w_self):
@@ -6928,6 +6949,7 @@
             raise
         w_self.setdictvalue(space, 'iter', w_new_value)
         return
+    w_self.deldictvalue(space, 'iter')
     w_self.initialization_state |= 2
 
 def comprehension_get_ifs(space, w_self):
@@ -6994,6 +7016,7 @@
             raise
         w_self.setdictvalue(space, 'lineno', w_new_value)
         return
+    w_self.deldictvalue(space, 'lineno')
     w_self.initialization_state |= w_self._lineno_mask
 
 def excepthandler_get_col_offset(space, w_self):
@@ -7015,6 +7038,7 @@
             raise
         w_self.setdictvalue(space, 'col_offset', w_new_value)
         return
+    w_self.deldictvalue(space, 'col_offset')
     w_self.initialization_state |= w_self._col_offset_mask
 
 excepthandler.typedef = typedef.TypeDef("excepthandler",
@@ -7045,6 +7069,7 @@
             raise
         w_self.setdictvalue(space, 'type', w_new_value)
         return
+    w_self.deldictvalue(space, 'type')
     w_self.initialization_state |= 1
 
 def ExceptHandler_get_name(space, w_self):
@@ -7066,6 +7091,7 @@
             raise
         w_self.setdictvalue(space, 'name', w_new_value)
         return
+    w_self.deldictvalue(space, 'name')
     w_self.initialization_state |= 2
 
 def ExceptHandler_get_body(space, w_self):
@@ -7153,6 +7179,7 @@
             raise
         w_self.setdictvalue(space, 'vararg', w_new_value)
         return
+    w_self.deldictvalue(space, 'vararg')
     w_self.initialization_state |= 2
 
 def arguments_get_kwarg(space, w_self):
@@ -7177,6 +7204,7 @@
             raise
         w_self.setdictvalue(space, 'kwarg', w_new_value)
         return
+    w_self.deldictvalue(space, 'kwarg')
     w_self.initialization_state |= 4
 
 def arguments_get_defaults(space, w_self):
@@ -7245,6 +7273,7 @@
             raise
         w_self.setdictvalue(space, 'arg', w_new_value)
         return
+    w_self.deldictvalue(space, 'arg')
     w_self.initialization_state |= 1
 
 def keyword_get_value(space, w_self):
@@ -7266,6 +7295,7 @@
             raise
         w_self.setdictvalue(space, 'value', w_new_value)
         return
+    w_self.deldictvalue(space, 'value')
     w_self.initialization_state |= 2
 
 _keyword_field_unroller = unrolling_iterable(['arg', 'value'])
@@ -7312,6 +7342,7 @@
             raise
         w_self.setdictvalue(space, 'name', w_new_value)
         return
+    w_self.deldictvalue(space, 'name')
     w_self.initialization_state |= 1
 
 def alias_get_asname(space, w_self):
@@ -7336,6 +7367,7 @@
             raise
         w_self.setdictvalue(space, 'asname', w_new_value)
         return
+    w_self.deldictvalue(space, 'asname')
     w_self.initialization_state |= 2
 
 _alias_field_unroller = unrolling_iterable(['name', 'asname'])
diff --git a/pypy/interpreter/astcompiler/codegen.py b/pypy/interpreter/astcompiler/codegen.py
--- a/pypy/interpreter/astcompiler/codegen.py
+++ b/pypy/interpreter/astcompiler/codegen.py
@@ -295,15 +295,11 @@
     def visit_FunctionDef(self, func):
         self.update_position(func.lineno, True)
         # Load decorators first, but apply them after the function is created.
-        if func.decorator_list:
-            self.visit_sequence(func.decorator_list)
+        self.visit_sequence(func.decorator_list)
         args = func.args
         assert isinstance(args, ast.arguments)
-        if args.defaults:
-            self.visit_sequence(args.defaults)
-            num_defaults = len(args.defaults)
-        else:
-            num_defaults = 0
+        self.visit_sequence(args.defaults)
+        num_defaults = len(args.defaults) if args.defaults is not None else 0
         code = self.sub_scope(FunctionCodeGenerator, func.name, func,
                               func.lineno)
         self._make_function(code, num_defaults)
@@ -317,24 +313,17 @@
         self.update_position(lam.lineno)
         args = lam.args
         assert isinstance(args, ast.arguments)
-        if args.defaults:
-            self.visit_sequence(args.defaults)
-            default_count = len(args.defaults)
-        else:
-            default_count = 0
+        self.visit_sequence(args.defaults)
+        default_count = len(args.defaults) if args.defaults is not None else 0
         code = self.sub_scope(LambdaCodeGenerator, "<lambda>", lam, lam.lineno)
         self._make_function(code, default_count)
 
     def visit_ClassDef(self, cls):
         self.update_position(cls.lineno, True)
-        if cls.decorator_list:
-            self.visit_sequence(cls.decorator_list)
+        self.visit_sequence(cls.decorator_list)
         self.load_const(self.space.wrap(cls.name))
-        if cls.bases:
-            bases_count = len(cls.bases)
-            self.visit_sequence(cls.bases)
-        else:
-            bases_count = 0
+        self.visit_sequence(cls.bases)
+        bases_count = len(cls.bases) if cls.bases is not None else 0
         self.emit_op_arg(ops.BUILD_TUPLE, bases_count)
         code = self.sub_scope(ClassCodeGenerator, cls.name, cls, cls.lineno)
         self._make_function(code, 0)
@@ -446,8 +435,7 @@
         end = self.new_block()
         test_constant = if_.test.as_constant_truth(self.space)
         if test_constant == optimize.CONST_FALSE:
-            if if_.orelse:
-                self.visit_sequence(if_.orelse)
+            self.visit_sequence(if_.orelse)
         elif test_constant == optimize.CONST_TRUE:
             self.visit_sequence(if_.body)
         else:
@@ -515,16 +503,14 @@
         self.use_next_block(cleanup)
         self.emit_op(ops.POP_BLOCK)
         self.pop_frame_block(F_BLOCK_LOOP, start)
-        if fr.orelse:
-            self.visit_sequence(fr.orelse)
+        self.visit_sequence(fr.orelse)
         self.use_next_block(end)
 
     def visit_While(self, wh):
         self.update_position(wh.lineno, True)
         test_constant = wh.test.as_constant_truth(self.space)
         if test_constant == optimize.CONST_FALSE:
-            if wh.orelse:
-                self.visit_sequence(wh.orelse)
+            self.visit_sequence(wh.orelse)
         else:
             end = self.new_block()
             anchor = None
@@ -544,8 +530,7 @@
                 self.use_next_block(anchor)
                 self.emit_op(ops.POP_BLOCK)
             self.pop_frame_block(F_BLOCK_LOOP, loop)
-            if wh.orelse:
-                self.visit_sequence(wh.orelse)
+            self.visit_sequence(wh.orelse)
             self.use_next_block(end)
 
     def visit_TryExcept(self, te):
@@ -581,8 +566,7 @@
             self.use_next_block(next_except)
         self.emit_op(ops.END_FINALLY)
         self.use_next_block(otherwise)
-        if te.orelse:
-            self.visit_sequence(te.orelse)
+        self.visit_sequence(te.orelse)
         self.use_next_block(end)
 
     def visit_TryFinally(self, tf):
@@ -893,27 +877,19 @@
 
     def visit_Tuple(self, tup):
         self.update_position(tup.lineno)
-        if tup.elts:
-            elt_count = len(tup.elts)
-        else:
-            elt_count = 0
+        elt_count = len(tup.elts) if tup.elts is not None else 0
         if tup.ctx == ast.Store:
             self.emit_op_arg(ops.UNPACK_SEQUENCE, elt_count)
-        if elt_count:
-            self.visit_sequence(tup.elts)
+        self.visit_sequence(tup.elts)
         if tup.ctx == ast.Load:
             self.emit_op_arg(ops.BUILD_TUPLE, elt_count)
 
     def visit_List(self, l):
         self.update_position(l.lineno)
-        if l.elts:
-            elt_count = len(l.elts)
-        else:
-            elt_count = 0
+        elt_count = len(l.elts) if l.elts is not None else 0
         if l.ctx == ast.Store:
             self.emit_op_arg(ops.UNPACK_SEQUENCE, elt_count)
-        if elt_count:
-            self.visit_sequence(l.elts)
+        self.visit_sequence(l.elts)
         if l.ctx == ast.Load:
             self.emit_op_arg(ops.BUILD_LIST, elt_count)
 
@@ -944,11 +920,9 @@
         if self._optimize_method_call(call):
             return
         call.func.walkabout(self)
-        arg = 0
+        arg = len(call.args) if call.args is not None else 0
         call_type = 0
-        if call.args:
-            arg = len(call.args)
-            self.visit_sequence(call.args)
+        self.visit_sequence(call.args)
         if call.keywords:
             self.visit_sequence(call.keywords)
             arg |= len(call.keywords) << 8
@@ -984,16 +958,10 @@
         assert isinstance(attr_lookup, ast.Attribute)
         attr_lookup.value.walkabout(self)
         self.emit_op_name(ops.LOOKUP_METHOD, self.names, attr_lookup.attr)
-        if call.args:
-            self.visit_sequence(call.args)
-            arg_count = len(call.args)
-        else:
-            arg_count = 0
-        if call.keywords:
-            self.visit_sequence(call.keywords)
-            kwarg_count = len(call.keywords)
-        else:
-            kwarg_count = 0
+        self.visit_sequence(call.args)
+        arg_count = len(call.args) if call.args is not None else 0
+        self.visit_sequence(call.keywords)
+        kwarg_count = len(call.keywords) if call.keywords is not None else 0
         self.emit_op_arg(ops.CALL_METHOD, (kwarg_count << 8) | arg_count)
         return True
 
@@ -1251,7 +1219,10 @@
     def _compile(self, func):
         assert isinstance(func, ast.FunctionDef)
         # If there's a docstring, store it as the first constant.
-        doc_expr = self.possible_docstring(func.body[0])
+        if func.body:
+            doc_expr = self.possible_docstring(func.body[0])
+        else:
+            doc_expr = None
         if doc_expr is not None:
             self.add_const(doc_expr.s)
             start = 1
@@ -1263,8 +1234,9 @@
         if args.args:
             self._handle_nested_args(args.args)
             self.argcount = len(args.args)
-        for i in range(start, len(func.body)):
-            func.body[i].walkabout(self)
+        if func.body:
+            for i in range(start, len(func.body)):
+                func.body[i].walkabout(self)
 
 
 class LambdaCodeGenerator(AbstractFunctionCodeGenerator):
diff --git a/pypy/interpreter/astcompiler/misc.py b/pypy/interpreter/astcompiler/misc.py
--- a/pypy/interpreter/astcompiler/misc.py
+++ b/pypy/interpreter/astcompiler/misc.py
@@ -27,9 +27,10 @@
     _emit_syntax_warning(space, w_msg, w_filename, w_lineno, w_offset)
 
 
-def parse_future(tree):
+def parse_future(tree, feature_flags):
     future_lineno = 0
     future_column = 0
+    flags = 0
     have_docstring = False
     body = None
     if isinstance(tree, ast.Module):
@@ -37,7 +38,7 @@
     elif isinstance(tree, ast.Interactive):
         body = tree.body
     if body is None:
-        return 0, 0
+        return 0, 0, 0
     for stmt in body:
         if isinstance(stmt, ast.Expr) and isinstance(stmt.value, ast.Str):
             if have_docstring:
@@ -48,11 +49,16 @@
             if stmt.module == "__future__":
                 future_lineno = stmt.lineno
                 future_column = stmt.col_offset
+                for alias in stmt.names:
+                    assert isinstance(alias, ast.alias)
+                    # If this is an invalid flag, it will be caught later in
+                    # codegen.py.
+                    flags |= feature_flags.get(alias.name, 0)
             else:
                 break
         else:
             break
-    return future_lineno, future_column
+    return flags, future_lineno, future_column
 
 
 class ForbiddenNameAssignment(Exception):
diff --git a/pypy/interpreter/astcompiler/symtable.py b/pypy/interpreter/astcompiler/symtable.py
--- a/pypy/interpreter/astcompiler/symtable.py
+++ b/pypy/interpreter/astcompiler/symtable.py
@@ -356,10 +356,8 @@
         # Function defaults and decorators happen in the outer scope.
         args = func.args
         assert isinstance(args, ast.arguments)
-        if args.defaults:
-            self.visit_sequence(args.defaults)
-        if func.decorator_list:
-            self.visit_sequence(func.decorator_list)
+        self.visit_sequence(args.defaults)
+        self.visit_sequence(func.decorator_list)
         new_scope = FunctionScope(func.name, func.lineno, func.col_offset)
         self.push_scope(new_scope, func)
         func.args.walkabout(self)
@@ -372,10 +370,8 @@
 
     def visit_ClassDef(self, clsdef):
         self.note_symbol(clsdef.name, SYM_ASSIGNED)
-        if clsdef.bases:
-            self.visit_sequence(clsdef.bases)
-        if clsdef.decorator_list:
-            self.visit_sequence(clsdef.decorator_list)
+        self.visit_sequence(clsdef.bases)
+        self.visit_sequence(clsdef.decorator_list)
         self.push_scope(ClassScope(clsdef), clsdef)
         self.visit_sequence(clsdef.body)
         self.pop_scope()
@@ -431,8 +427,7 @@
     def visit_Lambda(self, lamb):
         args = lamb.args
         assert isinstance(args, ast.arguments)
-        if args.defaults:
-            self.visit_sequence(args.defaults)
+        self.visit_sequence(args.defaults)
         new_scope = FunctionScope("lambda", lamb.lineno, lamb.col_offset)
         self.push_scope(new_scope, lamb)
         lamb.args.walkabout(self)
@@ -447,8 +442,7 @@
         self.push_scope(new_scope, node)
         self.implicit_arg(0)
         outer.target.walkabout(self)
-        if outer.ifs:
-            self.visit_sequence(outer.ifs)
+        self.visit_sequence(outer.ifs)
         self.visit_sequence(comps[1:])
         for item in list(consider):
             item.walkabout(self)
diff --git a/pypy/interpreter/astcompiler/tools/asdl_py.py b/pypy/interpreter/astcompiler/tools/asdl_py.py
--- a/pypy/interpreter/astcompiler/tools/asdl_py.py
+++ b/pypy/interpreter/astcompiler/tools/asdl_py.py
@@ -221,8 +221,9 @@
         self.emit("class ASTVisitor(object):")
         self.emit("")
         self.emit("def visit_sequence(self, seq):", 1)
-        self.emit("for node in seq:", 2)
-        self.emit("node.walkabout(self)", 3)
+        self.emit("if seq is not None:", 2)
+        self.emit("for node in seq:", 3)
+        self.emit("node.walkabout(self)", 4)
         self.emit("")
         self.emit("def default_visitor(self, node):", 1)
         self.emit("raise NodeVisitorNotImplemented", 2)
@@ -280,15 +281,13 @@
     def visitField(self, field):
         if field.type.value not in asdl.builtin_types and \
                 field.type.value not in self.data.simple_types:
-            if field.seq or field.opt:
+            level = 2
+            template = "node.%s.walkabout(self)"
+            if field.seq:
+                template = "self.visit_sequence(node.%s)"
+            elif field.opt:
                 self.emit("if node.%s:" % (field.name,), 2)
                 level = 3
-            else:
-                level = 2
-            if field.seq:
-                template = "self.visit_sequence(node.%s)"
-            else:
-                template = "node.%s.walkabout(self)"
             self.emit(template % (field.name,), level)
             return True
         return False
@@ -446,6 +445,7 @@
         if field.seq:
             self.emit("w_self.w_%s = w_new_value" % (field.name,), 1)
         else:
+            save_original_object = False
             self.emit("try:", 1)
             if field.type.value not in asdl.builtin_types:
                 # These are always other AST nodes.
@@ -454,9 +454,7 @@
                                   (field.type,), 2)
                     self.emit("w_self.%s = obj.to_simple_int(space)" %
                               (field.name,), 2)
-                    self.emit("# need to save the original object too", 2)
-                    self.emit("w_self.setdictvalue(space, '%s', w_new_value)"
-                              % (field.name,), 2)
+                    save_original_object = True
                 else:
                     config = (field.name, field.type, repr(field.opt))
                     self.emit("w_self.%s = space.interp_w(%s, w_new_value, %s)" %
@@ -480,6 +478,12 @@
             self.emit("    w_self.setdictvalue(space, '%s', w_new_value)"
                       % (field.name,), 1)
             self.emit("    return", 1)
+            if save_original_object:
+                self.emit("# need to save the original object too", 1)
+                self.emit("w_self.setdictvalue(space, '%s', w_new_value)"
+                          % (field.name,), 1)
+            else:
+                self.emit("w_self.deldictvalue(space, '%s')" %(field.name,), 1)
         self.emit("w_self.initialization_state |= %s" % (flag,), 1)
         self.emit("")
 
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -44,11 +44,11 @@
             return True
         return False
 
-    def deldictvalue(self, space, w_name):
+    def deldictvalue(self, space, attr):
         w_dict = self.getdict(space)
         if w_dict is not None:
             try:
-                space.delitem(w_dict, w_name)
+                space.delitem(w_dict, space.wrap(attr))
                 return True
             except OperationError, ex:
                 if not ex.match(space, space.w_KeyError):
@@ -111,6 +111,9 @@
     def setslotvalue(self, index, w_val):
         raise NotImplementedError
 
+    def delslotvalue(self, index):
+        raise NotImplementedError
+
     def descr_call_mismatch(self, space, opname, RequiredClass, args):
         if RequiredClass is None:
             classname = '?'
@@ -130,6 +133,9 @@
         raise operationerrfmt(space.w_TypeError,
             "cannot create weak reference to '%s' object", typename)
 
+    def delweakref(self):
+        pass
+
     def clear_all_weakrefs(self):
         """Call this at the beginning of interp-level __del__() methods
         in subclasses.  It ensures that weakrefs (if any) are cleared
@@ -143,29 +149,28 @@
             # app-level, e.g. a user-defined __del__(), and this code
             # tries to use weakrefs again, it won't reuse the broken
             # (already-cleared) weakrefs from this lifeline.
-            self.setweakref(lifeline.space, None)
+            self.delweakref()
             lifeline.clear_all_weakrefs()
 
-    __already_enqueued_for_destruction = False
+    __already_enqueued_for_destruction = ()
 
-    def _enqueue_for_destruction(self, space, call_user_del=True):
+    def enqueue_for_destruction(self, space, callback, descrname):
         """Put the object in the destructor queue of the space.
-        At a later, safe point in time, UserDelAction will use
-        space.userdel() to call the object's app-level __del__ method.
+        At a later, safe point in time, UserDelAction will call
+        callback(self).  If that raises OperationError, prints it
+        to stderr with the descrname string.
+
+        Note that 'callback' will usually need to start with:
+            assert isinstance(self, W_SpecificClass)
         """
         # this function always resurect the object, so when
         # running on top of CPython we must manually ensure that
         # we enqueue it only once
         if not we_are_translated():
-            if self.__already_enqueued_for_destruction:
+            if callback in self.__already_enqueued_for_destruction:
                 return
-            self.__already_enqueued_for_destruction = True
-        self.clear_all_weakrefs()
-        if call_user_del:
-            space.user_del_action.register_dying_object(self)
-
-    def _call_builtin_destructor(self):
-        pass     # method overridden in typedef.py
+            self.__already_enqueued_for_destruction += (callback,)
+        space.user_del_action.register_callback(self, callback, descrname)
 
     # hooks that the mapdict implementations needs:
     def _get_mapdict_map(self):
@@ -621,9 +626,9 @@
             self.default_compiler = compiler
             return compiler
 
-    def createframe(self, code, w_globals, closure=None):
+    def createframe(self, code, w_globals, outer_func=None):
         "Create an empty PyFrame suitable for this code object."
-        return self.FrameClass(self, code, w_globals, closure)
+        return self.FrameClass(self, code, w_globals, outer_func)
 
     def allocate_lock(self):
         """Return an interp-level Lock object if threads are enabled,
@@ -925,6 +930,9 @@
                 return self.w_True
         return self.w_False
 
+    def issequence_w(self, w_obj):
+        return (self.findattr(w_obj, self.wrap("__getitem__")) is not None)
+
     def isinstance_w(self, w_obj, w_type):
         return self.is_true(self.isinstance(w_obj, w_type))
 
@@ -1279,6 +1287,17 @@
                                  self.wrap("expected a 32-bit integer"))
         return value
 
+    def truncatedint(self, w_obj):
+        # Like space.gateway_int_w(), but return the integer truncated
+        # instead of raising OverflowError.  For obscure cases only.
+        try:
+            return self.int_w(w_obj)
+        except OperationError, e:
+            if not e.match(self, self.w_OverflowError):
+                raise
+            from pypy.rlib.rarithmetic import intmask
+            return intmask(self.bigint_w(w_obj).uintmask())
+
     def c_filedescriptor_w(self, w_fd):
         # This is only used sometimes in CPython, e.g. for os.fsync() but
         # not os.close().  It's likely designed for 'select'.  It's irregular
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -189,7 +189,7 @@
             if space.is_w(w_value, space.w_None):
                 # raise Type: we assume we have to instantiate Type
                 w_value = space.call_function(w_type)
-                w_type = space.exception_getclass(w_value)
+                w_type = self._exception_getclass(space, w_value)
             else:
                 w_valuetype = space.exception_getclass(w_value)
                 if space.exception_issubclass_w(w_valuetype, w_type):
@@ -204,18 +204,12 @@
                     else:
                         # raise Type, X: assume X is the constructor argument
                         w_value = space.call_function(w_type, w_value)
-                    w_type = space.exception_getclass(w_value)
+                    w_type = self._exception_getclass(space, w_value)
 
         else:
             # the only case left here is (inst, None), from a 'raise inst'.
             w_inst = w_type
-            w_instclass = space.exception_getclass(w_inst)
-            if not space.exception_is_valid_class_w(w_instclass):
-                instclassname = w_instclass.getname(space)
-                msg = ("exceptions must be old-style classes or derived "
-                       "from BaseException, not %s")
-                raise operationerrfmt(space.w_TypeError, msg, instclassname)
-
+            w_instclass = self._exception_getclass(space, w_inst)
             if not space.is_w(w_value, space.w_None):
                 raise OperationError(space.w_TypeError,
                                      space.wrap("instance exception may not "
@@ -226,6 +220,15 @@
         self.w_type   = w_type
         self._w_value = w_value
 
+    def _exception_getclass(self, space, w_inst):
+        w_type = space.exception_getclass(w_inst)
+        if not space.exception_is_valid_class_w(w_type):
+            typename = w_type.getname(space)
+            msg = ("exceptions must be old-style classes or derived "
+                   "from BaseException, not %s")
+            raise operationerrfmt(space.w_TypeError, msg, typename)
+        return w_type
+
     def write_unraisable(self, space, where, w_object=None):
         if w_object is None:
             objrepr = ''
diff --git a/pypy/interpreter/executioncontext.py b/pypy/interpreter/executioncontext.py
--- a/pypy/interpreter/executioncontext.py
+++ b/pypy/interpreter/executioncontext.py
@@ -484,44 +484,31 @@
 
     def __init__(self, space):
         AsyncAction.__init__(self, space)
-        self.dying_objects_w = []
-        self.weakrefs_w = []
+        self.dying_objects = []
         self.finalizers_lock_count = 0
 
-    def register_dying_object(self, w_obj):
-        self.dying_objects_w.append(w_obj)
-        self.fire()
-
-    def register_weakref_callback(self, w_ref):
-        self.weakrefs_w.append(w_ref)
+    def register_callback(self, w_obj, callback, descrname):
+        self.dying_objects.append((w_obj, callback, descrname))
         self.fire()
 
     def perform(self, executioncontext, frame):
         if self.finalizers_lock_count > 0:
             return
-        # Each call to perform() first grabs the self.dying_objects_w
+        # Each call to perform() first grabs the self.dying_objects
         # and replaces it with an empty list.  We do this to try to
         # avoid too deep recursions of the kind of __del__ being called
         # while in the middle of another __del__ call.
-        pending_w = self.dying_objects_w
-        self.dying_objects_w = []
+        pending = self.dying_objects
+        self.dying_objects = []
         space = self.space
-        for i in range(len(pending_w)):
-            w_obj = pending_w[i]
-            pending_w[i] = None
+        for i in range(len(pending)):
+            w_obj, callback, descrname = pending[i]
+            pending[i] = (None, None, None)
             try:
-                space.userdel(w_obj)
+                callback(w_obj)
             except OperationError, e:
-                e.write_unraisable(space, 'method __del__ of ', w_obj)
+                e.write_unraisable(space, descrname, w_obj)
                 e.clear(space)   # break up reference cycles
-            # finally, this calls the interp-level destructor for the
-            # cases where there is both an app-level and a built-in __del__.
-            w_obj._call_builtin_destructor()
-        pending_w = self.weakrefs_w
-        self.weakrefs_w = []
-        for i in range(len(pending_w)):
-            w_ref = pending_w[i]
-            w_ref.activate_callback()
 
 class FrameTraceAction(AsyncAction):
     """An action that calls the local trace functions (w_f_trace)."""
diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py
--- a/pypy/interpreter/function.py
+++ b/pypy/interpreter/function.py
@@ -30,8 +30,9 @@
     can_change_code = True
     _immutable_fields_ = ['code?',
                           'w_func_globals?',
-                          'closure?',
-                          'defs_w?[*]']
+                          'closure?[*]',
+                          'defs_w?[*]',
+                          'name?']
 
     def __init__(self, space, code, w_globals=None, defs_w=[], closure=None,
                  forcename=None):
@@ -95,7 +96,7 @@
             assert isinstance(code, PyCode)
             if nargs < 5:
                 new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
                 for i in funccallunrolling:
                     if i < nargs:
                         new_frame.locals_stack_w[i] = args_w[i]
@@ -155,7 +156,7 @@
     def _flat_pycall(self, code, nargs, frame):
         # code is a PyCode
         new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
             new_frame.locals_stack_w[i] = w_arg
@@ -166,7 +167,7 @@
     def _flat_pycall_defaults(self, code, nargs, frame, defs_to_load):
         # code is a PyCode
         new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
             new_frame.locals_stack_w[i] = w_arg
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -64,7 +64,7 @@
                 self.visit_self(el[1], *args)
             else:
                 self.visit_function(el, *args)
-        else:
+        elif isinstance(el, type):
             for typ in self.bases_order:
                 if issubclass(el, typ):
                     visit = getattr(self, "visit__%s" % (typ.__name__,))
@@ -73,6 +73,8 @@
             else:
                 raise Exception("%s: no match for unwrap_spec element %s" % (
                     self.__class__.__name__, el))
+        else:
+            raise Exception("unable to dispatch, %s, perhaps your parameter should have started with w_?" % el)
 
     def apply_over(self, unwrap_spec, *extra):
         dispatch = self.dispatch
@@ -140,6 +142,9 @@
     def visit_c_nonnegint(self, el, app_sig):
         self.checked_space_method(el, app_sig)
 
+    def visit_truncatedint(self, el, app_sig):
+        self.checked_space_method(el, app_sig)
+
     def visit__Wrappable(self, el, app_sig):
         name = el.__name__
         argname = self.orig_arg()
@@ -257,6 +262,9 @@
     def visit_c_nonnegint(self, typ):
         self.run_args.append("space.c_nonnegint_w(%s)" % (self.scopenext(),))
 
+    def visit_truncatedint(self, typ):
+        self.run_args.append("space.truncatedint(%s)" % (self.scopenext(),))
+
     def _make_unwrap_activation_class(self, unwrap_spec, cache={}):
         try:
             key = tuple(unwrap_spec)
@@ -387,6 +395,9 @@
     def visit_c_nonnegint(self, typ):
         self.unwrap.append("space.c_nonnegint_w(%s)" % (self.nextarg(),))
 
+    def visit_truncatedint(self, typ):
+        self.unwrap.append("space.truncatedint(%s)" % (self.nextarg(),))
+
     def make_fastfunc(unwrap_spec, func):
         unwrap_info = UnwrapSpec_FastFunc_Unwrap()
         unwrap_info.apply_over(unwrap_spec)
@@ -396,11 +407,14 @@
             fastfunc = func
         else:
             # try to avoid excessive bloat
-            if func.__module__ == 'pypy.interpreter.astcompiler.ast':
+            mod = func.__module__
+            if mod is None:
+                mod = ""
+            if mod == 'pypy.interpreter.astcompiler.ast':
                 raise FastFuncNotSupported
-            if (not func.__module__.startswith('pypy.module.__builtin__') and
-                not func.__module__.startswith('pypy.module.sys') and
-                not func.__module__.startswith('pypy.module.math')):
+            if (not mod.startswith('pypy.module.__builtin__') and
+                not mod.startswith('pypy.module.sys') and
+                not mod.startswith('pypy.module.math')):
                 if not func.__name__.startswith('descr'):
                     raise FastFuncNotSupported
             d = {}
diff --git a/pypy/interpreter/generator.py b/pypy/interpreter/generator.py
--- a/pypy/interpreter/generator.py
+++ b/pypy/interpreter/generator.py
@@ -114,6 +114,7 @@
  
     def descr_close(self):
         """x.close(arg) -> raise GeneratorExit inside generator."""
+        assert isinstance(self, GeneratorIterator)
         space = self.space
         try:
             w_retval = self.throw(space.w_GeneratorExit, space.w_None,
@@ -141,22 +142,16 @@
         code_name = self.pycode.co_name
         return space.wrap(code_name)
 
-    def descr__del__(self):        
-        """
-        applevel __del__, which is called at a safe point after the
-        interp-level __del__ enqueued the object for destruction
-        """
-        self.descr_close()
-
     def __del__(self):
         # Only bother enqueuing self to raise an exception if the frame is
         # still not finished and finally or except blocks are present.
-        must_call_close = False
+        self.clear_all_weakrefs()
         if self.frame is not None:
             block = self.frame.lastblock
             while block is not None:
                 if not isinstance(block, LoopBlock):
-                    must_call_close = True
+                    self.enqueue_for_destruction(self.space,
+                                                 GeneratorIterator.descr_close,
+                                                 "interrupting generator of ")
                     break
                 block = block.previous
-        self._enqueue_for_destruction(self.space, must_call_close)
diff --git a/pypy/interpreter/miscutils.py b/pypy/interpreter/miscutils.py
--- a/pypy/interpreter/miscutils.py
+++ b/pypy/interpreter/miscutils.py
@@ -167,3 +167,7 @@
 
     def getmainthreadvalue(self):
         return self._value
+
+    def getallvalues(self):
+        return {0: self._value}
+
diff --git a/pypy/interpreter/module.py b/pypy/interpreter/module.py
--- a/pypy/interpreter/module.py
+++ b/pypy/interpreter/module.py
@@ -9,6 +9,8 @@
 class Module(Wrappable):
     """A module."""
 
+    _immutable_fields_ = ["w_dict?"]
+
     _frozen = False
 
     def __init__(self, space, w_name, w_dict=None, add_package=True):
diff --git a/pypy/interpreter/nestedscope.py b/pypy/interpreter/nestedscope.py
--- a/pypy/interpreter/nestedscope.py
+++ b/pypy/interpreter/nestedscope.py
@@ -8,7 +8,7 @@
 
 class Cell(Wrappable):
     "A simple container for a wrapped value."
-    
+
     def __init__(self, w_value=None):
         self.w_value = w_value
 
@@ -90,32 +90,33 @@
     #     variables coming from a parent function in which i'm nested
     # 'closure' is a list of Cell instances: the received free vars.
 
-    cells = None
-
     @jit.unroll_safe
-    def initialize_frame_scopes(self, closure, code):
-        super_initialize_frame_scopes(self, closure, code)
+    def initialize_frame_scopes(self, outer_func, code):
+        super_initialize_frame_scopes(self, outer_func, code)
         ncellvars = len(code.co_cellvars)
         nfreevars = len(code.co_freevars)
         if not nfreevars:
             if not ncellvars:
+                self.cells = []
                 return            # no self.cells needed - fast path
-            if closure is None:
-                closure = []
-        elif closure is None:
+        elif outer_func is None:
             space = self.space
             raise OperationError(space.w_TypeError,
                                  space.wrap("directly executed code object "
                                             "may not contain free variables"))
-        if len(closure) != nfreevars:
+        if outer_func and outer_func.closure:
+            closure_size = len(outer_func.closure)
+        else:
+            closure_size = 0
+        if closure_size != nfreevars:
             raise ValueError("code object received a closure with "
                                  "an unexpected number of free variables")
         self.cells = [None] * (ncellvars + nfreevars)
         for i in range(ncellvars):
             self.cells[i] = Cell()
         for i in range(nfreevars):
-            self.cells[i + ncellvars] = closure[i]
-    
+            self.cells[i + ncellvars] = outer_func.closure[i]
+
     def _getcells(self):
         return self.cells
 
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -198,7 +198,7 @@
 
     def funcrun(self, func, args):
         frame = self.space.createframe(self, func.w_func_globals,
-                                  func.closure)
+                                  func)
         sig = self._signature
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
@@ -211,7 +211,7 @@
 
     def funcrun_obj(self, func, w_obj, args):
         frame = self.space.createframe(self, func.w_func_globals,
-                                  func.closure)
+                                  func)
         sig = self._signature
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
diff --git a/pypy/interpreter/pycompiler.py b/pypy/interpreter/pycompiler.py
--- a/pypy/interpreter/pycompiler.py
+++ b/pypy/interpreter/pycompiler.py
@@ -119,7 +119,10 @@
             raise OperationError(self.space.w_TypeError, self.space.wrap(
                 "invalid node type"))
 
-        future_pos = misc.parse_future(node)
+        fut = misc.parse_future(node, self.future_flags.compiler_features)
+        f_flags, f_lineno, f_col = fut
+        future_pos = f_lineno, f_col
+        flags |= f_flags
         info = pyparse.CompileInfo(filename, mode, flags, future_pos)
         return self._compile_ast(node, info)
 
diff --git a/pypy/interpreter/pyframe.py b/pypy/interpreter/pyframe.py
--- a/pypy/interpreter/pyframe.py
+++ b/pypy/interpreter/pyframe.py
@@ -51,7 +51,10 @@
     is_being_profiled        = False
     escaped                  = False  # see mark_as_escaped()
 
-    def __init__(self, space, code, w_globals, closure):
+    def __init__(self, space, code, w_globals, outer_func):
+        if not we_are_translated():
+            assert type(self) in (space.FrameClass, CPythonFrame), (
+                "use space.FrameClass(), not directly PyFrame()")
         self = hint(self, access_directly=True, fresh_virtualizable=True)
         assert isinstance(code, pycode.PyCode)
         self.pycode = code
@@ -67,7 +70,7 @@
             self.builtin = space.builtin.pick_builtin(w_globals)
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
         # class bodies only have CO_NEWLOCALS.
-        self.initialize_frame_scopes(closure, code)
+        self.initialize_frame_scopes(outer_func, code)
         self.f_lineno = code.co_firstlineno
 
     def mark_as_escaped(self):
@@ -80,7 +83,7 @@
         self.escaped = True
 
     def append_block(self, block):
-        block.previous = self.lastblock
+        assert block.previous is self.lastblock
         self.lastblock = block
 
     def pop_block(self):
@@ -106,15 +109,16 @@
         while i >= 0:
             block = lst[i]
             i -= 1
-            self.append_block(block)
+            block.previous = self.lastblock
+            self.lastblock = block
 
     def get_builtin(self):
         if self.space.config.objspace.honor__builtins__:
             return self.builtin
         else:
             return self.space.builtin
-        
-    def initialize_frame_scopes(self, closure, code): 
+
+    def initialize_frame_scopes(self, outer_func, code):
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
         # class bodies only have CO_NEWLOCALS.
         # CO_NEWLOCALS: make a locals dict unless optimized is also set
@@ -381,7 +385,11 @@
         
         # do not use the instance's __init__ but the base's, because we set
         # everything like cells from here
-        PyFrame.__init__(self, space, pycode, w_globals, closure)
+        # XXX hack
+        from pypy.interpreter.function import Function
+        outer_func = Function(space, None, closure=closure,
+                             forcename="fake")
+        PyFrame.__init__(self, space, pycode, w_globals, outer_func)
         f_back = space.interp_w(PyFrame, w_f_back, can_be_None=True)
         new_frame.f_backref = jit.non_virtual_ref(f_back)
 
diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py
--- a/pypy/interpreter/pyopcode.py
+++ b/pypy/interpreter/pyopcode.py
@@ -892,32 +892,31 @@
         raise BytecodeCorruption, "old opcode, no longer in use"
 
     def SETUP_LOOP(self, offsettoend, next_instr):
-        block = LoopBlock(self, next_instr + offsettoend)
-        self.append_block(block)
+        block = LoopBlock(self, next_instr + offsettoend, self.lastblock)
+        self.lastblock = block
 
     def SETUP_EXCEPT(self, offsettoend, next_instr):
-        block = ExceptBlock(self, next_instr + offsettoend)
-        self.append_block(block)
+        block = ExceptBlock(self, next_instr + offsettoend, self.lastblock)
+        self.lastblock = block
 
     def SETUP_FINALLY(self, offsettoend, next_instr):
-        block = FinallyBlock(self, next_instr + offsettoend)
-        self.append_block(block)
+        block = FinallyBlock(self, next_instr + offsettoend, self.lastblock)
+        self.lastblock = block
 
     def SETUP_WITH(self, offsettoend, next_instr):
         w_manager = self.peekvalue()
+        w_enter = self.space.lookup(w_manager, "__enter__")
         w_descr = self.space.lookup(w_manager, "__exit__")
-        if w_descr is None:
-            raise OperationError(self.space.w_AttributeError,
-                                 self.space.wrap("__exit__"))
+        if w_enter is None or w_descr is None:
+            typename = self.space.type(w_manager).getname(self.space)
+            raise operationerrfmt(self.space.w_AttributeError,
+                "'%s' object is not a context manager"
+                " (no __enter__/__exit__ method)", typename)
         w_exit = self.space.get(w_descr, w_manager)
         self.settopvalue(w_exit)
-        w_enter = self.space.lookup(w_manager, "__enter__")
-        if w_enter is None:
-            raise OperationError(self.space.w_AttributeError,
-                                 self.space.wrap("__enter__"))
         w_result = self.space.get_and_call_function(w_enter, w_manager)
-        block = WithBlock(self, next_instr + offsettoend)
-        self.append_block(block)
+        block = WithBlock(self, next_instr + offsettoend, self.lastblock)
+        self.lastblock = block
         self.pushvalue(w_result)
 
     def WITH_CLEANUP(self, oparg, next_instr):
@@ -1248,10 +1247,10 @@
 
     _immutable_ = True
 
-    def __init__(self, frame, handlerposition):
+    def __init__(self, frame, handlerposition, previous):
         self.handlerposition = handlerposition
         self.valuestackdepth = frame.valuestackdepth
-        self.previous = None # this makes a linked list of blocks
+        self.previous = previous   # this makes a linked list of blocks
 
     def __eq__(self, other):
         return (self.__class__ is other.__class__ and
@@ -1524,10 +1523,8 @@
 
         if not isinstance(prog, codetype):
             filename = '<string>'
-            if not isinstance(prog, str):
-                if isinstance(prog, basestring):
-                    prog = str(prog)
-                elif isinstance(prog, file):
+            if not isinstance(prog, basestring):
+                if isinstance(prog, file):
                     filename = prog.name
                     prog = prog.read()
                 else:
diff --git a/pypy/interpreter/pyparser/future.py b/pypy/interpreter/pyparser/future.py
--- a/pypy/interpreter/pyparser/future.py
+++ b/pypy/interpreter/pyparser/future.py
@@ -109,25 +109,19 @@
             self.getc() == self.getc(+2)):
             self.pos += 3
             while 1: # Deal with a triple quoted docstring
-                if self.getc() == '\\':
-                    self.pos += 2
+                c = self.getc()
+                if c == '\\':
+                    self.pos += 1
+                    self._skip_next_char_from_docstring()
+                elif c != endchar:
+                    self._skip_next_char_from_docstring()
                 else:
-                    c = self.getc()
-                    if c != endchar:
-                        self.pos += 1
-                        if c == '\n':
-                            self.atbol()
-                        elif c == '\r':
-                            if self.getc() == '\n':
-                                self.pos += 1
-                                self.atbol()
-                    else:
-                        self.pos += 1
-                        if (self.getc() == endchar and
-                            self.getc(+1) == endchar):
-                            self.pos += 2
-                            self.consume_empty_line()
-                            break
+                    self.pos += 1
+                    if (self.getc() == endchar and
+                        self.getc(+1) == endchar):
+                        self.pos += 2
+                        self.consume_empty_line()
+                        break
 
         else: # Deal with a single quoted docstring
             self.pos += 1
@@ -138,17 +132,21 @@
                     self.consume_empty_line()
                     return
                 elif c == '\\':
-                    # Deal with linefeeds
-                    if self.getc() != '\r':
-                        self.pos += 1
-                    else:
-                        self.pos += 1
-                        if self.getc() == '\n':
-                            self.pos += 1
+                    self._skip_next_char_from_docstring()
                 elif c in '\r\n':
                     # Syntax error
                     return
 
+    def _skip_next_char_from_docstring(self):
+        c = self.getc()
+        self.pos += 1
+        if c == '\n':
+            self.atbol()
+        elif c == '\r':
+            if self.getc() == '\n':
+                self.pos += 1
+            self.atbol()
+
     def consume_continuation(self):
         c = self.getc()
         if c in '\n\r':
diff --git a/pypy/interpreter/pyparser/test/test_futureautomaton.py b/pypy/interpreter/pyparser/test/test_futureautomaton.py
--- a/pypy/interpreter/pyparser/test/test_futureautomaton.py
+++ b/pypy/interpreter/pyparser/test/test_futureautomaton.py
@@ -221,6 +221,14 @@
     assert f.lineno == 3
     assert f.col_offset == 0
 
+def test_lots_of_continuation_lines():
+    s = "\\\n\\\n\\\n\\\n\\\n\\\n\nfrom __future__ import with_statement\n"
+    f = run(s)
+    assert f.pos == len(s)
+    assert f.flags == fut.CO_FUTURE_WITH_STATEMENT
+    assert f.lineno == 8
+    assert f.col_offset == 0
+
 # This looks like a bug in cpython parser
 # and would require extensive modifications
 # to future.py in order to emulate the same behaviour
@@ -239,3 +247,19 @@
         raise AssertionError('IndentationError not raised')
     assert f.lineno == 2
     assert f.col_offset == 0
+
+def test_continuation_lines_in_docstring_single_quoted():
+    s = '"\\\n\\\n\\\n\\\n\\\n\\\n"\nfrom  __future__ import division\n'
+    f = run(s)
+    assert f.pos == len(s)
+    assert f.flags == fut.CO_FUTURE_DIVISION
+    assert f.lineno == 8
+    assert f.col_offset == 0
+
+def test_continuation_lines_in_docstring_triple_quoted():
+    s = '"""\\\n\\\n\\\n\\\n\\\n\\\n"""\nfrom  __future__ import division\n'
+    f = run(s)
+    assert f.pos == len(s)
+    assert f.flags == fut.CO_FUTURE_DIVISION
+    assert f.lineno == 8
+    assert f.col_offset == 0
diff --git a/pypy/interpreter/test/test_exec.py b/pypy/interpreter/test/test_exec.py
--- a/pypy/interpreter/test/test_exec.py
+++ b/pypy/interpreter/test/test_exec.py
@@ -219,3 +219,30 @@
             raise e
 
         assert res == 1
+
+    def test_exec_unicode(self):
+        # 's' is a string
+        s = "x = u'\xd0\xb9\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd'"
+        # 'u' is a unicode
+        u = s.decode('utf-8')
+        exec u
+        assert len(x) == 6
+        assert ord(x[0]) == 0x0439
+        assert ord(x[1]) == 0x0446
+        assert ord(x[2]) == 0x0443
+        assert ord(x[3]) == 0x043a
+        assert ord(x[4]) == 0x0435
+        assert ord(x[5]) == 0x043d
+
+    def test_eval_unicode(self):
+        u = "u'%s'" % unichr(0x1234)
+        v = eval(u)
+        assert v == unichr(0x1234)
+
+    def test_compile_unicode(self):
+        s = "x = u'\xd0\xb9\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd'"
+        u = s.decode('utf-8')
+        c = compile(u, '<input>', 'exec')
+        exec c
+        assert len(x) == 6
+        assert ord(x[0]) == 0x0439
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -704,7 +704,7 @@
 class TestPassThroughArguments_CALL_METHOD(TestPassThroughArguments):
 
     def setup_class(cls):
-        space = gettestobjspace(usemodules=('_stackless',), **{
+        space = gettestobjspace(usemodules=('itertools',), **{
             "objspace.opcodes.CALL_METHOD": True
             })
         cls.space = space
diff --git a/pypy/interpreter/test/test_raise.py b/pypy/interpreter/test/test_raise.py
--- a/pypy/interpreter/test/test_raise.py
+++ b/pypy/interpreter/test/test_raise.py
@@ -274,3 +274,9 @@
             pass
         except A:
             pass
+
+    def test_new_returns_bad_instance(self):
+        class MyException(Exception):
+            def __new__(cls, *args):
+                return object()
+        raises(TypeError, "raise MyException")
diff --git a/pypy/interpreter/test/test_typedef.py b/pypy/interpreter/test/test_typedef.py
--- a/pypy/interpreter/test/test_typedef.py
+++ b/pypy/interpreter/test/test_typedef.py
@@ -1,3 +1,4 @@
+import gc
 from pypy.interpreter import typedef
 from pypy.tool.udir import udir
 from pypy.interpreter.baseobjspace import Wrappable
@@ -180,6 +181,85 @@
             assert err.value.message == "'some_type' objects are unhashable"
             """)
 
+    def test_destructor(self):
+        space = self.space
+        class W_Level1(Wrappable):
+            def __init__(self, space1):
+                assert space1 is space
+            def __del__(self):
+                space.call_method(w_seen, 'append', space.wrap(1))
+        class W_Level2(Wrappable):
+            def __init__(self, space1):
+                assert space1 is space
+            def __del__(self):
+                self.enqueue_for_destruction(space, W_Level2.destructormeth,
+                                             'FOO ')
+            def destructormeth(self):
+                space.call_method(w_seen, 'append', space.wrap(2))
+        W_Level1.typedef = typedef.TypeDef(
+            'level1',
+            __new__ = typedef.generic_new_descr(W_Level1))
+        W_Level2.typedef = typedef.TypeDef(
+            'level2',
+            __new__ = typedef.generic_new_descr(W_Level2))
+        #
+        w_seen = space.newlist([])
+        W_Level1(space)
+        gc.collect(); gc.collect()
+        assert space.unwrap(w_seen) == [1]
+        #
+        w_seen = space.newlist([])
+        W_Level2(space)
+        gc.collect(); gc.collect()
+        assert space.str_w(space.repr(w_seen)) == "[]"  # not called yet
+        ec = space.getexecutioncontext()
+        self.space.user_del_action.perform(ec, None)
+        assert space.unwrap(w_seen) == [2]
+        #
+        w_seen = space.newlist([])
+        self.space.appexec([self.space.gettypeobject(W_Level1.typedef)],
+        """(level1):
+            class A3(level1):
+                pass
+            A3()
+        """)
+        gc.collect(); gc.collect()
+        assert space.unwrap(w_seen) == [1]
+        #
+        w_seen = space.newlist([])
+        self.space.appexec([self.space.gettypeobject(W_Level1.typedef),
+                            w_seen],
+        """(level1, seen):
+            class A4(level1):
+                def __del__(self):
+                    seen.append(4)
+            A4()
+        """)
+        gc.collect(); gc.collect()
+        assert space.unwrap(w_seen) == [4, 1]
+        #
+        w_seen = space.newlist([])
+        self.space.appexec([self.space.gettypeobject(W_Level2.typedef)],
+        """(level2):
+            class A5(level2):
+                pass
+            A5()
+        """)
+        gc.collect(); gc.collect()
+        assert space.unwrap(w_seen) == [2]
+        #
+        w_seen = space.newlist([])
+        self.space.appexec([self.space.gettypeobject(W_Level2.typedef),
+                            w_seen],
+        """(level2, seen):
+            class A6(level2):
+                def __del__(self):
+                    seen.append(6)
+            A6()
+        """)
+        gc.collect(); gc.collect()
+        assert space.unwrap(w_seen) == [6, 2]
+
 
 class AppTestTypeDef:
 
diff --git a/pypy/interpreter/typedef.py b/pypy/interpreter/typedef.py
--- a/pypy/interpreter/typedef.py
+++ b/pypy/interpreter/typedef.py
@@ -23,7 +23,7 @@
             self.hasdict     |= __base.hasdict
             self.weakrefable |= __base.weakrefable
         self.rawdict = {}
-        self.acceptable_as_base_class = True
+        self.acceptable_as_base_class = '__new__' in rawdict
         self.applevel_subclasses_base = None
         # xxx used by faking
         self.fakedcpytype = None
@@ -228,21 +228,26 @@
                 return self._lifeline_
             def setweakref(self, space, weakreflifeline):
                 self._lifeline_ = weakreflifeline
+            def delweakref(self):
+                self._lifeline_ = None
         add(Proto)
 
     if "del" in features:
+        parent_destructor = getattr(supercls, '__del__', None)
+        def call_parent_del(self):
+            assert isinstance(self, subcls)
+            parent_destructor(self)
+        def call_applevel_del(self):
+            assert isinstance(self, subcls)
+            self.space.userdel(self)
         class Proto(object):
             def __del__(self):
-                self._enqueue_for_destruction(self.space)
-        # if the base class needs its own interp-level __del__,
-        # we override the _call_builtin_destructor() method to invoke it
-        # after the app-level destructor.
-        parent_destructor = getattr(supercls, '__del__', None)
-        if parent_destructor is not None:
-            def _call_builtin_destructor(self):
-                parent_destructor(self)
-            Proto._call_builtin_destructor = _call_builtin_destructor
-
+                self.clear_all_weakrefs()
+                self.enqueue_for_destruction(self.space, call_applevel_del,
+                                             'method __del__ of ')
+                if parent_destructor is not None:
+                    self.enqueue_for_destruction(self.space, call_parent_del,
+                                                 'internal destructor of ')
         add(Proto)
 
     if "slots" in features:
@@ -253,6 +258,11 @@
                     self.slots_w = [None] * nslots
             def setslotvalue(self, index, w_value):
                 self.slots_w[index] = w_value
+            def delslotvalue(self, index):
+                if self.slots_w[index] is None:
+                    return False
+                self.slots_w[index] = None
+                return True
             def getslotvalue(self, index):
                 return self.slots_w[index]
         add(Proto)
@@ -525,11 +535,10 @@
         """member.__delete__(obj)
         Delete the value of the slot 'member' from the given 'obj'."""
         self.typecheck(space, w_obj)
-        w_oldresult = w_obj.getslotvalue(self.index)
-        if w_oldresult is None:
+        success = w_obj.delslotvalue(self.index)
+        if not success:
             raise OperationError(space.w_AttributeError,
                                  space.wrap(self.name)) # XXX better message
-        w_obj.setslotvalue(self.index, None)
 
 Member.typedef = TypeDef(
     "member_descriptor",
@@ -630,9 +639,12 @@
         return self._lifeline_
     def setweakref(self, space, weakreflifeline):
         self._lifeline_ = weakreflifeline
+    def delweakref(self):
+        self._lifeline_ = None
     cls._lifeline_ = None
     cls.getweakref = getweakref
     cls.setweakref = setweakref
+    cls.delweakref = delweakref
     return weakref_descr
 
 
@@ -858,8 +870,6 @@
                             descrmismatch='close'),
     __iter__   = interp2app(GeneratorIterator.descr__iter__,
                             descrmismatch='__iter__'),
-    __del__    = interp2app(GeneratorIterator.descr__del__,
-                            descrmismatch='__del__'),
     gi_running = interp_attrproperty('running', cls=GeneratorIterator),
     gi_frame   = GetSetProperty(GeneratorIterator.descr_gi_frame),
     gi_code    = GetSetProperty(GeneratorIterator.descr_gi_code),
diff --git a/pypy/jit/backend/llgraph/llimpl.py b/pypy/jit/backend/llgraph/llimpl.py
--- a/pypy/jit/backend/llgraph/llimpl.py
+++ b/pypy/jit/backend/llgraph/llimpl.py
@@ -57,6 +57,12 @@
     else:
         return LLSupport.from_rstr(s)
 
+FLOAT_ARRAY_TP = lltype.Ptr(lltype.Array(lltype.Float, hints={"nolength": True}))
+def maybe_uncast(TP, array):
+    if array._TYPE.TO._hints.get("uncast_on_llgraph"):
+        array = rffi.cast(TP, array)
+    return array
+
 # a list of argtypes of all operations - couldn't find any and it's
 # very useful.  Note however that the table is half-broken here and
 # there, in ways that are sometimes a bit hard to fix; that's why
@@ -1071,13 +1077,15 @@
         return heaptracker.adr2int(llmemory.cast_ptr_to_adr(x))
     if TP == llmemory.Address:
         return heaptracker.adr2int(x)
+    if TP is lltype.SingleFloat:
+        return longlong.singlefloat2int(x)
     return lltype.cast_primitive(lltype.Signed, x)
 
 def cast_from_int(TYPE, x):
     if isinstance(TYPE, lltype.Ptr):
         if isinstance(x, (int, long, llmemory.AddressAsInt)):
             x = llmemory.cast_int_to_adr(x)
-        if TYPE is rffi.VOIDP:
+        if TYPE is rffi.VOIDP or TYPE.TO._hints.get("uncast_on_llgraph"):
             # assume that we want a "C-style" cast, without typechecking the value
             return rffi.cast(TYPE, x)
         return llmemory.cast_adr_to_ptr(x, TYPE)
@@ -1086,6 +1094,9 @@
             x = llmemory.cast_int_to_adr(x)
         assert lltype.typeOf(x) == llmemory.Address
         return x
+    elif TYPE is lltype.SingleFloat:
+        assert lltype.typeOf(x) is lltype.Signed
+        return longlong.int2singlefloat(x)
     else:
         if lltype.typeOf(x) == llmemory.Address:
             x = heaptracker.adr2int(x)
@@ -1140,6 +1151,7 @@
     del _future_values[:]
 
 def set_future_value_int(index, value):
+    assert lltype.typeOf(value) is lltype.Signed
     set_future_value_ref(index, value)
 
 def set_future_value_float(index, value):
@@ -1323,8 +1335,8 @@
     return cast_to_floatstorage(array.getitem(index))
 
 def do_getarrayitem_raw_float(array, index):
-    array = array.adr.ptr._obj
-    return cast_to_floatstorage(array.getitem(index))
+    array = maybe_uncast(FLOAT_ARRAY_TP, array.adr.ptr)
+    return cast_to_floatstorage(array._obj.getitem(index))
 
 def do_getarrayitem_gc_ptr(array, index):
     array = array._obj.container
@@ -1386,8 +1398,9 @@
     newvalue = cast_from_floatstorage(ITEMTYPE, newvalue)
     array.setitem(index, newvalue)
 
+
 def do_setarrayitem_raw_float(array, index, newvalue):
-    array = array.adr.ptr
+    array = maybe_uncast(FLOAT_ARRAY_TP, array.adr.ptr)
     ITEMTYPE = lltype.typeOf(array).TO.OF
     newvalue = cast_from_floatstorage(ITEMTYPE, newvalue)
     array._obj.setitem(index, newvalue)
@@ -1488,6 +1501,7 @@
     'i': lltype.Signed,
     'f': lltype.Float,
     'L': lltype.SignedLongLong,
+    'S': lltype.SingleFloat,
     'v': lltype.Void,
     }
 
diff --git a/pypy/jit/backend/llgraph/runner.py b/pypy/jit/backend/llgraph/runner.py
--- a/pypy/jit/backend/llgraph/runner.py
+++ b/pypy/jit/backend/llgraph/runner.py
@@ -25,13 +25,14 @@
 class Descr(history.AbstractDescr):
 
     def __init__(self, ofs, typeinfo, extrainfo=None, name=None,
-                 arg_types=None, count_fields_if_immut=-1):
+                 arg_types=None, count_fields_if_immut=-1, ffi_flags=0):
         self.ofs = ofs
         self.typeinfo = typeinfo
         self.extrainfo = extrainfo
         self.name = name
         self.arg_types = arg_types
         self.count_fields_if_immut = count_fields_if_immut
+        self.ffi_flags = ffi_flags
 
     def get_arg_types(self):
         return self.arg_types
@@ -67,6 +68,9 @@
     def count_fields_if_immutable(self):
         return self.count_fields_if_immut
 
+    def get_ffi_flags(self):
+        return self.ffi_flags
+
     def __lt__(self, other):
         raise TypeError("cannot use comparison on Descrs")
     def __le__(self, other):
@@ -91,6 +95,7 @@
 class BaseCPU(model.AbstractCPU):
     supports_floats = True
     supports_longlong = llimpl.IS_32_BIT
+    supports_singlefloats = True
 
     def __init__(self, rtyper, stats=None, opts=None,
                  translate_support_code=False,
@@ -113,14 +118,14 @@
         return False
 
     def getdescr(self, ofs, typeinfo='?', extrainfo=None, name=None,
-                 arg_types=None, count_fields_if_immut=-1):
+                 arg_types=None, count_fields_if_immut=-1, ffi_flags=0):
         key = (ofs, typeinfo, extrainfo, name, arg_types,
-               count_fields_if_immut)
+               count_fields_if_immut, ffi_flags)
         try:
             return self._descrs[key]
         except KeyError:
             descr = Descr(ofs, typeinfo, extrainfo, name, arg_types,
-                          count_fields_if_immut)
+                          count_fields_if_immut, ffi_flags)
             self._descrs[key] = descr
             return descr
 
@@ -311,7 +316,7 @@
         token = history.getkind(getattr(S, fieldname))
         return self.getdescr(ofs, token[0], name=fieldname)
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         arg_types = []
         for ARG in ARGS:
             token = history.getkind(ARG)
@@ -325,16 +330,21 @@
         return self.getdescr(0, token[0], extrainfo=extrainfo,
                              arg_types=''.join(arg_types))
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo, ffi_flags):
         from pypy.jit.backend.llsupport.ffisupport import get_ffi_type_kind
+        from pypy.jit.backend.llsupport.ffisupport import UnsupportedKind
         arg_types = []
-        for arg in ffi_args:
-            kind = get_ffi_type_kind(arg)
-            if kind != history.VOID:
-                arg_types.append(kind)
-        reskind = get_ffi_type_kind(ffi_result)
+        try:
+            for arg in ffi_args:
+                kind = get_ffi_type_kind(self, arg)
+                if kind != history.VOID:
+                    arg_types.append(kind)
+            reskind = get_ffi_type_kind(self, ffi_result)
+        except UnsupportedKind:
+            return None
         return self.getdescr(0, reskind, extrainfo=extrainfo,
-                             arg_types=''.join(arg_types))
+                             arg_types=''.join(arg_types),
+                             ffi_flags=ffi_flags)
 
 
     def grab_exc_value(self):
@@ -517,7 +527,7 @@
         return FieldDescr.new(T1, fieldname)
 
     @staticmethod
-    def calldescrof(FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(FUNC, ARGS, RESULT, extrainfo):
         return StaticMethDescr.new(FUNC, ARGS, RESULT, extrainfo)
 
     @staticmethod
diff --git a/pypy/jit/backend/llgraph/test/test_llgraph.py b/pypy/jit/backend/llgraph/test/test_llgraph.py
--- a/pypy/jit/backend/llgraph/test/test_llgraph.py
+++ b/pypy/jit/backend/llgraph/test/test_llgraph.py
@@ -19,6 +19,9 @@
     def setup_method(self, _):
         self.cpu = self.cpu_type(None)
 
+    def test_memoryerror(self):
+        py.test.skip("does not make much sense on the llgraph backend")
+
 
 def test_cast_adr_to_int_and_back():
     X = lltype.Struct('X', ('foo', lltype.Signed))
diff --git a/pypy/jit/backend/llsupport/descr.py b/pypy/jit/backend/llsupport/descr.py
--- a/pypy/jit/backend/llsupport/descr.py
+++ b/pypy/jit/backend/llsupport/descr.py
@@ -260,10 +260,12 @@
     _clsname = ''
     loop_token = None
     arg_classes = ''     # <-- annotation hack
+    ffi_flags = 0
 
-    def __init__(self, arg_classes, extrainfo=None):
+    def __init__(self, arg_classes, extrainfo=None, ffi_flags=0):
         self.arg_classes = arg_classes    # string of "r" and "i" (ref/int)
         self.extrainfo = extrainfo
+        self.ffi_flags = ffi_flags
 
     def __repr__(self):
         res = '%s(%s)' % (self.__class__.__name__, self.arg_classes)
@@ -284,6 +286,13 @@
     def get_extra_info(self):
         return self.extrainfo
 
+    def get_ffi_flags(self):
+        return self.ffi_flags
+
+    def get_call_conv(self):
+        from pypy.rlib.clibffi import get_call_conv
+        return get_call_conv(self.ffi_flags, True)
+
     def get_arg_types(self):
         return self.arg_classes
 
@@ -303,6 +312,8 @@
                 c = 'f'
             elif c == 'f' and longlong.supports_longlong:
                 return 'longlong.getrealfloat(%s)' % (process('L'),)
+            elif c == 'S':
+                return 'longlong.int2singlefloat(%s)' % (process('i'),)
             arg = 'args_%s[%d]' % (c, seen[c])
             seen[c] += 1
             return arg
@@ -318,6 +329,8 @@
                 return lltype.Void
             elif arg == 'L':
                 return lltype.SignedLongLong
+            elif arg == 'S':
+                return lltype.SingleFloat
             else:
                 raise AssertionError(arg)
 
@@ -334,6 +347,8 @@
             result = 'rffi.cast(lltype.SignedLongLong, res)'
         elif self.get_return_type() == history.VOID:
             result = 'None'
+        elif self.get_return_type() == 'S':
+            result = 'longlong.singlefloat2int(res)'
         else:
             assert 0
         source = py.code.Source("""
@@ -344,14 +359,15 @@
         """ % locals())
         ARGS = [TYPE(arg) for arg in self.arg_classes]
         FUNC = lltype.FuncType(ARGS, RESULT)
-        d = locals().copy()
-        d.update(globals())
+        d = globals().copy()
+        d.update(locals())
         exec source.compile() in d
         self.call_stub = d['call_stub']
 
     def verify_types(self, args_i, args_r, args_f, return_type):
         assert self._return_type in return_type
-        assert self.arg_classes.count('i') == len(args_i or ())
+        assert (self.arg_classes.count('i') +
+                self.arg_classes.count('S')) == len(args_i or ())
         assert self.arg_classes.count('r') == len(args_r or ())
         assert (self.arg_classes.count('f') +
                 self.arg_classes.count('L')) == len(args_f or ())
@@ -384,8 +400,8 @@
     """
     _clsname = 'DynamicIntCallDescr'
 
-    def __init__(self, arg_classes, result_size, result_sign, extrainfo=None):
-        BaseIntCallDescr.__init__(self, arg_classes, extrainfo)
+    def __init__(self, arg_classes, result_size, result_sign, extrainfo=None, ffi_flags=0):
+        BaseIntCallDescr.__init__(self, arg_classes, extrainfo, ffi_flags)
         assert isinstance(result_sign, bool)
         self._result_size = chr(result_size)
         self._result_sign = result_sign
@@ -428,23 +444,39 @@
     def get_result_size(self, translate_support_code):
         return 0
 
+_SingleFloatCallDescr = None   # built lazily
+
 def getCallDescrClass(RESULT):
     if RESULT is lltype.Void:
         return VoidCallDescr
     if RESULT is lltype.Float:
         return FloatCallDescr
+    if RESULT is lltype.SingleFloat:
+        global _SingleFloatCallDescr
+        if _SingleFloatCallDescr is None:
+            assert rffi.sizeof(rffi.UINT) == rffi.sizeof(RESULT)
+            class SingleFloatCallDescr(getCallDescrClass(rffi.UINT)):
+                _clsname = 'SingleFloatCallDescr'
+                _return_type = 'S'
+            _SingleFloatCallDescr = SingleFloatCallDescr
+        return _SingleFloatCallDescr
     if is_longlong(RESULT):
         return LongLongCallDescr
     return getDescrClass(RESULT, BaseIntCallDescr, GcPtrCallDescr,
                          NonGcPtrCallDescr, 'Call', 'get_result_size',
                          Ellipsis,  # <= floatattrname should not be used here
                          '_is_result_signed')
+getCallDescrClass._annspecialcase_ = 'specialize:memo'
 
 def get_call_descr(gccache, ARGS, RESULT, extrainfo=None):
     arg_classes = []
     for ARG in ARGS:
         kind = getkind(ARG)
-        if   kind == 'int': arg_classes.append('i')
+        if   kind == 'int':
+            if ARG is lltype.SingleFloat:
+                arg_classes.append('S')
+            else:
+                arg_classes.append('i')
         elif kind == 'ref': arg_classes.append('r')
         elif kind == 'float':
             if is_longlong(ARG):
@@ -476,6 +508,9 @@
             return GcPtrDescr
         else:
             return NonGcPtrDescr
+    if TYPE is lltype.SingleFloat:
+        assert rffi.sizeof(rffi.UINT) == rffi.sizeof(TYPE)
+        TYPE = rffi.UINT
     try:
         return _cache[nameprefix, TYPE]
     except KeyError:
diff --git a/pypy/jit/backend/llsupport/ffisupport.py b/pypy/jit/backend/llsupport/ffisupport.py
--- a/pypy/jit/backend/llsupport/ffisupport.py
+++ b/pypy/jit/backend/llsupport/ffisupport.py
@@ -1,41 +1,58 @@
 from pypy.rlib.rarithmetic import intmask
 from pypy.jit.metainterp import history
-from pypy.jit.backend.llsupport.descr import DynamicIntCallDescr, NonGcPtrCallDescr,\
-    FloatCallDescr, VoidCallDescr
+from pypy.rpython.lltypesystem import rffi
+from pypy.jit.backend.llsupport.descr import (
+    DynamicIntCallDescr, NonGcPtrCallDescr, FloatCallDescr, VoidCallDescr,
+    LongLongCallDescr, getCallDescrClass)
 
 class UnsupportedKind(Exception):
     pass
 
-def get_call_descr_dynamic(ffi_args, ffi_result, extrainfo=None):
+def get_call_descr_dynamic(cpu, ffi_args, ffi_result, extrainfo=None, ffi_flags=0):
     """Get a call descr: the types of result and args are represented by
     rlib.libffi.types.*"""
     try:
-        reskind = get_ffi_type_kind(ffi_result)
-        argkinds = [get_ffi_type_kind(arg) for arg in ffi_args]
+        reskind = get_ffi_type_kind(cpu, ffi_result)
+        argkinds = [get_ffi_type_kind(cpu, arg) for arg in ffi_args]
     except UnsupportedKind:
-        return None # ??
+        return None
     arg_classes = ''.join(argkinds)
     if reskind == history.INT:
         size = intmask(ffi_result.c_size)
         signed = is_ffi_type_signed(ffi_result)
-        return DynamicIntCallDescr(arg_classes, size, signed, extrainfo)
+        return DynamicIntCallDescr(arg_classes, size, signed, extrainfo,
+                                   ffi_flags=ffi_flags)
     elif reskind == history.REF:
-        return  NonGcPtrCallDescr(arg_classes, extrainfo)
+        return  NonGcPtrCallDescr(arg_classes, extrainfo,
+                                  ffi_flags=ffi_flags)
     elif reskind == history.FLOAT:
-        return FloatCallDescr(arg_classes, extrainfo)
+        return FloatCallDescr(arg_classes, extrainfo,
+                              ffi_flags=ffi_flags)
     elif reskind == history.VOID:
-        return VoidCallDescr(arg_classes, extrainfo)
+        return VoidCallDescr(arg_classes, extrainfo,
+                             ffi_flags=ffi_flags)
+    elif reskind == 'L':
+        return LongLongCallDescr(arg_classes, extrainfo,
+                                 ffi_flags=ffi_flags)
+    elif reskind == 'S':
+        SingleFloatCallDescr = getCallDescrClass(rffi.FLOAT)
+        return SingleFloatCallDescr(arg_classes, extrainfo,
+                                    ffi_flags=ffi_flags)
     assert False
 
-def get_ffi_type_kind(ffi_type):
+def get_ffi_type_kind(cpu, ffi_type):
     from pypy.rlib.libffi import types
     kind = types.getkind(ffi_type)
     if kind == 'i' or kind == 'u':
         return history.INT
-    elif kind == 'f':
+    elif cpu.supports_floats and kind == 'f':
         return history.FLOAT
     elif kind == 'v':
         return history.VOID
+    elif cpu.supports_longlong and (kind == 'I' or kind == 'U'):     # longlong
+        return 'L'
+    elif cpu.supports_singlefloats and kind == 's':    # singlefloat
+        return 'S'
     raise UnsupportedKind("Unsupported kind '%s'" % kind)
 
 def is_ffi_type_signed(ffi_type):
diff --git a/pypy/jit/backend/llsupport/gc.py b/pypy/jit/backend/llsupport/gc.py
--- a/pypy/jit/backend/llsupport/gc.py
+++ b/pypy/jit/backend/llsupport/gc.py
@@ -366,36 +366,92 @@
 
     def add_jit2gc_hooks(self, jit2gc):
         #
-        def collect_jit_stack_root(callback, gc, addr):
-            if addr.signed[0] != GcRootMap_shadowstack.MARKER:
-                # common case
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                return WORD
-            else:
-                # case of a MARKER followed by an assembler stack frame
-                follow_stack_frame_of_assembler(callback, gc, addr)
-                return 2 * WORD
+        # ---------------
+        # This is used to enumerate the shadowstack in the presence
+        # of the JIT.  It is also used by the stacklet support in
+        # rlib/_stacklet_shadowstack.  That's why it is written as
+        # an iterator that can also be used with a custom_trace.
         #
-        def follow_stack_frame_of_assembler(callback, gc, addr):
-            frame_addr = addr.signed[1]
-            addr = llmemory.cast_int_to_adr(frame_addr + self.force_index_ofs)
-            force_index = addr.signed[0]
-            if force_index < 0:
-                force_index = ~force_index
-            callshape = self._callshapes[force_index]
-            n = 0
-            while True:
-                offset = rffi.cast(lltype.Signed, callshape[n])
-                if offset == 0:
-                    break
-                addr = llmemory.cast_int_to_adr(frame_addr + offset)
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                n += 1
+        class RootIterator:
+            _alloc_flavor_ = "raw"
+
+            def next(iself, gc, next, range_highest):
+                # Return the "next" valid GC object' address.  This usually
+                # means just returning "next", until we reach "range_highest",
+                # except that we are skipping NULLs.  If "next" contains a
+                # MARKER instead, then we go into JIT-frame-lookup mode.
+                #
+                while True:
+                    #
+                    # If we are not iterating right now in a JIT frame
+                    if iself.frame_addr == 0:
+                        #
+                        # Look for the next shadowstack address that
+                        # contains a valid pointer
+                        while next != range_highest:
+                            if next.signed[0] == self.MARKER:
+                                break
+                            if gc.points_to_valid_gc_object(next):
+                                return next
+                            next += llmemory.sizeof(llmemory.Address)
+                        else:
+                            return llmemory.NULL     # done
+                        #
+                        # It's a JIT frame.  Save away 'next' for later, and
+                        # go into JIT-frame-exploring mode.
+                        next += llmemory.sizeof(llmemory.Address)
+                        frame_addr = next.signed[0]
+                        iself.saved_next = next
+                        iself.frame_addr = frame_addr
+                        addr = llmemory.cast_int_to_adr(frame_addr +
+                                                        self.force_index_ofs)
+                        addr = iself.translateptr(iself.context, addr)
+                        force_index = addr.signed[0]
+                        if force_index < 0:
+                            force_index = ~force_index
+                        # NB: the next line reads a still-alive _callshapes,
+                        # because we ensure that just before we called this
+                        # piece of assembler, we put on the (same) stack a
+                        # pointer to a loop_token that keeps the force_index
+                        # alive.
+                        callshape = self._callshapes[force_index]
+                    else:
+                        # Continuing to explore this JIT frame
+                        callshape = iself.callshape
+                    #
+                    # 'callshape' points to the next INT of the callshape.
+                    # If it's zero we are done with the JIT frame.
+                    while rffi.cast(lltype.Signed, callshape[0]) != 0:
+                        #
+                        # Non-zero: it's an offset inside the JIT frame.
+                        # Read it and increment 'callshape'.
+                        offset = rffi.cast(lltype.Signed, callshape[0])
+                        callshape = lltype.direct_ptradd(callshape, 1)
+                        addr = llmemory.cast_int_to_adr(iself.frame_addr +
+                                                        offset)
+                        addr = iself.translateptr(iself.context, addr)
+                        if gc.points_to_valid_gc_object(addr):
+                            #
+                            # The JIT frame contains a valid GC pointer at
+                            # this address (as opposed to NULL).  Save
+                            # 'callshape' for the next call, and return the
+                            # address.
+                            iself.callshape = callshape
+                            return addr
+                    #
+                    # Restore 'prev' and loop back to the start.
+                    iself.frame_addr = 0
+                    next = iself.saved_next
+                    next += llmemory.sizeof(llmemory.Address)
+
+        # ---------------
         #
+        root_iterator = RootIterator()
+        root_iterator.frame_addr = 0
+        root_iterator.context = llmemory.NULL
+        root_iterator.translateptr = lambda context, addr: addr
         jit2gc.update({
-            'rootstackhook': collect_jit_stack_root,
+            'root_iterator': root_iterator,
             })
 
     def initialize(self):
@@ -453,21 +509,33 @@
 
 class WriteBarrierDescr(AbstractDescr):
     def __init__(self, gc_ll_descr):
+        GCClass = gc_ll_descr.GCClass
         self.llop1 = gc_ll_descr.llop1
         self.WB_FUNCPTR = gc_ll_descr.WB_FUNCPTR
         self.WB_ARRAY_FUNCPTR = gc_ll_descr.WB_ARRAY_FUNCPTR
-        self.fielddescr_tid = get_field_descr(gc_ll_descr,
-                                              gc_ll_descr.GCClass.HDR, 'tid')
-        self.jit_wb_if_flag = gc_ll_descr.GCClass.JIT_WB_IF_FLAG
-        # if convenient for the backend, we also compute the info about
+        self.fielddescr_tid = get_field_descr(gc_ll_descr, GCClass.HDR, 'tid')
+        #
+        self.jit_wb_if_flag = GCClass.JIT_WB_IF_FLAG
+        self.jit_wb_if_flag_byteofs, self.jit_wb_if_flag_singlebyte = (
+            self.extract_flag_byte(self.jit_wb_if_flag))
+        #
+        if hasattr(GCClass, 'JIT_WB_CARDS_SET'):
+            self.jit_wb_cards_set = GCClass.JIT_WB_CARDS_SET
+            self.jit_wb_card_page_shift = GCClass.JIT_WB_CARD_PAGE_SHIFT
+            self.jit_wb_cards_set_byteofs, self.jit_wb_cards_set_singlebyte = (
+                self.extract_flag_byte(self.jit_wb_cards_set))
+        else:
+            self.jit_wb_cards_set = 0
+
+    def extract_flag_byte(self, flag_word):
+        # if convenient for the backend, we compute the info about
         # the flag as (byte-offset, single-byte-flag).
         import struct
-        value = struct.pack("l", self.jit_wb_if_flag)
+        value = struct.pack("l", flag_word)
         assert value.count('\x00') == len(value) - 1    # only one byte is != 0
         i = 0
         while value[i] == '\x00': i += 1
-        self.jit_wb_if_flag_byteofs = i
-        self.jit_wb_if_flag_singlebyte = struct.unpack('b', value[i])[0]
+        return (i, struct.unpack('b', value[i])[0])
 
     def get_write_barrier_fn(self, cpu):
         llop1 = self.llop1
@@ -532,18 +600,19 @@
         assert self.GCClass.inline_simple_malloc
         assert self.GCClass.inline_simple_malloc_varsize
 
-        # make a malloc function, with three arguments
+        # make a malloc function, with two arguments
         def malloc_basic(size, tid):
             type_id = llop.extract_ushort(llgroup.HALFWORD, tid)
             has_finalizer = bool(tid & (1<<llgroup.HALFSHIFT))
             check_typeid(type_id)
-            try:
-                res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                                      type_id, size, True,
-                                                      has_finalizer, False)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                res = lltype.nullptr(llmemory.GCREF.TO)
+            res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
+                                                  type_id, size,
+                                                  has_finalizer, False)
+            # In case the operation above failed, we are returning NULL
+            # from this function to assembler.  There is also an RPython
+            # exception set, typically MemoryError; but it's easier and
+            # faster to check for the NULL return value, as done by
+            # translator/exceptiontransform.py.
             #llop.debug_print(lltype.Void, "\tmalloc_basic", size, type_id,
             #                 "-->", res)
             return res
@@ -559,14 +628,10 @@
         def malloc_array(itemsize, tid, num_elem):
             type_id = llop.extract_ushort(llgroup.HALFWORD, tid)
             check_typeid(type_id)
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    type_id, num_elem, self.array_basesize, itemsize,
-                    self.array_length_ofs, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                type_id, num_elem, self.array_basesize, itemsize,
+                self.array_length_ofs)
         self.malloc_array = malloc_array
         self.GC_MALLOC_ARRAY = lltype.Ptr(lltype.FuncType(
             [lltype.Signed] * 3, llmemory.GCREF))
@@ -579,23 +644,15 @@
         unicode_type_id = self.layoutbuilder.get_type_id(rstr.UNICODE)
         #
         def malloc_str(length):
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    str_type_id, length, str_basesize, str_itemsize,
-                    str_ofs_length, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                str_type_id, length, str_basesize, str_itemsize,
+                str_ofs_length)
         def malloc_unicode(length):
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    unicode_type_id, length, unicode_basesize,unicode_itemsize,
-                    unicode_ofs_length, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                unicode_type_id, length, unicode_basesize,unicode_itemsize,
+                unicode_ofs_length)
         self.malloc_str = malloc_str
         self.malloc_unicode = malloc_unicode
         self.GC_MALLOC_STR_UNICODE = lltype.Ptr(lltype.FuncType(
@@ -616,16 +673,12 @@
             if self.DEBUG:
                 random_usage_of_xmm_registers()
             assert size >= self.minimal_size_in_nursery
-            try:
-                # NB. although we call do_malloc_fixedsize_clear() here,
-                # it's a bit of a hack because we set tid to 0 and may
-                # also use it to allocate varsized objects.  The tid
-                # and possibly the length are both set afterward.
-                gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                            0, size, True, False, False)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return 0
+            # NB. although we call do_malloc_fixedsize_clear() here,
+            # it's a bit of a hack because we set tid to 0 and may
+            # also use it to allocate varsized objects.  The tid
+            # and possibly the length are both set afterward.
+            gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
+                                        0, size, False, False)
             return rffi.cast(lltype.Signed, gcref)
         self.malloc_slowpath = malloc_slowpath
         self.MALLOC_SLOWPATH = lltype.FuncType([lltype.Signed], lltype.Signed)
diff --git a/pypy/jit/backend/llsupport/llmodel.py b/pypy/jit/backend/llsupport/llmodel.py
--- a/pypy/jit/backend/llsupport/llmodel.py
+++ b/pypy/jit/backend/llsupport/llmodel.py
@@ -254,13 +254,13 @@
         return ofs, size, sign
     unpack_arraydescr_size._always_inline_ = True
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         return get_call_descr(self.gc_ll_descr, ARGS, RESULT, extrainfo)
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo, ffi_flags):
         from pypy.jit.backend.llsupport import ffisupport
-        return ffisupport.get_call_descr_dynamic(ffi_args, ffi_result,
-                                                 extrainfo)
+        return ffisupport.get_call_descr_dynamic(self, ffi_args, ffi_result,
+                                                 extrainfo, ffi_flags)
 
     def get_overflow_error(self):
         ovf_vtable = self.cast_adr_to_int(self._ovf_error_vtable)
@@ -499,7 +499,7 @@
     def bh_call_i(self, func, calldescr, args_i, args_r, args_f):
         assert isinstance(calldescr, BaseIntCallDescr)
         if not we_are_translated():
-            calldescr.verify_types(args_i, args_r, args_f, history.INT)
+            calldescr.verify_types(args_i, args_r, args_f, history.INT + 'S')
         return calldescr.call_stub(func, args_i, args_r, args_f)
 
     def bh_call_r(self, func, calldescr, args_i, args_r, args_f):
diff --git a/pypy/jit/backend/llsupport/regalloc.py b/pypy/jit/backend/llsupport/regalloc.py
--- a/pypy/jit/backend/llsupport/regalloc.py
+++ b/pypy/jit/backend/llsupport/regalloc.py
@@ -57,11 +57,13 @@
     all_regs              = []
     no_lower_byte_regs    = []
     save_around_call_regs = []
-    
+    frame_reg             = None
+
     def __init__(self, longevity, frame_manager=None, assembler=None):
         self.free_regs = self.all_regs[:]
         self.longevity = longevity
         self.reg_bindings = {}
+        self.bindings_to_frame_reg = {}
         self.position = -1
         self.frame_manager = frame_manager
         self.assembler = assembler
@@ -218,6 +220,10 @@
         self.reg_bindings[v] = loc
         return loc
 
+    def force_allocate_frame_reg(self, v):
+        """ Allocate the new variable v in the frame register."""
+        self.bindings_to_frame_reg[v] = None
+
     def force_spill_var(self, var):
         self._sync_var(var)
         try:
@@ -236,6 +242,8 @@
         try:
             return self.reg_bindings[box]
         except KeyError:
+            if box in self.bindings_to_frame_reg:
+                return self.frame_reg
             return self.frame_manager.loc(box)
 
     def return_constant(self, v, forbidden_vars=[], selected_reg=None):
@@ -264,8 +272,9 @@
         self._check_type(v)
         if isinstance(v, Const):
             return self.return_constant(v, forbidden_vars, selected_reg)
-        
         prev_loc = self.loc(v)
+        if prev_loc is self.frame_reg and selected_reg is None:
+            return prev_loc
         loc = self.force_allocate_reg(v, forbidden_vars, selected_reg,
                                       need_lower_byte=need_lower_byte)
         if prev_loc is not loc:
diff --git a/pypy/jit/backend/llsupport/test/test_descr.py b/pypy/jit/backend/llsupport/test/test_descr.py
--- a/pypy/jit/backend/llsupport/test/test_descr.py
+++ b/pypy/jit/backend/llsupport/test/test_descr.py
@@ -52,7 +52,8 @@
     S = lltype.GcStruct('S', ('x', lltype.Char),
                              ('y', lltype.Ptr(T)),
                              ('z', lltype.Ptr(U)),
-                             ('f', lltype.Float))
+                             ('f', lltype.Float),
+                             ('s', lltype.SingleFloat))
     assert getFieldDescrClass(lltype.Ptr(T)) is GcPtrFieldDescr
     assert getFieldDescrClass(lltype.Ptr(U)) is NonGcPtrFieldDescr
     cls = getFieldDescrClass(lltype.Char)
@@ -61,6 +62,10 @@
     clsf = getFieldDescrClass(lltype.Float)
     assert clsf != cls
     assert clsf == getFieldDescrClass(lltype.Float)
+    clss = getFieldDescrClass(lltype.SingleFloat)
+    assert clss not in (cls, clsf)
+    assert clss == getFieldDescrClass(lltype.SingleFloat)
+    assert clss == getFieldDescrClass(rffi.UINT)    # for now
     #
     c0 = GcCache(False)
     c1 = GcCache(True)
@@ -72,14 +77,17 @@
         descr_y = get_field_descr(c2, S, 'y')
         descr_z = get_field_descr(c2, S, 'z')
         descr_f = get_field_descr(c2, S, 'f')
+        descr_s = get_field_descr(c2, S, 's')
         assert descr_x.__class__ is cls
         assert descr_y.__class__ is GcPtrFieldDescr
         assert descr_z.__class__ is NonGcPtrFieldDescr
         assert descr_f.__class__ is clsf
+        assert descr_s.__class__ is clss
         assert descr_x.name == 'S.x'
         assert descr_y.name == 'S.y'
         assert descr_z.name == 'S.z'
         assert descr_f.name == 'S.f'
+        assert descr_s.name == 'S.s'
         if not tsc:
             assert descr_x.offset < descr_y.offset < descr_z.offset
             assert descr_x.sort_key() < descr_y.sort_key() < descr_z.sort_key()
@@ -87,23 +95,29 @@
             assert descr_y.get_field_size(False) == rffi.sizeof(lltype.Ptr(T))
             assert descr_z.get_field_size(False) == rffi.sizeof(lltype.Ptr(U))
             assert descr_f.get_field_size(False) == rffi.sizeof(lltype.Float)
+            assert descr_s.get_field_size(False) == rffi.sizeof(
+                                                            lltype.SingleFloat)
         else:
             assert isinstance(descr_x.offset, Symbolic)
             assert isinstance(descr_y.offset, Symbolic)
             assert isinstance(descr_z.offset, Symbolic)
             assert isinstance(descr_f.offset, Symbolic)
+            assert isinstance(descr_s.offset, Symbolic)
             assert isinstance(descr_x.get_field_size(True), Symbolic)
             assert isinstance(descr_y.get_field_size(True), Symbolic)
             assert isinstance(descr_z.get_field_size(True), Symbolic)
             assert isinstance(descr_f.get_field_size(True), Symbolic)
+            assert isinstance(descr_s.get_field_size(True), Symbolic)
         assert not descr_x.is_pointer_field()
         assert     descr_y.is_pointer_field()
         assert not descr_z.is_pointer_field()
         assert not descr_f.is_pointer_field()
+        assert not descr_s.is_pointer_field()
         assert not descr_x.is_float_field()
         assert not descr_y.is_float_field()
         assert not descr_z.is_float_field()
         assert     descr_f.is_float_field()
+        assert not descr_s.is_float_field()
 
 
 def test_get_field_descr_sign():
@@ -135,6 +149,7 @@
     A2 = lltype.GcArray(lltype.Ptr(T))
     A3 = lltype.GcArray(lltype.Ptr(U))
     A4 = lltype.GcArray(lltype.Float)
+    A5 = lltype.GcArray(lltype.SingleFloat)
     assert getArrayDescrClass(A2) is GcPtrArrayDescr
     assert getArrayDescrClass(A3) is NonGcPtrArrayDescr
     cls = getArrayDescrClass(A1)
@@ -143,25 +158,32 @@
     clsf = getArrayDescrClass(A4)
     assert clsf != cls
     assert clsf == getArrayDescrClass(lltype.GcArray(lltype.Float))
+    clss = getArrayDescrClass(A5)
+    assert clss not in (clsf, cls)
+    assert clss == getArrayDescrClass(lltype.GcArray(rffi.UINT))
     #
     c0 = GcCache(False)
     descr1 = get_array_descr(c0, A1)
     descr2 = get_array_descr(c0, A2)
     descr3 = get_array_descr(c0, A3)
     descr4 = get_array_descr(c0, A4)
+    descr5 = get_array_descr(c0, A5)
     assert descr1.__class__ is cls
     assert descr2.__class__ is GcPtrArrayDescr
     assert descr3.__class__ is NonGcPtrArrayDescr
     assert descr4.__class__ is clsf
+    assert descr5.__class__ is clss
     assert descr1 == get_array_descr(c0, lltype.GcArray(lltype.Char))
     assert not descr1.is_array_of_pointers()
     assert     descr2.is_array_of_pointers()
     assert not descr3.is_array_of_pointers()
     assert not descr4.is_array_of_pointers()
+    assert not descr5.is_array_of_pointers()
     assert not descr1.is_array_of_floats()
     assert not descr2.is_array_of_floats()
     assert not descr3.is_array_of_floats()
     assert     descr4.is_array_of_floats()
+    assert not descr5.is_array_of_floats()
     #
     def get_alignment(code):
         # Retrieve default alignment for the compiler/platform
@@ -170,27 +192,33 @@
     assert descr2.get_base_size(False) == get_alignment('p')
     assert descr3.get_base_size(False) == get_alignment('p')
     assert descr4.get_base_size(False) == get_alignment('d')
+    assert descr5.get_base_size(False) == get_alignment('f')
     assert descr1.get_ofs_length(False) == 0
     assert descr2.get_ofs_length(False) == 0
     assert descr3.get_ofs_length(False) == 0
     assert descr4.get_ofs_length(False) == 0
+    assert descr5.get_ofs_length(False) == 0
     assert descr1.get_item_size(False) == rffi.sizeof(lltype.Char)
     assert descr2.get_item_size(False) == rffi.sizeof(lltype.Ptr(T))
     assert descr3.get_item_size(False) == rffi.sizeof(lltype.Ptr(U))
     assert descr4.get_item_size(False) == rffi.sizeof(lltype.Float)
+    assert descr5.get_item_size(False) == rffi.sizeof(lltype.SingleFloat)
     #
     assert isinstance(descr1.get_base_size(True), Symbolic)
     assert isinstance(descr2.get_base_size(True), Symbolic)
     assert isinstance(descr3.get_base_size(True), Symbolic)
     assert isinstance(descr4.get_base_size(True), Symbolic)
+    assert isinstance(descr5.get_base_size(True), Symbolic)
     assert isinstance(descr1.get_ofs_length(True), Symbolic)
     assert isinstance(descr2.get_ofs_length(True), Symbolic)
     assert isinstance(descr3.get_ofs_length(True), Symbolic)
     assert isinstance(descr4.get_ofs_length(True), Symbolic)
+    assert isinstance(descr5.get_ofs_length(True), Symbolic)
     assert isinstance(descr1.get_item_size(True), Symbolic)
     assert isinstance(descr2.get_item_size(True), Symbolic)
     assert isinstance(descr3.get_item_size(True), Symbolic)
     assert isinstance(descr4.get_item_size(True), Symbolic)
+    assert isinstance(descr5.get_item_size(True), Symbolic)
     CA = rffi.CArray(lltype.Signed)
     descr = get_array_descr(c0, CA)
     assert not descr.is_array_of_floats()
@@ -210,6 +238,11 @@
     assert descr.is_array_of_floats()
     assert descr.get_base_size(False) == 0
     assert descr.get_ofs_length(False) == -1
+    CA = rffi.CArray(rffi.FLOAT)
+    descr = get_array_descr(c0, CA)
+    assert not descr.is_array_of_floats()
+    assert descr.get_base_size(False) == 0
+    assert descr.get_ofs_length(False) == -1
 
 
 def test_get_array_descr_sign():
@@ -257,6 +290,11 @@
     assert descr4.get_result_size(False) == rffi.sizeof(lltype.Float)
     assert descr4.get_return_type() == history.FLOAT
     assert descr4.arg_classes == "ff"
+    #
+    descr5 = get_call_descr(c0, [lltype.SingleFloat], lltype.SingleFloat)
+    assert descr5.get_result_size(False) == rffi.sizeof(lltype.SingleFloat)
+    assert descr5.get_return_type() == "S"
+    assert descr5.arg_classes == "S"
 
 def test_get_call_descr_not_translated_longlong():
     if sys.maxint > 2147483647:
@@ -286,6 +324,11 @@
     assert isinstance(descr4.get_result_size(True), Symbolic)
     assert descr4.get_return_type() == history.FLOAT
     assert descr4.arg_classes == "ff"
+    #
+    descr5 = get_call_descr(c1, [lltype.SingleFloat], lltype.SingleFloat)
+    assert isinstance(descr5.get_result_size(True), Symbolic)
+    assert descr5.get_return_type() == "S"
+    assert descr5.arg_classes == "S"
 
 def test_call_descr_extra_info():
     c1 = GcCache(True)
@@ -345,8 +388,11 @@
     #
     descr4f = get_call_descr(c0, [lltype.Char, lltype.Ptr(S)], lltype.Float)
     assert 'FloatCallDescr' in descr4f.repr_of_descr()
+    #
+    descr5f = get_call_descr(c0, [lltype.Char], lltype.SingleFloat)
+    assert 'SingleFloatCallDescr' in descr5f.repr_of_descr()
 
-def test_call_stubs():
+def test_call_stubs_1():
     c0 = GcCache(False)
     ARGS = [lltype.Char, lltype.Signed]
     RES = lltype.Char
@@ -360,6 +406,8 @@
     res = call_stub(rffi.cast(lltype.Signed, fnptr), [1, 2], None, None)
     assert res == ord('c')
 
+def test_call_stubs_2():
+    c0 = GcCache(False)
     ARRAY = lltype.GcArray(lltype.Signed)
     ARGS = [lltype.Float, lltype.Ptr(ARRAY)]
     RES = lltype.Float
@@ -375,3 +423,27 @@
     res = descr2.call_stub(rffi.cast(lltype.Signed, fnptr),
                            [], [opaquea], [longlong.getfloatstorage(3.5)])
     assert longlong.getrealfloat(res) == 4.5
+
+def test_call_stubs_single_float():
+    from pypy.rlib.longlong2float import uint2singlefloat, singlefloat2uint
+    from pypy.rlib.rarithmetic import r_singlefloat, intmask
+    #
+    c0 = GcCache(False)
+    ARGS = [lltype.SingleFloat, lltype.SingleFloat, lltype.SingleFloat]
+    RES = lltype.SingleFloat
+
+    def f(a, b, c):
+        a = float(a)
+        b = float(b)
+        c = float(c)
+        x = a - (b / c)
+        return r_singlefloat(x)
+
+    fnptr = llhelper(lltype.Ptr(lltype.FuncType(ARGS, RES)), f)
+    descr2 = get_call_descr(c0, ARGS, RES)
+    a = intmask(singlefloat2uint(r_singlefloat(-10.0)))
+    b = intmask(singlefloat2uint(r_singlefloat(3.0)))
+    c = intmask(singlefloat2uint(r_singlefloat(2.0)))
+    res = descr2.call_stub(rffi.cast(lltype.Signed, fnptr),
+                           [a, b, c], [], [])
+    assert float(uint2singlefloat(rffi.r_uint(res))) == -11.5
diff --git a/pypy/jit/backend/llsupport/test/test_ffisupport.py b/pypy/jit/backend/llsupport/test/test_ffisupport.py
--- a/pypy/jit/backend/llsupport/test/test_ffisupport.py
+++ b/pypy/jit/backend/llsupport/test/test_ffisupport.py
@@ -1,24 +1,56 @@
 from pypy.rlib.libffi import types
-from pypy.jit.backend.llsupport.ffisupport import get_call_descr_dynamic, \
-    VoidCallDescr, DynamicIntCallDescr
-    
+from pypy.jit.codewriter.longlong import is_64_bit
+from pypy.jit.backend.llsupport.ffisupport import *
+
+
+class FakeCPU:
+    def __init__(self, supports_floats=False, supports_longlong=False,
+                 supports_singlefloats=False):
+        self.supports_floats = supports_floats
+        self.supports_longlong = supports_longlong
+        self.supports_singlefloats = supports_singlefloats
+
+
 def test_call_descr_dynamic():
+    args = [types.sint, types.pointer]
+    descr = get_call_descr_dynamic(FakeCPU(), args, types.sint, ffi_flags=42)
+    assert isinstance(descr, DynamicIntCallDescr)
+    assert descr.arg_classes == 'ii'
+    assert descr.get_ffi_flags() == 42
 
     args = [types.sint, types.double, types.pointer]
-    descr = get_call_descr_dynamic(args, types.void)
+    descr = get_call_descr_dynamic(FakeCPU(), args, types.void)
+    assert descr is None    # missing floats
+    descr = get_call_descr_dynamic(FakeCPU(supports_floats=True),
+                                   args, types.void, ffi_flags=43)
     assert isinstance(descr, VoidCallDescr)
     assert descr.arg_classes == 'ifi'
+    assert descr.get_ffi_flags() == 43
 
-    descr = get_call_descr_dynamic([], types.sint8)
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.sint8)
     assert isinstance(descr, DynamicIntCallDescr)
     assert descr.get_result_size(False) == 1
     assert descr.is_result_signed() == True
 
-    descr = get_call_descr_dynamic([], types.uint8)
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.uint8)
     assert isinstance(descr, DynamicIntCallDescr)
     assert descr.get_result_size(False) == 1
     assert descr.is_result_signed() == False
 
-    descr = get_call_descr_dynamic([], types.float)
-    assert descr is None # single floats are not supported so far
-    
+    if not is_64_bit:
+        descr = get_call_descr_dynamic(FakeCPU(), [], types.slonglong)
+        assert descr is None   # missing longlongs
+        descr = get_call_descr_dynamic(FakeCPU(supports_longlong=True),
+                                       [], types.slonglong, ffi_flags=43)
+        assert isinstance(descr, LongLongCallDescr)
+        assert descr.get_ffi_flags() == 43
+    else:
+        assert types.slonglong is types.slong
+
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.float)
+    assert descr is None   # missing singlefloats
+    descr = get_call_descr_dynamic(FakeCPU(supports_singlefloats=True),
+                                   [], types.float, ffi_flags=44)
+    SingleFloatCallDescr = getCallDescrClass(rffi.FLOAT)
+    assert isinstance(descr, SingleFloatCallDescr)
+    assert descr.get_ffi_flags() == 44
diff --git a/pypy/jit/backend/llsupport/test/test_gc.py b/pypy/jit/backend/llsupport/test/test_gc.py
--- a/pypy/jit/backend/llsupport/test/test_gc.py
+++ b/pypy/jit/backend/llsupport/test/test_gc.py
@@ -246,9 +246,8 @@
     def __init__(self):
         self.record = []
 
-    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size, can_collect,
+    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size,
                                   has_finalizer, contains_weakptr):
-        assert can_collect
         assert not contains_weakptr
         p = llmemory.raw_malloc(size)
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
@@ -258,8 +257,7 @@
         return p
 
     def do_malloc_varsize_clear(self, RESTYPE, type_id, length, size,
-                                itemsize, offset_to_length, can_collect):
-        assert can_collect
+                                itemsize, offset_to_length):
         p = llmemory.raw_malloc(size + itemsize * length)
         (p + offset_to_length).signed[0] = length
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
diff --git a/pypy/jit/backend/llvm/llvm_rffi.py b/pypy/jit/backend/llvm/llvm_rffi.py
--- a/pypy/jit/backend/llvm/llvm_rffi.py
+++ b/pypy/jit/backend/llvm/llvm_rffi.py
@@ -3,7 +3,7 @@
 from pypy.rpython.lltypesystem import lltype, rffi
 from pypy.translator.tool.cbuild import ExternalCompilationInfo, log
 
-if sys.platform != 'linux2':
+if not sys.platform.startswith('linux'):
     py.test.skip("Linux only for now")
 
 # ____________________________________________________________
diff --git a/pypy/jit/backend/model.py b/pypy/jit/backend/model.py
--- a/pypy/jit/backend/model.py
+++ b/pypy/jit/backend/model.py
@@ -8,12 +8,13 @@
     # ^^^ This is only useful on 32-bit platforms.  If True,
     # longlongs are supported by the JIT, but stored as doubles.
     # Boxes and Consts are BoxFloats and ConstFloats.
+    supports_singlefloats = False
 
     done_with_this_frame_void_v = -1
     done_with_this_frame_int_v = -1
     done_with_this_frame_ref_v = -1
     done_with_this_frame_float_v = -1
-    exit_frame_with_exception_v = -1
+    propagate_exception_v = -1
     total_compiled_loops = 0
     total_compiled_bridges = 0
     total_freed_loops = 0
diff --git a/pypy/jit/backend/test/calling_convention_test.py b/pypy/jit/backend/test/calling_convention_test.py
--- a/pypy/jit/backend/test/calling_convention_test.py
+++ b/pypy/jit/backend/test/calling_convention_test.py
@@ -8,6 +8,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -96,7 +97,8 @@
             FUNC = self.FuncType(funcargs, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             ops = '[%s]\n' % arguments
@@ -148,7 +150,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -190,7 +193,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -268,7 +272,8 @@
                 else:
                     ARGS.append(lltype.Signed)
             FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+                EffectInfo.MOST_GENERAL)
             ops = '''
             [%s]
             f99 = call_assembler(%s, descr=called_looptoken)
@@ -290,3 +295,59 @@
                 assert abs(x - expected_result) < 0.0001
             finally:
                 del self.cpu.done_with_this_frame_float_v
+
+    def test_call_with_singlefloats(self):
+        cpu = self.cpu
+        if not cpu.supports_floats or not cpu.supports_singlefloats:
+            py.test.skip('requires floats and singlefloats')
+
+        import random
+        from pypy.rlib.libffi import types
+        from pypy.rlib.rarithmetic import r_singlefloat
+
+        def func(*args):
+            res = 0.0
+            for i, x in enumerate(args):
+                res += (i + 1.1) * float(x)
+            return res
+
+        F = lltype.Float
+        S = lltype.SingleFloat
+        I = lltype.Signed
+        floats = [random.random() - 0.5 for i in range(8)]
+        singlefloats = [r_singlefloat(random.random() - 0.5) for i in range(8)]
+        ints = [random.randrange(-99, 99) for i in range(8)]
+        for repeat in range(100):
+            args = []
+            argvalues = []
+            argslist = []
+            local_floats = list(floats)
+            local_singlefloats = list(singlefloats)
+            local_ints = list(ints)
+            for i in range(8):
+                case = random.randrange(0, 3)
+                if case == 0:
+                    args.append(F)
+                    arg = local_floats.pop()
+                    argslist.append(boxfloat(arg))
+                elif case == 1:
+                    args.append(S)
+                    arg = local_singlefloats.pop()
+                    argslist.append(BoxInt(longlong.singlefloat2int(arg)))
+                else:
+                    args.append(I)
+                    arg = local_ints.pop()
+                    argslist.append(BoxInt(arg))
+                argvalues.append(arg)
+            FUNC = self.FuncType(args, F)
+            FPTR = self.Ptr(FUNC)
+            func_ptr = llhelper(FPTR, func)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+
+            res = self.execute_operation(rop.CALL,
+                                         [funcbox] + argslist,
+                                         'float', descr=calldescr)
+            expected = func(*argvalues)
+            assert abs(res.getfloat() - expected) < 0.0001
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -9,6 +9,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -445,7 +446,8 @@
             return chr(ord(c) + 1)
         FPTR = self.Ptr(self.FuncType([lltype.Char], lltype.Char))
         func_ptr = llhelper(FPTR, func)
-        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char)
+        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char,
+                                    EffectInfo.MOST_GENERAL)
         x = cpu.bh_call_i(self.get_funcbox(cpu, func_ptr).value,
                           calldescr, [ord('A')], None, None)
         assert x == ord('B')
@@ -458,14 +460,15 @@
                                           lltype.Float))
             func_ptr = llhelper(FPTR, func)
             FTP = deref(FPTR)
-            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT)
+            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             x = cpu.bh_call_f(self.get_funcbox(cpu, func_ptr).value,
                               calldescr,
                               [42], None, [longlong.getfloatstorage(3.5)])
             assert longlong.getrealfloat(x) == 3.5 - 42
 
     def test_call(self):
-        from pypy.rlib.libffi import types
+        from pypy.rlib.libffi import types, FUNCFLAG_CDECL
 
         def func_int(a, b):
             return a + b
@@ -486,13 +489,16 @@
             FUNC = deref(FPTR)
             funcbox = self.get_funcbox(cpu, func_ptr)
             # first, try it with the "normal" calldescr
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=calldescr)
             assert res.value == 2 * num
             # then, try it with the dynamic calldescr
-            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type)
+            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type,
+                                                    EffectInfo.MOST_GENERAL,
+                                                    ffi_flags=FUNCFLAG_CDECL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=dyn_calldescr)
@@ -507,7 +513,8 @@
             FUNC = self.FuncType([F] * 7 + [I] * 2 + [F] * 3, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = ([boxfloat(.1) for i in range(7)] +
                     [BoxInt(1), BoxInt(2), boxfloat(.2), boxfloat(.3),
@@ -529,7 +536,8 @@
 
         FUNC = self.FuncType([lltype.Signed]*16, lltype.Signed)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         args = range(16)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
@@ -552,7 +560,8 @@
             FPTR = self.Ptr(self.FuncType([TP] * nb_args, TP))
             func_ptr = llhelper(FPTR, func_ints)
             FUNC = deref(FPTR)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = [280-24*i for i in range(nb_args)]
             res = self.execute_operation(rop.CALL,
@@ -566,7 +575,8 @@
 
         FUNC = self.FuncType([lltype.Float, lltype.Float], lltype.Float)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
         res = self.execute_operation(rop.CALL, [funcbox, constfloat(1.5),
@@ -1589,7 +1599,8 @@
         '''
         FPTR = lltype.Ptr(lltype.FuncType([lltype.Signed], lltype.Void))
         fptr = llhelper(FPTR, func)
-        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                         EffectInfo.MOST_GENERAL)
 
         xtp = lltype.malloc(rclass.OBJECT_VTABLE, immortal=True)
         xtp.subclassrange_min = 1
@@ -1707,6 +1718,7 @@
             jit_wb_if_flag = 4096
             jit_wb_if_flag_byteofs = struct.pack("i", 4096).index('\x10')
             jit_wb_if_flag_singlebyte = 0x10
+            jit_wb_cards_set = 0
             def get_write_barrier_from_array_fn(self, cpu):
                 return funcbox.getint()
         #
@@ -1728,6 +1740,72 @@
             else:
                 assert record == []
 
+    def test_cond_call_gc_wb_array_card_marking_fast_path(self):
+        def func_void(a, b, c):
+            record.append((a, b, c))
+        record = []
+        #
+        S = lltype.Struct('S', ('tid', lltype.Signed))
+        S_WITH_CARDS = lltype.Struct('S_WITH_CARDS',
+                                     ('card0', lltype.Char),
+                                     ('card1', lltype.Char),
+                                     ('card2', lltype.Char),
+                                     ('card3', lltype.Char),
+                                     ('card4', lltype.Char),
+                                     ('card5', lltype.Char),
+                                     ('card6', lltype.Char),
+                                     ('card7', lltype.Char),
+                                     ('data',  S))
+        FUNC = self.FuncType([lltype.Ptr(S), lltype.Signed, lltype.Ptr(S)],
+                             lltype.Void)
+        func_ptr = llhelper(lltype.Ptr(FUNC), func_void)
+        funcbox = self.get_funcbox(self.cpu, func_ptr)
+        class WriteBarrierDescr(AbstractDescr):
+            jit_wb_if_flag = 4096
+            jit_wb_if_flag_byteofs = struct.pack("i", 4096).index('\x10')
+            jit_wb_if_flag_singlebyte = 0x10
+            jit_wb_cards_set = 8192
+            jit_wb_cards_set_byteofs = struct.pack("i", 8192).index('\x20')
+            jit_wb_cards_set_singlebyte = 0x20
+            jit_wb_card_page_shift = 7
+            def get_write_barrier_from_array_fn(self, cpu):
+                return funcbox.getint()
+        #
+        for BoxIndexCls in [BoxInt, ConstInt]:
+            for cond in [False, True]:
+                print
+                print '_'*79
+                print 'BoxIndexCls =', BoxIndexCls
+                print 'JIT_WB_CARDS_SET =', cond
+                print
+                value = random.randrange(-sys.maxint, sys.maxint)
+                value |= 4096
+                if cond:
+                    value |= 8192
+                else:
+                    value &= ~8192
+                s = lltype.malloc(S_WITH_CARDS, immortal=True, zero=True)
+                s.data.tid = value
+                sgcref = rffi.cast(llmemory.GCREF, s.data)
+                del record[:]
+                box_index = BoxIndexCls((9<<7) + 17)
+                self.execute_operation(rop.COND_CALL_GC_WB_ARRAY,
+                           [BoxPtr(sgcref), box_index, BoxPtr(sgcref)],
+                           'void', descr=WriteBarrierDescr())
+                if cond:
+                    assert record == []
+                    assert s.card6 == '\x02'
+                else:
+                    assert record == [(s.data, (9<<7) + 17, s.data)]
+                    assert s.card6 == '\x00'
+                assert s.card0 == '\x00'
+                assert s.card1 == '\x00'
+                assert s.card2 == '\x00'
+                assert s.card3 == '\x00'
+                assert s.card4 == '\x00'
+                assert s.card5 == '\x00'
+                assert s.card7 == '\x00'
+
     def test_force_operations_returning_void(self):
         values = []
         def maybe_force(token, flag):
@@ -1740,7 +1818,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Void)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1783,7 +1862,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Signed)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1828,7 +1908,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Float)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1864,7 +1945,7 @@
         assert values == [1, 10]
 
     def test_call_to_c_function(self):
-        from pypy.rlib.libffi import CDLL, types, ArgChain
+        from pypy.rlib.libffi import CDLL, types, ArgChain, FUNCFLAG_CDECL
         from pypy.rpython.lltypesystem.ll2ctypes import libc_name
         libc = CDLL(libc_name)
         c_tolower = libc.getpointer('tolower', [types.uchar], types.sint)
@@ -1874,7 +1955,9 @@
         cpu = self.cpu
         func_adr = llmemory.cast_ptr_to_adr(c_tolower.funcsym)
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
-        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint)
+        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=FUNCFLAG_CDECL)
         i1 = BoxInt()
         i2 = BoxInt()
         tok = BoxInt()
@@ -1930,7 +2013,9 @@
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
         calldescr = cpu.calldescrof_dynamic([types.pointer, types_size_t,
                                              types_size_t, types.pointer],
-                                            types.void)
+                                            types.void,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=clibffi.FUNCFLAG_CDECL)
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
@@ -1956,6 +2041,62 @@
         assert len(glob.lst) > 0
         lltype.free(raw, flavor='raw')
 
+    def test_call_to_winapi_function(self):
+        from pypy.rlib.clibffi import _WIN32, FUNCFLAG_STDCALL
+        if not _WIN32:
+            py.test.skip("Windows test only")
+        from pypy.rlib.libffi import CDLL, types, ArgChain
+        from pypy.rlib.rwin32 import DWORD
+        libc = CDLL('KERNEL32')
+        c_GetCurrentDir = libc.getpointer('GetCurrentDirectoryA',
+                                          [types.ulong, types.pointer],
+                                          types.ulong)
+
+        cwd = os.getcwd()
+        buflen = len(cwd) + 10
+        buffer = lltype.malloc(rffi.CCHARP.TO, buflen, flavor='raw')
+        argchain = ArgChain().arg(rffi.cast(DWORD, buflen)).arg(buffer)
+        res = c_GetCurrentDir.call(argchain, DWORD)
+        assert rffi.cast(lltype.Signed, res) == len(cwd)
+        assert rffi.charp2strn(buffer, buflen) == cwd
+        lltype.free(buffer, flavor='raw')
+
+        cpu = self.cpu
+        func_adr = llmemory.cast_ptr_to_adr(c_GetCurrentDir.funcsym)
+        funcbox = ConstInt(heaptracker.adr2int(func_adr))
+        calldescr = cpu.calldescrof_dynamic([types.ulong, types.pointer],
+                                            types.ulong,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=FUNCFLAG_STDCALL)
+        i1 = BoxInt()
+        i2 = BoxInt()
+        faildescr = BasicFailDescr(1)
+        # if the stdcall convention is ignored, then ESP is wrong after the
+        # call: 8 bytes too much.  If we repeat the call often enough, crash.
+        ops = []
+        for i in range(50):
+            i3 = BoxInt()
+            ops += [
+                ResOperation(rop.CALL_RELEASE_GIL, [funcbox, i1, i2], i3,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ]
+            ops[-1].setfailargs([])
+        ops += [
+            ResOperation(rop.FINISH, [i3], None, descr=BasicFailDescr(0))
+        ]
+        looptoken = LoopToken()
+        self.cpu.compile_loop([i1, i2], ops, looptoken)
+
+        buffer = lltype.malloc(rffi.CCHARP.TO, buflen, flavor='raw')
+        self.cpu.set_future_value_int(0, buflen)
+        self.cpu.set_future_value_int(1, rffi.cast(lltype.Signed, buffer))
+        fail = self.cpu.execute_token(looptoken)
+        assert fail.identifier == 0
+        assert self.cpu.get_latest_value_int(0) == len(cwd)
+        assert rffi.charp2strn(buffer, buflen) == cwd
+        lltype.free(buffer, flavor='raw')
+
     def test_guard_not_invalidated(self):
         cpu = self.cpu
         i0 = BoxInt()
@@ -2225,7 +2366,8 @@
         ARGS = [lltype.Signed] * 10
         RES = lltype.Signed
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         for i in range(10):
             self.cpu.set_future_value_int(i, i+1)
         res = self.cpu.execute_token(looptoken)
@@ -2265,7 +2407,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2355,7 +2498,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2567,7 +2711,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
                                    calldescr, [value], None, None)
             assert x == expected, (
@@ -2600,7 +2745,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(self.cpu, f)
             res = self.execute_operation(rop.CALL, [funcbox, BoxInt(value)],
                                          'int', descr=calldescr)
@@ -2634,7 +2780,8 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         x = self.cpu.bh_call_f(self.get_funcbox(self.cpu, f).value,
                                calldescr, None, None, [value])
         assert x == expected
@@ -2661,12 +2808,74 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         funcbox = self.get_funcbox(self.cpu, f)
         res = self.execute_operation(rop.CALL, [funcbox, BoxFloat(value)],
                                      'float', descr=calldescr)
         assert res.getfloatstorage() == expected
 
+    def test_singlefloat_result_of_call_direct(self):
+        if not self.cpu.supports_singlefloats:
+            py.test.skip("singlefloat test")
+        from pypy.translator.tool.cbuild import ExternalCompilationInfo
+        from pypy.rlib.rarithmetic import r_singlefloat
+        eci = ExternalCompilationInfo(
+            separate_module_sources=["""
+            float fn_test_result_of_call(float x)
+            {
+                return x / 2.0f;
+            }
+            """],
+            export_symbols=['fn_test_result_of_call'])
+        f = rffi.llexternal('fn_test_result_of_call', [lltype.SingleFloat],
+                            lltype.SingleFloat,
+                            compilation_info=eci, _nowrapper=True)
+        value = r_singlefloat(-42.5)
+        expected = r_singlefloat(-21.25)
+        assert f(value) == expected
+        #
+        FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
+        FPTR = self.Ptr(FUNC)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
+        ivalue = longlong.singlefloat2int(value)
+        iexpected = longlong.singlefloat2int(expected)
+        x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
+                               calldescr, [ivalue], None, None)
+        assert x == iexpected
+
+    def test_singlefloat_result_of_call_compiled(self):
+        if not self.cpu.supports_singlefloats:
+            py.test.skip("test of singlefloat result")
+        from pypy.translator.tool.cbuild import ExternalCompilationInfo
+        from pypy.rlib.rarithmetic import r_singlefloat
+        eci = ExternalCompilationInfo(
+            separate_module_sources=["""
+            float fn_test_result_of_call(float x)
+            {
+                return x / 2.0f;
+            }
+            """],
+            export_symbols=['fn_test_result_of_call'])
+        f = rffi.llexternal('fn_test_result_of_call', [lltype.SingleFloat],
+                            lltype.SingleFloat,
+                            compilation_info=eci, _nowrapper=True)
+        value = r_singlefloat(-42.5)
+        expected = r_singlefloat(-21.25)
+        assert f(value) == expected
+        #
+        FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
+        FPTR = self.Ptr(FUNC)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
+        funcbox = self.get_funcbox(self.cpu, f)
+        ivalue = longlong.singlefloat2int(value)
+        iexpected = longlong.singlefloat2int(expected)
+        res = self.execute_operation(rop.CALL, [funcbox, BoxInt(ivalue)],
+                                     'int', descr=calldescr)
+        assert res.value == iexpected
+
     def test_free_loop_and_bridges(self):
         from pypy.jit.backend.llsupport.llmodel import AbstractLLCPU
         if not isinstance(self.cpu, AbstractLLCPU):
@@ -2681,6 +2890,26 @@
         assert mem2 < mem1
         assert mem2 == mem0
 
+    def test_memoryerror(self):
+        excdescr = BasicFailDescr(666)
+        self.cpu.propagate_exception_v = self.cpu.get_fail_descr_number(
+            excdescr)
+        self.cpu.setup_once()    # xxx redo it, because we added
+                                 # propagate_exception_v
+        i0 = BoxInt()
+        p0 = BoxPtr()
+        operations = [
+            ResOperation(rop.NEWUNICODE, [i0], p0),
+            ResOperation(rop.FINISH, [p0], None, descr=BasicFailDescr(1))
+            ]
+        inputargs = [i0]
+        looptoken = LoopToken()
+        self.cpu.compile_loop(inputargs, operations, looptoken)
+        # overflowing value:
+        self.cpu.set_future_value_int(0, sys.maxint // 4 + 1)
+        fail = self.cpu.execute_token(looptoken)
+        assert fail.identifier == excdescr.identifier
+
 
 class OOtypeBackendTest(BaseBackendTest):
 
diff --git a/pypy/jit/backend/test/test_ll_random.py b/pypy/jit/backend/test/test_ll_random.py
--- a/pypy/jit/backend/test/test_ll_random.py
+++ b/pypy/jit/backend/test/test_ll_random.py
@@ -6,6 +6,7 @@
 from pypy.jit.metainterp.history import BoxPtr, BoxInt
 from pypy.jit.metainterp.history import BasicFailDescr
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rlib.rarithmetic import intmask
 from pypy.rpython.llinterp import LLException
@@ -468,6 +469,10 @@
         exec code in d
         return subset, d['f'], vtableptr
 
+    def getcalldescr(self, builder, TP):
+        ef = EffectInfo.MOST_GENERAL
+        return builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT, ef)
+
 # 1. non raising call and guard_no_exception
 class CallOperation(BaseCallOperation):
     def produce_into(self, builder, r):
@@ -481,7 +486,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], None,
                           descr=BasicFailDescr())
@@ -501,7 +506,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         _, vtableptr = builder.get_random_structure_type_and_vtable(r)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(vtableptr), builder.cpu)
@@ -523,7 +528,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(exc), builder.cpu)
         op = ResOperation(rop.GUARD_EXCEPTION, [exc_box], BoxPtr(),
@@ -540,7 +545,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], BoxPtr(),
                           descr=BasicFailDescr())
@@ -559,7 +564,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         while True:
             _, vtableptr = builder.get_random_structure_type_and_vtable(r)
diff --git a/pypy/jit/backend/x86/arch.py b/pypy/jit/backend/x86/arch.py
--- a/pypy/jit/backend/x86/arch.py
+++ b/pypy/jit/backend/x86/arch.py
@@ -27,3 +27,6 @@
 # which are used in the malloc itself.  They are:
 #   ecx, ebx, esi, edi               [32 and 64 bits]
 #   r8, r9, r10, r12, r13, r14, r15    [64 bits only]
+#
+# Note that with asmgcc, the locations corresponding to callee-save registers
+# are never used.
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -34,6 +34,7 @@
 from pypy.rlib.debug import (debug_print, debug_start, debug_stop,
                              have_debug_prints)
 from pypy.rlib import rgc
+from pypy.rlib.clibffi import FFI_DEFAULT_ABI
 from pypy.jit.backend.x86.jump import remap_frame_layout
 from pypy.jit.metainterp.history import ConstInt, BoxInt
 from pypy.jit.codewriter.effectinfo import EffectInfo
@@ -56,7 +57,9 @@
         self.exc = exc
         self.is_guard_not_invalidated = is_guard_not_invalidated
 
-DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed))
+DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
+                              ('bridge', lltype.Signed), # 0 or 1
+                              ('number', lltype.Signed))
 
 class Assembler386(object):
     _regalloc = None
@@ -89,6 +92,8 @@
         self._current_depths_cache = (0, 0)
         self.datablockwrapper = None
         self.stack_check_slowpath = 0
+        self.propagate_exception_path = 0
+        self.gcrootmap_retaddr_forced = 0
         self.teardown()
 
     def leave_jitted_hook(self):
@@ -125,6 +130,7 @@
             self._build_failure_recovery(True, withfloats=True)
             support.ensure_sse2_floats()
             self._build_float_constants()
+        self._build_propagate_exception_path()
         if gc_ll_descr.get_malloc_slowpath_addr is not None:
             self._build_malloc_slowpath()
         self._build_stack_check_slowpath()
@@ -138,6 +144,9 @@
         assert self.memcpy_addr != 0, "setup_once() not called?"
         self.current_clt = looptoken.compiled_loop_token
         self.pending_guard_tokens = []
+        if WORD == 8:
+            self.pending_memoryerror_trampoline_from = []
+            self.error_trampoline_64 = 0
         self.mc = codebuf.MachineCodeBlockWrapper()
         #assert self.datablockwrapper is None --- but obscure case
         # possible, e.g. getting MemoryError and continuing
@@ -147,6 +156,8 @@
 
     def teardown(self):
         self.pending_guard_tokens = None
+        if WORD == 8:
+            self.pending_memoryerror_trampoline_from = None
         self.mc = None
         self.looppos = -1
         self.currently_compiling_loop = None
@@ -155,9 +166,12 @@
     def finish_once(self):
         if self._debug:
             debug_start('jit-backend-counts')
-            for i in range(len(self.loop_run_counters)):
-                struct = self.loop_run_counters[i]
-                debug_print(str(i) + ':' + str(struct.i))
+            for struct in self.loop_run_counters:
+                if struct.bridge:
+                    prefix = 'bridge '
+                else:
+                    prefix = 'loop '
+                debug_print(prefix + str(struct.number) + ':' + str(struct.i))
             debug_stop('jit-backend-counts')
 
     def _build_float_constants(self):
@@ -181,6 +195,7 @@
         # instructions in assembler, with a mark_gc_roots in between.
         # With shadowstack, this is not needed, so we produce a single helper.
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
         #
         # ---------- first helper for the slow path of malloc ----------
         mc = codebuf.MachineCodeBlockWrapper()
@@ -190,10 +205,19 @@
         mc.SUB_rr(edx.value, eax.value)       # compute the size we want
         addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
         #
-        if gcrootmap is not None and gcrootmap.is_shadow_stack:
+        # The registers to save in the copy area: with shadowstack, most
+        # registers need to be saved.  With asmgcc, the callee-saved registers
+        # don't need to.
+        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
+        if not shadow_stack:
+            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
+                   if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
+        #
+        for reg, ofs in save_in_copy_area:
+            mc.MOV_br(ofs, reg.value)
+        #
+        if shadow_stack:
             # ---- shadowstack ----
-            for reg, ofs in gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items():
-                mc.MOV_br(ofs, reg.value)
             mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
             if IS_X86_32:
                 mc.MOV_sr(0, edx.value)          # push argument
@@ -201,15 +225,13 @@
                 mc.MOV_rr(edi.value, edx.value)
             mc.CALL(imm(addr))
             mc.ADD_ri(esp.value, 16 - WORD)
-            for reg, ofs in gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items():
-                mc.MOV_rb(reg.value, ofs)
         else:
             # ---- asmgcc ----
             if IS_X86_32:
                 mc.MOV_sr(WORD, edx.value)       # save it as the new argument
             elif IS_X86_64:
-                # rdi can be clobbered: its content was forced to the stack
-                # by _fastpath_malloc(), like all other save_around_call_regs.
+                # rdi can be clobbered: its content was saved in the
+                # copy area of the stack
                 mc.MOV_rr(edi.value, edx.value)
             mc.JMP(imm(addr))                    # tail call to the real malloc
             rawstart = mc.materialize(self.cpu.asmmemmgr, [])
@@ -217,18 +239,54 @@
             # ---------- second helper for the slow path of malloc ----------
             mc = codebuf.MachineCodeBlockWrapper()
         #
+        for reg, ofs in save_in_copy_area:
+            mc.MOV_rb(reg.value, ofs)
+            assert reg is not eax and reg is not edx
+        #
         if self.cpu.supports_floats:          # restore the XMM registers
             for i in range(self.cpu.NUM_REGS):# from where they were saved
                 mc.MOVSD_xs(i, (WORD*2)+8*i)
+        #
+        # Note: we check this after the code above, just because the code
+        # above is more than 127 bytes on 64-bits...
+        mc.TEST_rr(eax.value, eax.value)
+        mc.J_il8(rx86.Conditions['Z'], 0) # patched later
+        jz_location = mc.get_relative_pos()
+        #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
         mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
         mc.RET()
+        #
+        # If the slowpath malloc failed, we raise a MemoryError that
+        # always interrupts the current loop, as a "good enough"
+        # approximation.  Also note that we didn't RET from this helper;
+        # but the code we jump to will actually restore the stack
+        # position based on EBP, which will get us out of here for free.
+        offset = mc.get_relative_pos() - jz_location
+        assert 0 < offset <= 127
+        mc.overwrite(jz_location-1, chr(offset))
+        mc.JMP(imm(self.propagate_exception_path))
+        #
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
         self.malloc_slowpath2 = rawstart
 
+    def _build_propagate_exception_path(self):
+        if self.cpu.propagate_exception_v < 0:
+            return      # not supported (for tests, or non-translated)
+        #
+        self.mc = codebuf.MachineCodeBlockWrapper()
+        # call on_leave_jitted_save_exc()
+        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
+        self.mc.CALL(imm(addr))
+        self.mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
+        self._call_footer()
+        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
+        self.propagate_exception_path = rawstart
+        self.mc = None
+
     def _build_stack_check_slowpath(self):
         _, _, slowpathaddr = self.cpu.insert_stack_check()
-        if slowpathaddr == 0 or self.cpu.exit_frame_with_exception_v < 0:
+        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
             return      # no stack check (for tests, or non-translated)
         #
         # make a "function" that is called immediately at the start of
@@ -284,19 +342,11 @@
         offset = mc.get_relative_pos() - jnz_location
         assert 0 < offset <= 127
         mc.overwrite(jnz_location-1, chr(offset))
-        # clear the exception from the global position
-        mc.MOV(eax, heap(self.cpu.pos_exc_value()))
-        mc.MOV(heap(self.cpu.pos_exception()), imm0)
-        mc.MOV(heap(self.cpu.pos_exc_value()), imm0)
-        # save the current exception instance into fail_boxes_ptr[0]
-        adr = self.fail_boxes_ptr.get_addr_for_num(0)
-        mc.MOV(heap(adr), eax)
-        # call the helper function to set the GC flag on the fail_boxes_ptr
-        # array (note that there is no exception any more here)
-        addr = self.cpu.get_on_leave_jitted_int(save_exception=False)
+        # call on_leave_jitted_save_exc()
+        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
         mc.CALL(imm(addr))
         #
-        mc.MOV_ri(eax.value, self.cpu.exit_frame_with_exception_v)
+        mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
         #
         # footer -- note the ADD, which skips the return address of this
         # function, and will instead return to the caller's caller.  Note
@@ -309,6 +359,7 @@
         self.stack_check_slowpath = rawstart
 
     @staticmethod
+    @rgc.no_collect
     def _release_gil_asmgcc(css):
         # similar to trackgcroot.py:pypy_asm_stackwalk, first part
         from pypy.rpython.memory.gctransform import asmgcroot
@@ -324,6 +375,7 @@
             before()
 
     @staticmethod
+    @rgc.no_collect
     def _reacquire_gil_asmgcc(css):
         # first reacquire the GIL
         after = rffi.aroundstate.after
@@ -338,12 +390,14 @@
         next.prev = prev
 
     @staticmethod
+    @rgc.no_collect
     def _release_gil_shadowstack():
         before = rffi.aroundstate.before
         if before:
             before()
 
     @staticmethod
+    @rgc.no_collect
     def _reacquire_gil_shadowstack():
         after = rffi.aroundstate.after
         if after:
@@ -392,7 +446,7 @@
         self.setup(looptoken)
         self.currently_compiling_loop = looptoken
         if log:
-            self._register_counter()
+            self._register_counter(False, looptoken.number)
             operations = self._inject_debugging_code(looptoken, operations)
 
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
@@ -416,10 +470,13 @@
         fullsize = self.mc.get_relative_pos()
         #
         rawstart = self.materialize_loop(looptoken)
-        debug_print("Loop #%d (%s) has address %x to %x" % (
+        debug_start("jit-backend-addr")
+        debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
             looptoken.number, loopname,
             rawstart + self.looppos,
-            rawstart + directbootstrappos))
+            rawstart + directbootstrappos,
+            rawstart))
+        debug_stop("jit-backend-addr")
         self._patch_stackadjust(rawstart + stackadjustpos,
                                 frame_depth + param_depth)
         self.patch_pending_failure_recoveries(rawstart)
@@ -458,7 +515,7 @@
 
         self.setup(original_loop_token)
         if log:
-            self._register_counter()
+            self._register_counter(True, descr_number)
             operations = self._inject_debugging_code(faildescr, operations)
 
         arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
@@ -478,9 +535,10 @@
         fullsize = self.mc.get_relative_pos()
         #
         rawstart = self.materialize_loop(original_loop_token)
-
-        debug_print("Bridge out of guard %d has address %x to %x" %
+        debug_start("jit-backend-addr")
+        debug_print("bridge out of Guard %d has address %x to %x" %
                     (descr_number, rawstart, rawstart + codeendpos))
+        debug_stop("jit-backend-addr")
         self._patch_stackadjust(rawstart + stackadjustpos,
                                 frame_depth + param_depth)
         self.patch_pending_failure_recoveries(rawstart)
@@ -504,6 +562,8 @@
         # at the end of self.mc.
         for tok in self.pending_guard_tokens:
             tok.pos_recovery_stub = self.generate_quick_failure(tok)
+        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
+            self.error_trampoline_64 = self.generate_propagate_error_64()
 
     def patch_pending_failure_recoveries(self, rawstart):
         # after we wrote the assembler to raw memory, set up
@@ -540,6 +600,12 @@
                 # less, we would run into the issue that overwriting the
                 # 5 bytes here might get a few nonsense bytes at the
                 # return address of the following CALL.
+        if WORD == 8:
+            for pos_after_jz in self.pending_memoryerror_trampoline_from:
+                assert self.error_trampoline_64 != 0     # only if non-empty
+                mc = codebuf.MachineCodeBlockWrapper()
+                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
+                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)
 
     def get_asmmemmgr_blocks(self, looptoken):
         clt = looptoken.compiled_loop_token
@@ -554,7 +620,7 @@
         return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                    self.cpu.gc_ll_descr.gcrootmap)
 
-    def _register_counter(self):
+    def _register_counter(self, bridge, number):
         if self._debug:
             # YYY very minor leak -- we need the counters to stay alive
             # forever, just because we want to report them at the end
@@ -562,6 +628,8 @@
             struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                                    track_allocation=False)
             struct.i = 0
+            struct.bridge = int(bridge)
+            struct.number = number
             self.loop_run_counters.append(struct)
 
     def _find_failure_recovery_bytecode(self, faildescr):
@@ -889,6 +957,7 @@
         if (isinstance(from_loc, RegLoc) and from_loc.is_xmm) or (isinstance(to_loc, RegLoc) and to_loc.is_xmm):
             self.mc.MOVSD(to_loc, from_loc)
         else:
+            assert to_loc is not ebp
             self.mc.MOV(to_loc, from_loc)
 
     regalloc_mov = mov # legacy interface
@@ -1052,9 +1121,10 @@
                     self.implement_guard(guard_token, checkfalsecond)
         return genop_cmp_guard_float
 
-    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax):
+    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
+                   argtypes=None, callconv=FFI_DEFAULT_ABI):
         if IS_X86_64:
-            return self._emit_call_64(force_index, x, arglocs, start)
+            return self._emit_call_64(force_index, x, arglocs, start, argtypes)
 
         p = 0
         n = len(arglocs)
@@ -1081,13 +1151,24 @@
         # x is a location
         self.mc.CALL(x)
         self.mark_gc_roots(force_index)
+        #
+        if callconv != FFI_DEFAULT_ABI:
+            self._fix_stdcall(callconv, p)
 
-    def _emit_call_64(self, force_index, x, arglocs, start):
+    def _fix_stdcall(self, callconv, p):
+        from pypy.rlib.clibffi import FFI_STDCALL
+        assert callconv == FFI_STDCALL
+        # it's a bit stupid, but we're just going to cancel the fact that
+        # the called function just added 'p' to ESP, by subtracting it again.
+        self.mc.SUB_ri(esp.value, p)
+
+    def _emit_call_64(self, force_index, x, arglocs, start, argtypes):
         src_locs = []
         dst_locs = []
         xmm_src_locs = []
         xmm_dst_locs = []
         pass_on_stack = []
+        singlefloats = None
 
         # In reverse order for use with pop()
         unused_gpr = [r9, r8, ecx, edx, esi, edi]
@@ -1107,6 +1188,11 @@
                     xmm_dst_locs.append(unused_xmm.pop())
                 else:
                     pass_on_stack.append(loc)
+            elif (argtypes is not None and argtypes[i-start] == 'S' and
+                  len(unused_xmm) > 0):
+                # Singlefloat argument
+                if singlefloats is None: singlefloats = []
+                singlefloats.append((loc, unused_xmm.pop()))
             else:
                 if len(unused_gpr) > 0:
                     src_locs.append(loc)
@@ -1134,9 +1220,15 @@
                 else:
                     self.mc.MOV_sr(i*WORD, loc.value)
 
-        # Handle register arguments
+        # Handle register arguments: first remap the xmm arguments
+        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
+                           X86_64_XMM_SCRATCH_REG)
+        # Load the singlefloat arguments from main regs or stack to xmm regs
+        if singlefloats is not None:
+            for src, dst in singlefloats:
+                self.mc.MOVD(dst, src)
+        # Finally remap the arguments in the main regs
         remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs, X86_64_XMM_SCRATCH_REG)
 
         self._regalloc.reserve_param(len(pass_on_stack))
         self.mc.CALL(x)
@@ -1251,6 +1343,20 @@
     def genop_cast_int_to_float(self, op, arglocs, resloc):
         self.mc.CVTSI2SD(resloc, arglocs[0])
 
+    def genop_cast_float_to_singlefloat(self, op, arglocs, resloc):
+        loc0, loctmp = arglocs
+        self.mc.CVTSD2SS(loctmp, loc0)
+        assert isinstance(resloc, RegLoc)
+        assert isinstance(loctmp, RegLoc)
+        self.mc.MOVD_rx(resloc.value, loctmp.value)
+
+    def genop_cast_singlefloat_to_float(self, op, arglocs, resloc):
+        loc0, = arglocs
+        assert isinstance(resloc, RegLoc)
+        assert isinstance(loc0, RegLoc)
+        self.mc.MOVD_xr(resloc.value, loc0.value)
+        self.mc.CVTSS2SD_xx(resloc.value, resloc.value)
+
     def genop_guard_int_is_true(self, op, guard_op, guard_token, arglocs, resloc):
         guard_opnum = guard_op.getopnum()
         self.mc.CMP(arglocs[0], imm0)
@@ -1372,7 +1478,7 @@
         assert isinstance(loc_vtable, ImmedLoc)
         arglocs = arglocs[:-1]
         self.call(self.malloc_func_addr, arglocs, eax)
-        # xxx ignore NULL returns for now
+        self.propagate_memoryerror_if_eax_is_null()
         self.set_vtable(eax, loc_vtable)
 
     def set_vtable(self, loc, loc_vtable):
@@ -1391,18 +1497,35 @@
     def genop_new(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_new_array(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_array_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_newstr(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_str_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_newunicode(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_unicode_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
+
+    def propagate_memoryerror_if_eax_is_null(self):
+        # if self.propagate_exception_path == 0 (tests), this may jump to 0
+        # and segfaults.  too bad.  the alternative is to continue anyway
+        # with eax==0, but that will segfault too.
+        self.mc.TEST_rr(eax.value, eax.value)
+        if WORD == 4:
+            self.mc.J_il(rx86.Conditions['Z'], self.propagate_exception_path)
+            self.mc.add_pending_relocation()
+        elif WORD == 8:
+            self.mc.J_il(rx86.Conditions['Z'], 0)
+            pos = self.mc.get_relative_pos()
+            self.pending_memoryerror_trampoline_from.append(pos)
 
     # ----------
 
@@ -1674,6 +1797,12 @@
         return GuardToken(faildescr, failargs, fail_locs, exc,
                           is_guard_not_invalidated)
 
+    def generate_propagate_error_64(self):
+        assert WORD == 8
+        startpos = self.mc.get_relative_pos()
+        self.mc.JMP(imm(self.propagate_exception_path))
+        return startpos
+
     def generate_quick_failure(self, guardtok):
         """Generate the initial code for handling a failure.  We try to
         keep it as compact as possible.
@@ -2009,7 +2138,9 @@
         else:
             tmp = eax
 
-        self._emit_call(force_index, x, arglocs, 3, tmp=tmp)
+        self._emit_call(force_index, x, arglocs, 3, tmp=tmp,
+                        argtypes=op.getdescr().get_arg_types(),
+                        callconv=op.getdescr().get_call_conv())
 
         if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.width == 8:
             # a float or a long long return
@@ -2021,7 +2152,19 @@
                 #     and this way is simpler also because the result loc
                 #     can just be always a stack location
             else:
-                self.mc.FSTP_b(resloc.value)   # float return
+                self.mc.FSTPL_b(resloc.value)   # float return
+        elif op.getdescr().get_return_type() == 'S':
+            # singlefloat return
+            assert resloc is eax
+            if IS_X86_32:
+                # must convert ST(0) to a 32-bit singlefloat and load it into EAX
+                # mess mess mess
+                self.mc.SUB_ri(esp.value, 4)
+                self.mc.FSTPS_s(0)
+                self.mc.POP_r(eax.value)
+            elif IS_X86_64:
+                # must copy from the lower 32 bits of XMM0 into eax
+                self.mc.MOVD_rx(eax.value, xmm0.value)
         elif size == WORD:
             assert resloc is eax or resloc is xmm0    # a full word
         elif size == 0:
@@ -2093,13 +2236,27 @@
                 css = get_ebp_ofs(pos + use_words - 1)
                 self._regalloc.close_stack_struct = css
             # The location where the future CALL will put its return address
-            # will be [ESP-WORD], so save that as the next frame's top address
-            self.mc.LEA_rs(eax.value, -WORD)        # LEA EAX, [ESP-4]
+            # will be [ESP-WORD].  But we can't use that as the next frame's
+            # top address!  As the code after releasegil() runs without the
+            # GIL, it might not be set yet by the time we need it (very
+            # unlikely), or it might be overwritten by the following call
+            # to reaquiregil() (much more likely).  So we hack even more
+            # and use a dummy location containing a dummy value (a pointer
+            # to itself) which we pretend is the return address :-/ :-/ :-/
+            # It prevents us to store any %esp-based stack locations but we
+            # don't so far.
+            adr = self.datablockwrapper.malloc_aligned(WORD, WORD)
+            rffi.cast(rffi.CArrayPtr(lltype.Signed), adr)[0] = adr
+            self.gcrootmap_retaddr_forced = adr
             frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
-            self.mc.MOV_br(frame_ptr, eax.value)    # MOV [css.frame], EAX
+            if rx86.fits_in_32bits(adr):
+                self.mc.MOV_bi(frame_ptr, adr)          # MOV [css.frame], adr
+            else:
+                self.mc.MOV_ri(eax.value, adr)          # MOV EAX, adr
+                self.mc.MOV_br(frame_ptr, eax.value)    # MOV [css.frame], EAX
             # Save ebp
             index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
-            self.mc.MOV_br(index_of_ebp, ebp.value) # MOV [css.ebp], EBP
+            self.mc.MOV_br(index_of_ebp, ebp.value)     # MOV [css.ebp], EBP
             # Call the closestack() function (also releasing the GIL)
             if IS_X86_32:
                 reg = eax
@@ -2127,6 +2284,9 @@
         if gcrootmap.is_shadow_stack:
             args = []
         else:
+            assert self.gcrootmap_retaddr_forced == -1, (
+                      "missing mark_gc_roots() in CALL_RELEASE_GIL")
+            self.gcrootmap_retaddr_forced = 0
             css = self._regalloc.close_stack_struct
             assert css != 0
             if IS_X86_32:
@@ -2179,7 +2339,7 @@
         self._emit_call(fail_index, imm(asm_helper_adr), [eax, arglocs[1]], 0,
                         tmp=ecx)
         if IS_X86_32 and isinstance(result_loc, StackLoc) and result_loc.type == FLOAT:
-            self.mc.FSTP_b(result_loc.value)
+            self.mc.FSTPL_b(result_loc.value)
         #else: result_loc is already either eax or None, checked below
         self.mc.JMP_l8(0) # jump to done, patched later
         jmp_location = self.mc.get_relative_pos()
@@ -2242,10 +2402,12 @@
         if opnum == rop.COND_CALL_GC_WB:
             N = 2
             func = descr.get_write_barrier_fn(self.cpu)
+            card_marking = False
         elif opnum == rop.COND_CALL_GC_WB_ARRAY:
             N = 3
             func = descr.get_write_barrier_from_array_fn(self.cpu)
             assert func != 0
+            card_marking = descr.jit_wb_cards_set != 0
         else:
             raise AssertionError(opnum)
         #
@@ -2254,6 +2416,18 @@
                       imm(descr.jit_wb_if_flag_singlebyte))
         self.mc.J_il8(rx86.Conditions['Z'], 0) # patched later
         jz_location = self.mc.get_relative_pos()
+
+        # for cond_call_gc_wb_array, also add another fast path:
+        # if GCFLAG_CARDS_SET, then we can just set one bit and be done
+        if card_marking:
+            self.mc.TEST8(addr_add_const(loc_base,
+                                         descr.jit_wb_cards_set_byteofs),
+                          imm(descr.jit_wb_cards_set_singlebyte))
+            self.mc.J_il8(rx86.Conditions['NZ'], 0) # patched later
+            jnz_location = self.mc.get_relative_pos()
+        else:
+            jnz_location = 0
+
         # the following is supposed to be the slow path, so whenever possible
         # we choose the most compact encoding over the most efficient one.
         if IS_X86_32:
@@ -2293,6 +2467,43 @@
             loc = arglocs[i]
             assert isinstance(loc, RegLoc)
             self.mc.POP_r(loc.value)
+
+        # if GCFLAG_CARDS_SET, then we can do the whole thing that would
+        # be done in the CALL above with just four instructions, so here
+        # is an inline copy of them
+        if card_marking:
+            self.mc.JMP_l8(0) # jump to the exit, patched later
+            jmp_location = self.mc.get_relative_pos()
+            # patch the JNZ above
+            offset = self.mc.get_relative_pos() - jnz_location
+            assert 0 < offset <= 127
+            self.mc.overwrite(jnz_location-1, chr(offset))
+            #
+            loc_index = arglocs[1]
+            if isinstance(loc_index, RegLoc):
+                # choose a scratch register
+                tmp1 = loc_index
+                self.mc.PUSH_r(tmp1.value)
+                # SHR tmp, card_page_shift
+                self.mc.SHR_ri(tmp1.value, descr.jit_wb_card_page_shift)
+                # XOR tmp, -8
+                self.mc.XOR_ri(tmp1.value, -8)
+                # BTS [loc_base], tmp
+                self.mc.BTS(addr_add_const(loc_base, 0), tmp1)
+                # done
+                self.mc.POP_r(tmp1.value)
+            elif isinstance(loc_index, ImmedLoc):
+                byte_index = loc_index.value >> descr.jit_wb_card_page_shift
+                byte_ofs = ~(byte_index >> 3)
+                byte_val = 1 << (byte_index & 7)
+                self.mc.OR8(addr_add_const(loc_base, byte_ofs), imm(byte_val))
+            else:
+                raise AssertionError("index is neither RegLoc nor ImmedLoc")
+            # patch the JMP above
+            offset = self.mc.get_relative_pos() - jmp_location
+            assert 0 < offset <= 127
+            self.mc.overwrite(jmp_location-1, chr(offset))
+        #
         # patch the JZ above
         offset = self.mc.get_relative_pos() - jz_location
         assert 0 < offset <= 127
@@ -2300,11 +2511,6 @@
 
     genop_discard_cond_call_gc_wb_array = genop_discard_cond_call_gc_wb
 
-    def genop_force_token(self, op, arglocs, resloc):
-        # RegAlloc.consider_force_token ensures this:
-        assert isinstance(resloc, RegLoc)
-        self.mc.LEA_rb(resloc.value, FORCE_INDEX_OFS)
-
     def not_implemented_op_discard(self, op, arglocs):
         not_implemented("not implemented operation: %s" % op.getopname())
 
@@ -2326,7 +2532,13 @@
             if gcrootmap.is_shadow_stack:
                 gcrootmap.write_callshape(mark, force_index)
             else:
-                self.mc.insert_gcroot_marker(mark)
+                if self.gcrootmap_retaddr_forced == 0:
+                    self.mc.insert_gcroot_marker(mark)   # common case
+                else:
+                    assert self.gcrootmap_retaddr_forced != -1, (
+                              "two mark_gc_roots() in a CALL_RELEASE_GIL")
+                    gcrootmap.put(self.gcrootmap_retaddr_forced, mark)
+                    self.gcrootmap_retaddr_forced = -1
 
     def target_arglocs(self, loop_token):
         return loop_token._x86_arglocs
@@ -2369,8 +2581,7 @@
             # there are two helpers to call only with asmgcc
             slowpath_addr1 = self.malloc_slowpath1
             self.mc.CALL(imm(slowpath_addr1))
-        self.mark_gc_roots(self.write_new_force_index(),
-                           use_copy_area=shadow_stack)
+        self.mark_gc_roots(self.write_new_force_index(), use_copy_area=True)
         slowpath_addr2 = self.malloc_slowpath2
         self.mc.CALL(imm(slowpath_addr2))
 
diff --git a/pypy/jit/backend/x86/codebuf.py b/pypy/jit/backend/x86/codebuf.py
--- a/pypy/jit/backend/x86/codebuf.py
+++ b/pypy/jit/backend/x86/codebuf.py
@@ -25,8 +25,11 @@
         self.init_block_builder()
         # a list of relative positions; for each position p, the bytes
         # at [p-4:p] encode an absolute address that will need to be
-        # made relative.
-        self.relocations = []
+        # made relative.  Only works on 32-bit!
+        if WORD == 4:
+            self.relocations = []
+        else:
+            self.relocations = None
         #
         # ResOperation --> offset in the assembly.
         # ops_offset[None] represents the beginning of the code after the last op
@@ -42,9 +45,10 @@
 
     def copy_to_raw_memory(self, addr):
         self._copy_to_raw_memory(addr)
-        for reloc in self.relocations:
-            p = addr + reloc
-            adr = rffi.cast(rffi.LONGP, p - WORD)
-            adr[0] = intmask(adr[0] - p)
+        if self.relocations is not None:
+            for reloc in self.relocations:
+                p = addr + reloc
+                adr = rffi.cast(rffi.LONGP, p - WORD)
+                adr[0] = intmask(adr[0] - p)
         valgrind.discard_translations(addr, self.get_relative_pos())
         self._dump(addr, "jit-backend-dump", backend_name)
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -29,6 +29,7 @@
     all_regs = [eax, ecx, edx, ebx, esi, edi]
     no_lower_byte_regs = [esi, edi]
     save_around_call_regs = [eax, edx, ecx]
+    frame_reg = ebp
 
     REGLOC_TO_GCROOTMAP_REG_INDEX = {
         ebx: 1,
@@ -312,8 +313,11 @@
                     self.fm.frame_bindings[arg] = loc
             else:
                 if isinstance(loc, RegLoc):
-                    self.rm.reg_bindings[arg] = loc
-                    used[loc] = None
+                    if loc is ebp:
+                        self.rm.bindings_to_frame_reg[arg] = None
+                    else:
+                        self.rm.reg_bindings[arg] = loc
+                        used[loc] = None
                 else:
                     self.fm.frame_bindings[arg] = loc
         self.rm.free_regs = []
@@ -705,6 +709,17 @@
         self.Perform(op, [loc0], loc1)
         self.rm.possibly_free_var(op.getarg(0))
 
+    def consider_cast_float_to_singlefloat(self, op):
+        loc0 = self.xrm.make_sure_var_in_reg(op.getarg(0))
+        loc1 = self.rm.force_allocate_reg(op.result)
+        self.xrm.possibly_free_var(op.getarg(0))
+        tmpxvar = TempBox()
+        loctmp = self.xrm.force_allocate_reg(tmpxvar)   # may be equal to loc0
+        self.xrm.possibly_free_var(tmpxvar)
+        self.Perform(op, [loc0, loctmp], loc1)
+
+    consider_cast_singlefloat_to_float = consider_cast_int_to_float
+
     def _consider_llong_binop_xx(self, op):
         # must force both arguments into xmm registers, because we don't
         # know if they will be suitably aligned.  Exception: if the second
@@ -832,8 +847,8 @@
 
     def consider_call(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex != EffectInfo.OS_NONE:
             if IS_X86_32:
                 # support for some of the llong operations,
                 # which only exist on x86-32
@@ -921,27 +936,13 @@
     def _do_fastpath_malloc(self, op, size, tid):
         gc_ll_descr = self.assembler.cpu.gc_ll_descr
         self.rm.force_allocate_reg(op.result, selected_reg=eax)
-
-        if gc_ll_descr.gcrootmap and gc_ll_descr.gcrootmap.is_shadow_stack:
-            # ---- shadowstack ----
-            # We need edx as a temporary, but otherwise don't save any more
-            # register.  See comments in _build_malloc_slowpath().
-            tmp_box = TempBox()
-            self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
-            self.rm.possibly_free_var(tmp_box)
-        else:
-            # ---- asmgcc ----
-            # We need to force-allocate each of save_around_call_regs now.
-            # The alternative would be to save and restore them around the
-            # actual call to malloc(), in the rare case where we need to do
-            # it; however, mark_gc_roots() would need to be adapted to know
-            # where the variables end up being saved.  Messy.
-            for reg in self.rm.save_around_call_regs:
-                if reg is not eax:
-                    tmp_box = TempBox()
-                    self.rm.force_allocate_reg(tmp_box, selected_reg=reg)
-                    self.rm.possibly_free_var(tmp_box)
-
+        #
+        # We need edx as a temporary, but otherwise don't save any more
+        # register.  See comments in _build_malloc_slowpath().
+        tmp_box = TempBox()
+        self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
+        self.rm.possibly_free_var(tmp_box)
+        #
         self.assembler.malloc_cond(
             gc_ll_descr.get_nursery_free_addr(),
             gc_ll_descr.get_nursery_top_addr(),
@@ -1337,20 +1338,32 @@
             if reg is eax:
                 continue      # ok to ignore this one
             if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
-                if use_copy_area:
-                    assert reg in self.rm.REGLOC_TO_COPY_AREA_OFS
-                    area_offset = self.rm.REGLOC_TO_COPY_AREA_OFS[reg]
-                    gcrootmap.add_frame_offset(shape, area_offset)
-                else:
-                    assert reg in self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX
-                    gcrootmap.add_callee_save_reg(
-                        shape, self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX[reg])
+                #
+                # The register 'reg' is alive across this call.
+                gcrootmap = self.assembler.cpu.gc_ll_descr.gcrootmap
+                if gcrootmap is None or not gcrootmap.is_shadow_stack:
+                    #
+                    # Asmgcc: if reg is a callee-save register, we can
+                    # explicitly mark it as containing a BoxPtr.
+                    if reg in self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX:
+                        gcrootmap.add_callee_save_reg(
+                            shape, self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX[reg])
+                        continue
+                #
+                # Else, 'use_copy_area' must be True (otherwise this BoxPtr
+                # should not be in a register).  The copy area contains the
+                # real value of the register.
+                assert use_copy_area
+                assert reg in self.rm.REGLOC_TO_COPY_AREA_OFS
+                area_offset = self.rm.REGLOC_TO_COPY_AREA_OFS[reg]
+                gcrootmap.add_frame_offset(shape, area_offset)
+        #
         return gcrootmap.compress_callshape(shape,
                                             self.assembler.datablockwrapper)
 
     def consider_force_token(self, op):
-        loc = self.rm.force_allocate_reg(op.result)
-        self.Perform(op, [], loc)
+        # the FORCE_TOKEN operation returns directly 'ebp'
+        self.rm.force_allocate_frame_reg(op.result)
 
     def not_implemented_op(self, op):
         not_implemented("not implemented operation: %s" % op.getopname())
diff --git a/pypy/jit/backend/x86/regloc.py b/pypy/jit/backend/x86/regloc.py
--- a/pypy/jit/backend/x86/regloc.py
+++ b/pypy/jit/backend/x86/regloc.py
@@ -476,6 +476,7 @@
 
     AND = _binaryop('AND')
     OR  = _binaryop('OR')
+    OR8 = _binaryop('OR8')
     XOR = _binaryop('XOR')
     NOT = _unaryop('NOT')
     SHL = _binaryop('SHL')
@@ -483,6 +484,7 @@
     SAR = _binaryop('SAR')
     TEST = _binaryop('TEST')
     TEST8 = _binaryop('TEST8')
+    BTS = _binaryop('BTS')
 
     ADD = _binaryop('ADD')
     SUB = _binaryop('SUB')
@@ -519,6 +521,8 @@
     UCOMISD = _binaryop('UCOMISD')
     CVTSI2SD = _binaryop('CVTSI2SD')
     CVTTSD2SI = _binaryop('CVTTSD2SI')
+    CVTSD2SS = _binaryop('CVTSD2SS')
+    CVTSS2SD = _binaryop('CVTSS2SD')
     
     SQRTSD = _binaryop('SQRTSD')
 
@@ -532,6 +536,8 @@
     PXOR  = _binaryop('PXOR')
     PCMPEQD = _binaryop('PCMPEQD')
 
+    MOVD = _binaryop('MOVD')
+
     CALL = _relative_unaryop('CALL')
     JMP = _relative_unaryop('JMP')
 
diff --git a/pypy/jit/backend/x86/runner.py b/pypy/jit/backend/x86/runner.py
--- a/pypy/jit/backend/x86/runner.py
+++ b/pypy/jit/backend/x86/runner.py
@@ -19,6 +19,7 @@
 class AbstractX86CPU(AbstractLLCPU):
     debug = True
     supports_floats = True
+    supports_singlefloats = True
 
     BOOTSTRAP_TP = lltype.FuncType([], lltype.Signed)
     dont_keepalive_stuff = False # for tests
@@ -118,7 +119,8 @@
             setitem(index, null)
 
     def get_latest_force_token(self):
-        return self.assembler.fail_ebp + FORCE_INDEX_OFS
+        # the FORCE_TOKEN operation and this helper both return 'ebp'.
+        return self.assembler.fail_ebp
 
     def execute_token(self, executable_token):
         addr = executable_token._x86_bootstrap_code
@@ -152,8 +154,9 @@
                                        flavor='raw', zero=True,
                                        immortal=True)
 
-    def force(self, addr_of_force_index):
+    def force(self, addr_of_force_token):
         TP = rffi.CArrayPtr(lltype.Signed)
+        addr_of_force_index = addr_of_force_token + FORCE_INDEX_OFS
         fail_index = rffi.cast(TP, addr_of_force_index)[0]
         assert fail_index >= 0, "already forced!"
         faildescr = self.get_fail_descr_from_number(fail_index)
@@ -163,7 +166,7 @@
         # start of "no gc operation!" block
         fail_index_2 = self.assembler.grab_frame_values(
             bytecode,
-            addr_of_force_index - FORCE_INDEX_OFS,
+            addr_of_force_token,
             self.all_null_registers)
         self.assembler.leave_jitted_hook()
         # end of "no gc operation!" block
diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -464,7 +464,7 @@
 
     # ------------------------------ MOV ------------------------------
 
-    MOV_ri = insn(rex_w, register(1), '\xB8', immediate(2, 'q'))
+    MOV_ri = insn(register(1), '\xB8', immediate(2))
     MOV8_ri = insn(rex_fw, byte_register(1), '\xB0', immediate(2, 'b'))
 
     # ------------------------------ Arithmetic ------------------------------
@@ -496,6 +496,10 @@
     AND8_rr = insn(rex_fw, '\x20', byte_register(1), byte_register(2,8), '\xC0')
 
     OR8_rr = insn(rex_fw, '\x08', byte_register(1), byte_register(2,8), '\xC0')
+    OR8_mi = insn(rex_fw, '\x80', orbyte(1<<3), mem_reg_plus_const(1),
+                  immediate(2, 'b'))
+    OR8_ji = insn(rex_fw, '\x80', orbyte(1<<3), abs_, immediate(1),
+                  immediate(2, 'b'))
 
     NEG_r = insn(rex_w, '\xF7', register(1), '\xD8')
 
@@ -523,6 +527,7 @@
 
     NOP = insn('\x90')
     RET = insn('\xC3')
+    RET16_i = insn('\xC2', immediate(1, 'h'))
 
     PUSH_r = insn(rex_nw, register(1), '\x50')
     PUSH_b = insn(rex_nw, '\xFF', orbyte(6<<3), stack_bp(1))
@@ -565,8 +570,12 @@
     TEST8_ji = insn(rex_nw, '\xF6', orbyte(0<<3), abs_, immediate(1), immediate(2, 'b'))
     TEST_rr = insn(rex_w, '\x85', register(2,8), register(1), '\xC0')
 
+    BTS_mr = insn(rex_w, '\x0F\xAB', register(2,8), mem_reg_plus_const(1))
+    BTS_jr = insn(rex_w, '\x0F\xAB', register(2,8), abs_, immediate(1))
+
     # x87 instructions
-    FSTP_b = insn('\xDD', orbyte(3<<3), stack_bp(1))
+    FSTPL_b = insn('\xDD', orbyte(3<<3), stack_bp(1)) # rffi.DOUBLE ('as' wants L??)
+    FSTPS_s = insn('\xD9', orbyte(3<<3), stack_sp(1)) # lltype.SingleFloat
 
     # ------------------------------ Random mess -----------------------
     RDTSC = insn('\x0F\x31')
@@ -583,8 +592,18 @@
     CVTTSD2SI_rx = xmminsn('\xF2', rex_w, '\x0F\x2C', register(1, 8), register(2), '\xC0')
     CVTTSD2SI_rb = xmminsn('\xF2', rex_w, '\x0F\x2C', register(1, 8), stack_bp(2))
 
-    MOVD_rx = xmminsn('\x66', rex_w, '\x0F\x7E', register(2, 8), register(1), '\xC0')
-    MOVD_xr = xmminsn('\x66', rex_w, '\x0F\x6E', register(1, 8), register(2), '\xC0')
+    CVTSD2SS_xx = xmminsn('\xF2', rex_nw, '\x0F\x5A',
+                          register(1, 8), register(2), '\xC0')
+    CVTSD2SS_xb = xmminsn('\xF2', rex_nw, '\x0F\x5A',
+                          register(1, 8), stack_bp(2))
+    CVTSS2SD_xx = xmminsn('\xF3', rex_nw, '\x0F\x5A',
+                          register(1, 8), register(2), '\xC0')
+    CVTSS2SD_xb = xmminsn('\xF3', rex_nw, '\x0F\x5A',


More information about the pypy-commit mailing list