[pypy-commit] pypy jit-short_from_state: hg merge default

Fri Aug 5 14:42:49 CEST 2011

Author: Hakan Ardo <hakan at debian.org>
Branch: jit-short_from_state
Changeset: r46287:314b7370dbf0
Date: 2011-08-05 14:13 +0200
http://bitbucket.org/pypy/pypy/changeset/314b7370dbf0/

Log:	hg merge default

diff --git a/.hgtags b/.hgtags
--- a/.hgtags
+++ b/.hgtags
@@ -1,1 +1,2 @@
 b590cf6de4190623aad9aa698694c22e614d67b9 release-1.5
+b48df0bf4e75b81d98f19ce89d4a7dc3e1dab5e5 benchmarked
diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -37,22 +37,22 @@
     Armin Rigo
     Maciej Fijalkowski
     Carl Friedrich Bolz
+    Antonio Cuni
     Amaury Forgeot d'Arc
-    Antonio Cuni
     Samuele Pedroni
     Michael Hudson
     Holger Krekel
+    Benjamin Peterson
     Christian Tismer
-    Benjamin Peterson
+    Hakan Ardo
+    Alex Gaynor
     Eric van Riet Paap
-    Anders Chrigström
-    Håkan Ardö
+    Anders Chrigstrom
+    David Schneider
     Richard Emslie
     Dan Villiom Podlaski Christiansen
     Alexander Schremmer
-    Alex Gaynor
-    David Schneider
-    Aurelién Campeas
+    Aurelien Campeas
     Anders Lehmann
     Camillo Bruni
     Niklaus Haldimann
@@ -63,16 +63,17 @@
     Bartosz Skowron
     Jakub Gustak
     Guido Wesdorp
+    Daniel Roberts
     Adrien Di Mascio
     Laura Creighton
     Ludovic Aubry
     Niko Matsakis
-    Daniel Roberts
     Jason Creighton
-    Jacob Hallén
+    Jacob Hallen
     Alex Martelli
     Anders Hammarquist
     Jan de Mooij
+    Wim Lavrijsen
     Stephan Diehl
     Michael Foord
     Stefan Schwarzer
@@ -83,9 +84,13 @@
     Alexandre Fayolle
     Marius Gedminas
     Simon Burton
+    Justin Peel
     Jean-Paul Calderone
     John Witulski
+    Lukas Diekmann
+    holger krekel
     Wim Lavrijsen
+    Dario Bertini
     Andreas Stührk
     Jean-Philippe St. Pierre
     Guido van Rossum
@@ -97,15 +102,16 @@
     Georg Brandl
     Gerald Klix
     Wanja Saatkamp
+    Ronny Pfannschmidt
     Boris Feigin
     Oscar Nierstrasz
-    Dario Bertini
     David Malcolm
     Eugene Oden
     Henry Mason
+    Sven Hager
     Lukas Renggli
+    Ilya Osadchiy
     Guenter Jantzen
-    Ronny Pfannschmidt
     Bert Freudenberg
     Amit Regmi
     Ben Young
@@ -122,8 +128,8 @@
     Jared Grubb
     Karl Bartel
     Gabriel Lavoie
+    Victor Stinner
     Brian Dorsey
-    Victor Stinner
     Stuart Williams
     Toby Watson
     Antoine Pitrou
@@ -134,19 +140,23 @@
     Jonathan David Riehl
     Elmo Mäntynen
     Anders Qvist
-    Beatrice Düring
+    Beatrice During
     Alexander Sedov
+    Timo Paulssen
+    Corbin Simpson
     Vincent Legoll
+    Romain Guillebert
     Alan McIntyre
-    Romain Guillebert
     Alex Perry
     Jens-Uwe Mager
+    Simon Cross
     Dan Stromberg
-    Lukas Diekmann
+    Guillebert Romain
     Carl Meyer
     Pieter Zieschang
     Alejandro J. Cura
     Sylvain Thenault
+    Christoph Gerum
     Travis Francis Athougies
     Henrik Vendelbo
     Lutz Paelike
@@ -157,6 +167,7 @@
     Miguel de Val Borro
     Ignas Mikalajunas
     Artur Lisiecki
+    Philip Jenvey
     Joshua Gilbert
     Godefroid Chappelle
     Yusei Tahara
@@ -165,27 +176,31 @@
     Gustavo Niemeyer
     William Leslie
     Akira Li
-    Kristján Valur Jónsson
+    Kristjan Valur Jonsson
     Bobby Impollonia
+    Michael Hudson-Doyle
     Andrew Thompson
     Anders Sigfridsson
+    Floris Bruynooghe
     Jacek Generowicz
     Dan Colish
-    Sven Hager
     Zooko Wilcox-O Hearn
+    Dan Villiom Podlaski Christiansen
     Anders Hammarquist
+    Chris Lambacher
     Dinu Gherman
     Dan Colish
+    Brett Cannon
     Daniel Neuhäuser
     Michael Chermside
     Konrad Delong
     Anna Ravencroft
     Greg Price
     Armin Ronacher
+    Christian Muirhead
     Jim Baker
-    Philip Jenvey
     Rodrigo Araújo
-    Brett Cannon
+    Romain Guillebert
 
     Heinrich-Heine University, Germany 
     Open End AB (formerly AB Strakt), Sweden
diff --git a/lib-python/conftest.py b/lib-python/conftest.py
--- a/lib-python/conftest.py
+++ b/lib-python/conftest.py
@@ -154,18 +154,18 @@
     RegrTest('test_cmd.py'),
     RegrTest('test_cmd_line_script.py'),
     RegrTest('test_codeccallbacks.py', core=True),
-    RegrTest('test_codecencodings_cn.py'),
-    RegrTest('test_codecencodings_hk.py'),
-    RegrTest('test_codecencodings_jp.py'),
-    RegrTest('test_codecencodings_kr.py'),
-    RegrTest('test_codecencodings_tw.py'),
+    RegrTest('test_codecencodings_cn.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_hk.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_jp.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_kr.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecencodings_tw.py', usemodules='_multibytecodec'),
 
-    RegrTest('test_codecmaps_cn.py'),
-    RegrTest('test_codecmaps_hk.py'),
-    RegrTest('test_codecmaps_jp.py'),
-    RegrTest('test_codecmaps_kr.py'),
-    RegrTest('test_codecmaps_tw.py'),
-    RegrTest('test_codecs.py', core=True),
+    RegrTest('test_codecmaps_cn.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_hk.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_jp.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_kr.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecmaps_tw.py', usemodules='_multibytecodec'),
+    RegrTest('test_codecs.py', core=True, usemodules='_multibytecodec'),
     RegrTest('test_codeop.py', core=True),
     RegrTest('test_coercion.py', core=True),
     RegrTest('test_collections.py'),
@@ -314,7 +314,7 @@
     RegrTest('test_mmap.py'),
     RegrTest('test_module.py', core=True),
     RegrTest('test_modulefinder.py'),
-    RegrTest('test_multibytecodec.py'),
+    RegrTest('test_multibytecodec.py', usemodules='_multibytecodec'),
     RegrTest('test_multibytecodec_support.py', skip="not a test"),
     RegrTest('test_multifile.py'),
     RegrTest('test_multiprocessing.py', skip='FIXME leaves subprocesses'),
diff --git a/lib-python/modified-2.7/ctypes/__init__.py b/lib-python/modified-2.7/ctypes/__init__.py
--- a/lib-python/modified-2.7/ctypes/__init__.py
+++ b/lib-python/modified-2.7/ctypes/__init__.py
@@ -489,9 +489,12 @@
         _flags_ = _FUNCFLAG_CDECL | _FUNCFLAG_PYTHONAPI
     return CFunctionType
 
-_cast = PYFUNCTYPE(py_object, c_void_p, py_object, py_object)(_cast_addr)
 def cast(obj, typ):
-    return _cast(obj, obj, typ)
+    try:
+        c_void_p.from_param(obj)
+    except TypeError, e:
+        raise ArgumentError(str(e))
+    return _cast_addr(obj, obj, typ)
 
 _string_at = PYFUNCTYPE(py_object, c_void_p, c_int)(_string_at_addr)
 def string_at(ptr, size=-1):
diff --git a/lib-python/modified-2.7/test/test_multibytecodec.py b/lib-python/modified-2.7/test/test_multibytecodec.py
--- a/lib-python/modified-2.7/test/test_multibytecodec.py
+++ b/lib-python/modified-2.7/test/test_multibytecodec.py
@@ -148,7 +148,8 @@
 class Test_StreamReader(unittest.TestCase):
     def test_bug1728403(self):
         try:
-            open(TESTFN, 'w').write('\xa1')
+            with open(TESTFN, 'w') as f:
+                f.write('\xa1')
             f = codecs.open(TESTFN, encoding='cp949')
             self.assertRaises(UnicodeDecodeError, f.read, 2)
         finally:
diff --git a/lib_pypy/_ctypes/function.py b/lib_pypy/_ctypes/function.py
--- a/lib_pypy/_ctypes/function.py
+++ b/lib_pypy/_ctypes/function.py
@@ -78,8 +78,6 @@
     _com_iid = None
     _is_fastpath = False
 
-    __restype_set = False
-
     def _getargtypes(self):
         return self._argtypes_
 
@@ -93,13 +91,15 @@
                     raise TypeError(
                         "item %d in _argtypes_ has no from_param method" % (
                             i + 1,))
-            #
-            if all([hasattr(argtype, '_ffiargshape') for argtype in argtypes]):
-                fastpath_cls = make_fastpath_subclass(self.__class__)
-                fastpath_cls.enable_fastpath_maybe(self)
             self._argtypes_ = list(argtypes)
+            self._check_argtypes_for_fastpath()
     argtypes = property(_getargtypes, _setargtypes)
 
+    def _check_argtypes_for_fastpath(self):
+        if all([hasattr(argtype, '_ffiargshape') for argtype in self._argtypes_]):
+            fastpath_cls = make_fastpath_subclass(self.__class__)
+            fastpath_cls.enable_fastpath_maybe(self)
+
     def _getparamflags(self):
         return self._paramflags
 
@@ -149,7 +149,6 @@
         return self._restype_
 
     def _setrestype(self, restype):
-        self.__restype_set = True
         self._ptr = None
         if restype is int:
             from ctypes import c_int
@@ -219,6 +218,7 @@
                 import ctypes
                 restype = ctypes.c_int
             self._ptr = self._getfuncptr_fromaddress(self._argtypes_, restype)
+            self._check_argtypes_for_fastpath()
             return
 
         
@@ -296,13 +296,12 @@
                     "This function takes %d argument%s (%s given)"
                     % (len(self._argtypes_), plural, len(args)))
 
-            # check that arguments are convertible
-            ## XXX Not as long as ctypes.cast is a callback function with
-            ## py_object arguments...
-            ## self._convert_args(self._argtypes_, args, {})
-
             try:
-                res = self.callable(*args)
+                newargs = self._convert_args_for_callback(argtypes, args)
+            except (UnicodeError, TypeError, ValueError), e:
+                raise ArgumentError(str(e))
+            try:
+                res = self.callable(*newargs)
             except:
                 exc_info = sys.exc_info()
                 traceback.print_tb(exc_info[2], file=sys.stderr)
@@ -316,10 +315,6 @@
             warnings.warn('C function without declared arguments called',
                           RuntimeWarning, stacklevel=2)
             argtypes = []
-            
-        if not self.__restype_set:
-            warnings.warn('C function without declared return type called',
-                          RuntimeWarning, stacklevel=2)
 
         if self._com_index:
             from ctypes import cast, c_void_p, POINTER
@@ -366,7 +361,10 @@
             if self._flags_ & _rawffi.FUNCFLAG_USE_LASTERROR:
                 set_last_error(_rawffi.get_last_error())
         #
-        return self._build_result(self._restype_, result, newargs)
+        try:
+            return self._build_result(self._restype_, result, newargs)
+        finally:
+            funcptr.free_temp_buffers()
 
     def _do_errcheck(self, result, args):
         # The 'errcheck' protocol
@@ -466,6 +464,18 @@
 
         return cobj, cobj._to_ffi_param(), type(cobj)
 
+    def _convert_args_for_callback(self, argtypes, args):
+        assert len(argtypes) == len(args)
+        newargs = []
+        for argtype, arg in zip(argtypes, args):
+            param = argtype.from_param(arg)
+            if argtype._type_ == 'P': # special-case for c_void_p
+                param = param._get_buffer_value()
+            elif self._is_primitive(argtype):
+                param = param.value
+            newargs.append(param)
+        return newargs
+
     def _convert_args(self, argtypes, args, kwargs, marker=object()):
         newargs = []
         outargs = []
@@ -556,6 +566,9 @@
                 newargtypes.append(newargtype)
         return keepalives, newargs, newargtypes, outargs
 
+    @staticmethod
+    def _is_primitive(argtype):
+        return argtype.__bases__[0] is _SimpleCData
     
     def _wrap_result(self, restype, result):
         """
@@ -564,7 +577,7 @@
         """
         # hack for performance: if restype is a "simple" primitive type, don't
         # allocate the buffer because it's going to be thrown away immediately
-        if restype.__bases__[0] is _SimpleCData and not restype._is_pointer_like():
+        if self._is_primitive(restype) and not restype._is_pointer_like():
             return result
         #
         shape = restype._ffishape
@@ -680,7 +693,7 @@
             try:
                 result = self._call_funcptr(funcptr, *args)
                 result = self._do_errcheck(result, args)
-            except (TypeError, ArgumentError): # XXX, should be FFITypeError
+            except (TypeError, ArgumentError, UnicodeDecodeError):
                 assert self._slowpath_allowed
                 return CFuncPtr.__call__(self, *args)
             return result
diff --git a/lib_pypy/_ctypes/primitive.py b/lib_pypy/_ctypes/primitive.py
--- a/lib_pypy/_ctypes/primitive.py
+++ b/lib_pypy/_ctypes/primitive.py
@@ -10,6 +10,8 @@
 from _ctypes.builtin import ConvMode
 from _ctypes.array import Array
 from _ctypes.pointer import _Pointer, as_ffi_pointer
+#from _ctypes.function import CFuncPtr # this import is moved at the bottom
+                                       # because else it's circular
 
 class NULL(object):
     pass
@@ -86,7 +88,7 @@
         return res
     if isinstance(value, Array):
         return value
-    if isinstance(value, _Pointer):
+    if isinstance(value, (_Pointer, CFuncPtr)):
         return cls.from_address(value._buffer.buffer)
     if isinstance(value, (int, long)):
         return cls(value)
@@ -338,3 +340,5 @@
 
     def __nonzero__(self):
         return self._buffer[0] not in (0, '\x00')
+
+from _ctypes.function import CFuncPtr
diff --git a/lib_pypy/_ctypes/structure.py b/lib_pypy/_ctypes/structure.py
--- a/lib_pypy/_ctypes/structure.py
+++ b/lib_pypy/_ctypes/structure.py
@@ -34,16 +34,18 @@
     for i, field in enumerate(all_fields):
         name = field[0]
         value = field[1]
+        is_bitfield = (len(field) == 3)
         fields[name] = Field(name,
                              self._ffistruct.fieldoffset(name),
                              self._ffistruct.fieldsize(name),
-                             value, i)
+                             value, i, is_bitfield)
 
     if anonymous_fields:
         resnames = []
         for i, field in enumerate(all_fields):
             name = field[0]
             value = field[1]
+            is_bitfield = (len(field) == 3)
             startpos = self._ffistruct.fieldoffset(name)
             if name in anonymous_fields:
                 for subname in value._names:
@@ -52,7 +54,7 @@
                     subvalue = value._fieldtypes[subname].ctype
                     fields[subname] = Field(subname,
                                             relpos, subvalue._sizeofinstances(),
-                                            subvalue, i)
+                                            subvalue, i, is_bitfield)
             else:
                 resnames.append(name)
         names = resnames
@@ -60,8 +62,8 @@
     self._fieldtypes = fields
 
 class Field(object):
-    def __init__(self, name, offset, size, ctype, num):
-        for k in ('name', 'offset', 'size', 'ctype', 'num'):
+    def __init__(self, name, offset, size, ctype, num, is_bitfield):
+        for k in ('name', 'offset', 'size', 'ctype', 'num', 'is_bitfield'):
             self.__dict__[k] = locals()[k]
 
     def __setattr__(self, name, value):
@@ -225,7 +227,7 @@
             field = self._fieldtypes[name]
         except KeyError:
             return _CData.__getattribute__(self, name)
-        if field.size >> 16:
+        if field.is_bitfield:
             # bitfield member, use direct access
             return self._buffer.__getattr__(name)
         else:
diff --git a/pypy/annotation/builtin.py b/pypy/annotation/builtin.py
--- a/pypy/annotation/builtin.py
+++ b/pypy/annotation/builtin.py
@@ -416,7 +416,8 @@
 from pypy.annotation.model import SomePtr
 from pypy.rpython.lltypesystem import lltype
 
-def malloc(s_T, s_n=None, s_flavor=None, s_zero=None, s_track_allocation=None):
+def malloc(s_T, s_n=None, s_flavor=None, s_zero=None, s_track_allocation=None,
+           s_add_memory_pressure=None):
     assert (s_n is None or s_n.knowntype == int
             or issubclass(s_n.knowntype, pypy.rlib.rarithmetic.base_int))
     assert s_T.is_constant()
@@ -432,6 +433,8 @@
     else:
         assert s_flavor.is_constant()
         assert s_track_allocation is None or s_track_allocation.is_constant()
+        assert (s_add_memory_pressure is None or
+                s_add_memory_pressure.is_constant())
         # not sure how to call malloc() for the example 'p' in the
         # presence of s_extraargs
         r = SomePtr(lltype.Ptr(s_T.const))
diff --git a/pypy/config/translationoption.py b/pypy/config/translationoption.py
--- a/pypy/config/translationoption.py
+++ b/pypy/config/translationoption.py
@@ -13,6 +13,10 @@
 DEFL_LOW_INLINE_THRESHOLD = DEFL_INLINE_THRESHOLD / 2.0
 
 DEFL_GC = "minimark"
+if sys.platform.startswith("linux"):
+    DEFL_ROOTFINDER_WITHJIT = "asmgcc"
+else:
+    DEFL_ROOTFINDER_WITHJIT = "shadowstack"
 
 IS_64_BITS = sys.maxint > 2147483647
 
@@ -109,7 +113,7 @@
     BoolOption("jit", "generate a JIT",
                default=False,
                suggests=[("translation.gc", DEFL_GC),
-                         ("translation.gcrootfinder", "asmgcc"),
+                         ("translation.gcrootfinder", DEFL_ROOTFINDER_WITHJIT),
                          ("translation.list_comprehension_operations", True)]),
     ChoiceOption("jit_backend", "choose the backend for the JIT",
                  ["auto", "x86", "x86-without-sse2", "llvm"],
diff --git a/pypy/doc/contributor.rst b/pypy/doc/contributor.rst
--- a/pypy/doc/contributor.rst
+++ b/pypy/doc/contributor.rst
@@ -9,22 +9,22 @@
     Armin Rigo
     Maciej Fijalkowski
     Carl Friedrich Bolz
+    Antonio Cuni
     Amaury Forgeot d'Arc
-    Antonio Cuni
     Samuele Pedroni
     Michael Hudson
     Holger Krekel
+    Benjamin Peterson
     Christian Tismer
-    Benjamin Peterson
+    Hakan Ardo
+    Alex Gaynor
     Eric van Riet Paap
-    Anders Chrigström
-    Håkan Ardö
+    Anders Chrigstrom
+    David Schneider
     Richard Emslie
     Dan Villiom Podlaski Christiansen
     Alexander Schremmer
-    Alex Gaynor
-    David Schneider
-    Aurelién Campeas
+    Aurelien Campeas
     Anders Lehmann
     Camillo Bruni
     Niklaus Haldimann
@@ -35,16 +35,17 @@
     Bartosz Skowron
     Jakub Gustak
     Guido Wesdorp
+    Daniel Roberts
     Adrien Di Mascio
     Laura Creighton
     Ludovic Aubry
     Niko Matsakis
-    Daniel Roberts
     Jason Creighton
-    Jacob Hallén
+    Jacob Hallen
     Alex Martelli
     Anders Hammarquist
     Jan de Mooij
+    Wim Lavrijsen
     Stephan Diehl
     Michael Foord
     Stefan Schwarzer
@@ -55,9 +56,13 @@
     Alexandre Fayolle
     Marius Gedminas
     Simon Burton
+    Justin Peel
     Jean-Paul Calderone
     John Witulski
+    Lukas Diekmann
+    holger krekel
     Wim Lavrijsen
+    Dario Bertini
     Andreas Stührk
     Jean-Philippe St. Pierre
     Guido van Rossum
@@ -69,15 +74,16 @@
     Georg Brandl
     Gerald Klix
     Wanja Saatkamp
+    Ronny Pfannschmidt
     Boris Feigin
     Oscar Nierstrasz
-    Dario Bertini
     David Malcolm
     Eugene Oden
     Henry Mason
+    Sven Hager
     Lukas Renggli
+    Ilya Osadchiy
     Guenter Jantzen
-    Ronny Pfannschmidt
     Bert Freudenberg
     Amit Regmi
     Ben Young
@@ -94,8 +100,8 @@
     Jared Grubb
     Karl Bartel
     Gabriel Lavoie
+    Victor Stinner
     Brian Dorsey
-    Victor Stinner
     Stuart Williams
     Toby Watson
     Antoine Pitrou
@@ -106,19 +112,23 @@
     Jonathan David Riehl
     Elmo Mäntynen
     Anders Qvist
-    Beatrice Düring
+    Beatrice During
     Alexander Sedov
+    Timo Paulssen
+    Corbin Simpson
     Vincent Legoll
+    Romain Guillebert
     Alan McIntyre
-    Romain Guillebert
     Alex Perry
     Jens-Uwe Mager
+    Simon Cross
     Dan Stromberg
-    Lukas Diekmann
+    Guillebert Romain
     Carl Meyer
     Pieter Zieschang
     Alejandro J. Cura
     Sylvain Thenault
+    Christoph Gerum
     Travis Francis Athougies
     Henrik Vendelbo
     Lutz Paelike
@@ -129,6 +139,7 @@
     Miguel de Val Borro
     Ignas Mikalajunas
     Artur Lisiecki
+    Philip Jenvey
     Joshua Gilbert
     Godefroid Chappelle
     Yusei Tahara
@@ -137,24 +148,29 @@
     Gustavo Niemeyer
     William Leslie
     Akira Li
-    Kristján Valur Jónsson
+    Kristjan Valur Jonsson
     Bobby Impollonia
+    Michael Hudson-Doyle
     Andrew Thompson
     Anders Sigfridsson
+    Floris Bruynooghe
     Jacek Generowicz
     Dan Colish
-    Sven Hager
     Zooko Wilcox-O Hearn
+    Dan Villiom Podlaski Christiansen
     Anders Hammarquist
+    Chris Lambacher
     Dinu Gherman
     Dan Colish
+    Brett Cannon
     Daniel Neuhäuser
     Michael Chermside
     Konrad Delong
     Anna Ravencroft
     Greg Price
     Armin Ronacher
+    Christian Muirhead
     Jim Baker
-    Philip Jenvey
     Rodrigo Araújo
+    Romain Guillebert
 
diff --git a/pypy/doc/how-to-release.rst b/pypy/doc/how-to-release.rst
--- a/pypy/doc/how-to-release.rst
+++ b/pypy/doc/how-to-release.rst
@@ -21,8 +21,8 @@
 Release Steps
 ----------------
 
-* at code freeze make a release branch under
-  http://codepeak.net/svn/pypy/release/x.y(.z). IMPORTANT: bump the
+* at code freeze make a release branch using release-x.x.x in mercurial
+  IMPORTANT: bump the
   pypy version number in module/sys/version.py and in
   module/cpyext/include/patchlevel.h, notice that the branch
   will capture the revision number of this change for the release;
@@ -48,12 +48,6 @@
   the release announcement should contain a direct link to the download page
 * update pypy.org (under extradoc/pypy.org), rebuild and commit
 
-* update http://codespeak.net/pypy/trunk:
-   code0> + chmod -R yourname:users /www/codespeak.net/htdocs/pypy/trunk
-   local> cd ..../pypy/doc && py.test
-   local> cd ..../pypy
-   local> rsync -az doc codespeak.net:/www/codespeak.net/htdocs/pypy/trunk/pypy/
-
 * post announcement on morepypy.blogspot.com
 * send announcements to pypy-dev, python-list,
   python-announce, python-dev ...
diff --git a/pypy/doc/release-1.6.0.rst b/pypy/doc/release-1.6.0.rst
new file mode 100644
--- /dev/null
+++ b/pypy/doc/release-1.6.0.rst
@@ -0,0 +1,85 @@
+===========================
+PyPy 1.6 - faster than ever
+===========================
+
+We're pleased to announce the 1.6 release of PyPy. This release brings a lot
+of bugfixes and performance improvements over 1.5, and improves support for
+Windows 32bit and OS X 64bit. This version fully implements Python 2.7.1 and
+has beta level support for loading CPython C extensions.  You can download it
+here:
+
+    http://pypy.org/download.html
+
+What is PyPy?
+=============
+
+PyPy is a very compliant Python interpreter, almost a drop-in replacement for
+CPython 2.7.1. It's fast (`pypy 1.5 and cpython 2.6.2`_ performance comparison)
+due to its integrated tracing JIT compiler. XXX: compare to 2.7.1
+
+This release supports x86 machines running Linux 32/64 or Mac OS X.  Windows 32
+is beta (it roughly works but a lot of small issues have not been fixed so
+far).  Windows 64 is not yet supported.
+
+The main topics of this release are speed and stability: on average, PyPy 1.6
+is between 20% and 30% faster than PyPy 1.5, and overall it's 4.3 times faster
+than CPython when running our set of benchmarks.
+
+The speed improvements have been made possible by optimizing many of the
+layers which compose PyPy.  In particular, we improved: the Garbage Collector,
+the JIT warmup time, the optimizations performed by the JIT, the quality of
+the generated machine code and the implementation of our Python interpreter.
+
+
+Highlights
+==========
+
+* Numerous performance improvements, overall giving considerable speedups:
+
+  - better GC behavior when dealing with very large objects and arrays
+
+  - `fast ctypes`_: now calls to ctypes functions are seen and optimized
+    by the JIT, and they are up to 60 times faster than PyPy 1.5 and 10 times
+    faster than CPython
+
+  - improved generators(1): simple generators now are inlined into the caller
+    loop, making performance up to 3.5 times faster than PyPy 1.5.
+
+  - improved generators(2): thanks to other optimizations, even generators
+    that are not inlined are between 10% and 20% faster than PyPy 1.5.
+
+  - faster warmup time for the JIT
+
+  - JIT support for single floats (e.g., for ``array('f')``)
+
+  - optimized dictionaries: the internal representation of dictionaries is now
+    dynamically selected depending on the type of stored objects, resulting in
+    faster code and smaller memory footprint.  For example, dictionaries whose
+    keys are all strings, or all integers.
+
+* JitViewer: this is the first official release which includes the JitViewer,
+  a web-based tool which helps you to see which parts of your Python code have
+  been compiled by the JIT, down until the assembler. XXX: publish a public
+  demo?
+
+* The CPython extension module API has been improved and now supports many
+  more extensions. For information on which ones are supported, please refer to
+  our `compatibility wiki`_.
+
+* Multibyte encoding support: this was one of the last areas in which we were
+  still behind CPython, but now we fully support them. (XXX: is that true?)
+
+* Preliminary support for NumPy: this release includes a preview of a very
+  fast NumPy module integrated with the PyPy JIT.  Unfortunately, this does
+  not mean that you can expect to take an existing NumPy program and run it on
+  PyPy, because the module is still unfinished and supports only some of the
+  numpy API.  However, what works is blazingly fast :-)
+
+* Bugfixes: since the 1.5 release we fixed 53 bugs in our `bug tracker`_, not
+  counting the numerous bugs that were found and reported through other
+  channels than the bug tracker.
+
+Cheers,
+
+Carl Friedrich Bolz, Laura Creighton, Antonio Cuni, Maciej Fijalkowski,
+Amaury Forgeot d'Arc, Alex Gaynor, Armin Rigo and the PyPy team
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -111,6 +111,9 @@
     def setslotvalue(self, index, w_val):
         raise NotImplementedError
 
+    def delslotvalue(self, index):
+        raise NotImplementedError
+
     def descr_call_mismatch(self, space, opname, RequiredClass, args):
         if RequiredClass is None:
             classname = '?'
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -64,7 +64,7 @@
                 self.visit_self(el[1], *args)
             else:
                 self.visit_function(el, *args)
-        else:
+        elif isinstance(el, type):
             for typ in self.bases_order:
                 if issubclass(el, typ):
                     visit = getattr(self, "visit__%s" % (typ.__name__,))
@@ -73,6 +73,8 @@
             else:
                 raise Exception("%s: no match for unwrap_spec element %s" % (
                     self.__class__.__name__, el))
+        else:
+            raise Exception("unable to dispatch, %s, perhaps your parameter should have started with w_?" % el)
 
     def apply_over(self, unwrap_spec, *extra):
         dispatch = self.dispatch
diff --git a/pypy/interpreter/typedef.py b/pypy/interpreter/typedef.py
--- a/pypy/interpreter/typedef.py
+++ b/pypy/interpreter/typedef.py
@@ -258,6 +258,11 @@
                     self.slots_w = [None] * nslots
             def setslotvalue(self, index, w_value):
                 self.slots_w[index] = w_value
+            def delslotvalue(self, index):
+                if self.slots_w[index] is None:
+                    return False
+                self.slots_w[index] = None
+                return True
             def getslotvalue(self, index):
                 return self.slots_w[index]
         add(Proto)
@@ -530,11 +535,10 @@
         """member.__delete__(obj)
         Delete the value of the slot 'member' from the given 'obj'."""
         self.typecheck(space, w_obj)
-        w_oldresult = w_obj.getslotvalue(self.index)
-        if w_oldresult is None:
+        success = w_obj.delslotvalue(self.index)
+        if not success:
             raise OperationError(space.w_AttributeError,
                                  space.wrap(self.name)) # XXX better message
-        w_obj.setslotvalue(self.index, None)
 
 Member.typedef = TypeDef(
     "member_descriptor",
diff --git a/pypy/jit/backend/llgraph/llimpl.py b/pypy/jit/backend/llgraph/llimpl.py
--- a/pypy/jit/backend/llgraph/llimpl.py
+++ b/pypy/jit/backend/llgraph/llimpl.py
@@ -1071,6 +1071,8 @@
         return heaptracker.adr2int(llmemory.cast_ptr_to_adr(x))
     if TP == llmemory.Address:
         return heaptracker.adr2int(x)
+    if TP is lltype.SingleFloat:
+        return longlong.singlefloat2int(x)
     return lltype.cast_primitive(lltype.Signed, x)
 
 def cast_from_int(TYPE, x):
@@ -1086,6 +1088,9 @@
             x = llmemory.cast_int_to_adr(x)
         assert lltype.typeOf(x) == llmemory.Address
         return x
+    elif TYPE is lltype.SingleFloat:
+        assert lltype.typeOf(x) is lltype.Signed
+        return longlong.int2singlefloat(x)
     else:
         if lltype.typeOf(x) == llmemory.Address:
             x = heaptracker.adr2int(x)
@@ -1140,6 +1145,7 @@
     del _future_values[:]
 
 def set_future_value_int(index, value):
+    assert lltype.typeOf(value) is lltype.Signed
     set_future_value_ref(index, value)
 
 def set_future_value_float(index, value):
@@ -1488,6 +1494,7 @@
     'i': lltype.Signed,
     'f': lltype.Float,
     'L': lltype.SignedLongLong,
+    'S': lltype.SingleFloat,
     'v': lltype.Void,
     }
 
diff --git a/pypy/jit/backend/llgraph/runner.py b/pypy/jit/backend/llgraph/runner.py
--- a/pypy/jit/backend/llgraph/runner.py
+++ b/pypy/jit/backend/llgraph/runner.py
@@ -91,6 +91,7 @@
 class BaseCPU(model.AbstractCPU):
     supports_floats = True
     supports_longlong = llimpl.IS_32_BIT
+    supports_singlefloats = True
 
     def __init__(self, rtyper, stats=None, opts=None,
                  translate_support_code=False,
@@ -327,12 +328,16 @@
 
     def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
         from pypy.jit.backend.llsupport.ffisupport import get_ffi_type_kind
+        from pypy.jit.backend.llsupport.ffisupport import UnsupportedKind
         arg_types = []
-        for arg in ffi_args:
-            kind = get_ffi_type_kind(arg)
-            if kind != history.VOID:
-                arg_types.append(kind)
-        reskind = get_ffi_type_kind(ffi_result)
+        try:
+            for arg in ffi_args:
+                kind = get_ffi_type_kind(self, arg)
+                if kind != history.VOID:
+                    arg_types.append(kind)
+            reskind = get_ffi_type_kind(self, ffi_result)
+        except UnsupportedKind:
+            return None
         return self.getdescr(0, reskind, extrainfo=extrainfo,
                              arg_types=''.join(arg_types))
 
diff --git a/pypy/jit/backend/llgraph/test/test_llgraph.py b/pypy/jit/backend/llgraph/test/test_llgraph.py
--- a/pypy/jit/backend/llgraph/test/test_llgraph.py
+++ b/pypy/jit/backend/llgraph/test/test_llgraph.py
@@ -19,6 +19,9 @@
     def setup_method(self, _):
         self.cpu = self.cpu_type(None)
 
+    def test_memoryerror(self):
+        py.test.skip("does not make much sense on the llgraph backend")
+
 
 def test_cast_adr_to_int_and_back():
     X = lltype.Struct('X', ('foo', lltype.Signed))
diff --git a/pypy/jit/backend/llsupport/descr.py b/pypy/jit/backend/llsupport/descr.py
--- a/pypy/jit/backend/llsupport/descr.py
+++ b/pypy/jit/backend/llsupport/descr.py
@@ -303,6 +303,8 @@
                 c = 'f'
             elif c == 'f' and longlong.supports_longlong:
                 return 'longlong.getrealfloat(%s)' % (process('L'),)
+            elif c == 'S':
+                return 'longlong.int2singlefloat(%s)' % (process('i'),)
             arg = 'args_%s[%d]' % (c, seen[c])
             seen[c] += 1
             return arg
@@ -318,6 +320,8 @@
                 return lltype.Void
             elif arg == 'L':
                 return lltype.SignedLongLong
+            elif arg == 'S':
+                return lltype.SingleFloat
             else:
                 raise AssertionError(arg)
 
@@ -334,6 +338,8 @@
             result = 'rffi.cast(lltype.SignedLongLong, res)'
         elif self.get_return_type() == history.VOID:
             result = 'None'
+        elif self.get_return_type() == 'S':
+            result = 'longlong.singlefloat2int(res)'
         else:
             assert 0
         source = py.code.Source("""
@@ -344,14 +350,15 @@
         """ % locals())
         ARGS = [TYPE(arg) for arg in self.arg_classes]
         FUNC = lltype.FuncType(ARGS, RESULT)
-        d = locals().copy()
-        d.update(globals())
+        d = globals().copy()
+        d.update(locals())
         exec source.compile() in d
         self.call_stub = d['call_stub']
 
     def verify_types(self, args_i, args_r, args_f, return_type):
         assert self._return_type in return_type
-        assert self.arg_classes.count('i') == len(args_i or ())
+        assert (self.arg_classes.count('i') +
+                self.arg_classes.count('S')) == len(args_i or ())
         assert self.arg_classes.count('r') == len(args_r or ())
         assert (self.arg_classes.count('f') +
                 self.arg_classes.count('L')) == len(args_f or ())
@@ -428,23 +435,39 @@
     def get_result_size(self, translate_support_code):
         return 0
 
+_SingleFloatCallDescr = None   # built lazily
+
 def getCallDescrClass(RESULT):
     if RESULT is lltype.Void:
         return VoidCallDescr
     if RESULT is lltype.Float:
         return FloatCallDescr
+    if RESULT is lltype.SingleFloat:
+        global _SingleFloatCallDescr
+        if _SingleFloatCallDescr is None:
+            assert rffi.sizeof(rffi.UINT) == rffi.sizeof(RESULT)
+            class SingleFloatCallDescr(getCallDescrClass(rffi.UINT)):
+                _clsname = 'SingleFloatCallDescr'
+                _return_type = 'S'
+            _SingleFloatCallDescr = SingleFloatCallDescr
+        return _SingleFloatCallDescr
     if is_longlong(RESULT):
         return LongLongCallDescr
     return getDescrClass(RESULT, BaseIntCallDescr, GcPtrCallDescr,
                          NonGcPtrCallDescr, 'Call', 'get_result_size',
                          Ellipsis,  # <= floatattrname should not be used here
                          '_is_result_signed')
+getCallDescrClass._annspecialcase_ = 'specialize:memo'
 
 def get_call_descr(gccache, ARGS, RESULT, extrainfo=None):
     arg_classes = []
     for ARG in ARGS:
         kind = getkind(ARG)
-        if   kind == 'int': arg_classes.append('i')
+        if   kind == 'int':
+            if ARG is lltype.SingleFloat:
+                arg_classes.append('S')
+            else:
+                arg_classes.append('i')
         elif kind == 'ref': arg_classes.append('r')
         elif kind == 'float':
             if is_longlong(ARG):
@@ -476,6 +499,9 @@
             return GcPtrDescr
         else:
             return NonGcPtrDescr
+    if TYPE is lltype.SingleFloat:
+        assert rffi.sizeof(rffi.UINT) == rffi.sizeof(TYPE)
+        TYPE = rffi.UINT
     try:
         return _cache[nameprefix, TYPE]
     except KeyError:
diff --git a/pypy/jit/backend/llsupport/ffisupport.py b/pypy/jit/backend/llsupport/ffisupport.py
--- a/pypy/jit/backend/llsupport/ffisupport.py
+++ b/pypy/jit/backend/llsupport/ffisupport.py
@@ -1,19 +1,21 @@
 from pypy.rlib.rarithmetic import intmask
 from pypy.jit.metainterp import history
-from pypy.jit.backend.llsupport.descr import DynamicIntCallDescr, NonGcPtrCallDescr,\
-    FloatCallDescr, VoidCallDescr
+from pypy.rpython.lltypesystem import rffi
+from pypy.jit.backend.llsupport.descr import (
+    DynamicIntCallDescr, NonGcPtrCallDescr, FloatCallDescr, VoidCallDescr,
+    LongLongCallDescr, getCallDescrClass)
 
 class UnsupportedKind(Exception):
     pass
 
-def get_call_descr_dynamic(ffi_args, ffi_result, extrainfo=None):
+def get_call_descr_dynamic(cpu, ffi_args, ffi_result, extrainfo=None):
     """Get a call descr: the types of result and args are represented by
     rlib.libffi.types.*"""
     try:
-        reskind = get_ffi_type_kind(ffi_result)
-        argkinds = [get_ffi_type_kind(arg) for arg in ffi_args]
+        reskind = get_ffi_type_kind(cpu, ffi_result)
+        argkinds = [get_ffi_type_kind(cpu, arg) for arg in ffi_args]
     except UnsupportedKind:
-        return None # ??
+        return None
     arg_classes = ''.join(argkinds)
     if reskind == history.INT:
         size = intmask(ffi_result.c_size)
@@ -25,17 +27,26 @@
         return FloatCallDescr(arg_classes, extrainfo)
     elif reskind == history.VOID:
         return VoidCallDescr(arg_classes, extrainfo)
+    elif reskind == 'L':
+        return LongLongCallDescr(arg_classes, extrainfo)
+    elif reskind == 'S':
+        SingleFloatCallDescr = getCallDescrClass(rffi.FLOAT)
+        return SingleFloatCallDescr(arg_classes, extrainfo)
     assert False
 
-def get_ffi_type_kind(ffi_type):
+def get_ffi_type_kind(cpu, ffi_type):
     from pypy.rlib.libffi import types
     kind = types.getkind(ffi_type)
     if kind == 'i' or kind == 'u':
         return history.INT
-    elif kind == 'f':
+    elif cpu.supports_floats and kind == 'f':
         return history.FLOAT
     elif kind == 'v':
         return history.VOID
+    elif cpu.supports_longlong and (kind == 'I' or kind == 'U'):     # longlong
+        return 'L'
+    elif cpu.supports_singlefloats and kind == 's':    # singlefloat
+        return 'S'
     raise UnsupportedKind("Unsupported kind '%s'" % kind)
 
 def is_ffi_type_signed(ffi_type):
diff --git a/pypy/jit/backend/llsupport/gc.py b/pypy/jit/backend/llsupport/gc.py
--- a/pypy/jit/backend/llsupport/gc.py
+++ b/pypy/jit/backend/llsupport/gc.py
@@ -544,18 +544,19 @@
         assert self.GCClass.inline_simple_malloc
         assert self.GCClass.inline_simple_malloc_varsize
 
-        # make a malloc function, with three arguments
+        # make a malloc function, with two arguments
         def malloc_basic(size, tid):
             type_id = llop.extract_ushort(llgroup.HALFWORD, tid)
             has_finalizer = bool(tid & (1<<llgroup.HALFSHIFT))
             check_typeid(type_id)
-            try:
-                res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                                      type_id, size, True,
-                                                      has_finalizer, False)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                res = lltype.nullptr(llmemory.GCREF.TO)
+            res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
+                                                  type_id, size, True,
+                                                  has_finalizer, False)
+            # In case the operation above failed, we are returning NULL
+            # from this function to assembler.  There is also an RPython
+            # exception set, typically MemoryError; but it's easier and
+            # faster to check for the NULL return value, as done by
+            # translator/exceptiontransform.py.
             #llop.debug_print(lltype.Void, "\tmalloc_basic", size, type_id,
             #                 "-->", res)
             return res
@@ -571,14 +572,10 @@
         def malloc_array(itemsize, tid, num_elem):
             type_id = llop.extract_ushort(llgroup.HALFWORD, tid)
             check_typeid(type_id)
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    type_id, num_elem, self.array_basesize, itemsize,
-                    self.array_length_ofs, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                type_id, num_elem, self.array_basesize, itemsize,
+                self.array_length_ofs, True)
         self.malloc_array = malloc_array
         self.GC_MALLOC_ARRAY = lltype.Ptr(lltype.FuncType(
             [lltype.Signed] * 3, llmemory.GCREF))
@@ -591,23 +588,15 @@
         unicode_type_id = self.layoutbuilder.get_type_id(rstr.UNICODE)
         #
         def malloc_str(length):
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    str_type_id, length, str_basesize, str_itemsize,
-                    str_ofs_length, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                str_type_id, length, str_basesize, str_itemsize,
+                str_ofs_length, True)
         def malloc_unicode(length):
-            try:
-                return llop1.do_malloc_varsize_clear(
-                    llmemory.GCREF,
-                    unicode_type_id, length, unicode_basesize,unicode_itemsize,
-                    unicode_ofs_length, True)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return lltype.nullptr(llmemory.GCREF.TO)
+            return llop1.do_malloc_varsize_clear(
+                llmemory.GCREF,
+                unicode_type_id, length, unicode_basesize,unicode_itemsize,
+                unicode_ofs_length, True)
         self.malloc_str = malloc_str
         self.malloc_unicode = malloc_unicode
         self.GC_MALLOC_STR_UNICODE = lltype.Ptr(lltype.FuncType(
@@ -628,16 +617,12 @@
             if self.DEBUG:
                 random_usage_of_xmm_registers()
             assert size >= self.minimal_size_in_nursery
-            try:
-                # NB. although we call do_malloc_fixedsize_clear() here,
-                # it's a bit of a hack because we set tid to 0 and may
-                # also use it to allocate varsized objects.  The tid
-                # and possibly the length are both set afterward.
-                gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                            0, size, True, False, False)
-            except MemoryError:
-                fatalerror("out of memory (from JITted code)")
-                return 0
+            # NB. although we call do_malloc_fixedsize_clear() here,
+            # it's a bit of a hack because we set tid to 0 and may
+            # also use it to allocate varsized objects.  The tid
+            # and possibly the length are both set afterward.
+            gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
+                                        0, size, True, False, False)
             return rffi.cast(lltype.Signed, gcref)
         self.malloc_slowpath = malloc_slowpath
         self.MALLOC_SLOWPATH = lltype.FuncType([lltype.Signed], lltype.Signed)
diff --git a/pypy/jit/backend/llsupport/llmodel.py b/pypy/jit/backend/llsupport/llmodel.py
--- a/pypy/jit/backend/llsupport/llmodel.py
+++ b/pypy/jit/backend/llsupport/llmodel.py
@@ -259,7 +259,7 @@
 
     def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
         from pypy.jit.backend.llsupport import ffisupport
-        return ffisupport.get_call_descr_dynamic(ffi_args, ffi_result,
+        return ffisupport.get_call_descr_dynamic(self, ffi_args, ffi_result,
                                                  extrainfo)
 
     def get_overflow_error(self):
@@ -499,7 +499,7 @@
     def bh_call_i(self, func, calldescr, args_i, args_r, args_f):
         assert isinstance(calldescr, BaseIntCallDescr)
         if not we_are_translated():
-            calldescr.verify_types(args_i, args_r, args_f, history.INT)
+            calldescr.verify_types(args_i, args_r, args_f, history.INT + 'S')
         return calldescr.call_stub(func, args_i, args_r, args_f)
 
     def bh_call_r(self, func, calldescr, args_i, args_r, args_f):
diff --git a/pypy/jit/backend/llsupport/test/test_descr.py b/pypy/jit/backend/llsupport/test/test_descr.py
--- a/pypy/jit/backend/llsupport/test/test_descr.py
+++ b/pypy/jit/backend/llsupport/test/test_descr.py
@@ -52,7 +52,8 @@
     S = lltype.GcStruct('S', ('x', lltype.Char),
                              ('y', lltype.Ptr(T)),
                              ('z', lltype.Ptr(U)),
-                             ('f', lltype.Float))
+                             ('f', lltype.Float),
+                             ('s', lltype.SingleFloat))
     assert getFieldDescrClass(lltype.Ptr(T)) is GcPtrFieldDescr
     assert getFieldDescrClass(lltype.Ptr(U)) is NonGcPtrFieldDescr
     cls = getFieldDescrClass(lltype.Char)
@@ -61,6 +62,10 @@
     clsf = getFieldDescrClass(lltype.Float)
     assert clsf != cls
     assert clsf == getFieldDescrClass(lltype.Float)
+    clss = getFieldDescrClass(lltype.SingleFloat)
+    assert clss not in (cls, clsf)
+    assert clss == getFieldDescrClass(lltype.SingleFloat)
+    assert clss == getFieldDescrClass(rffi.UINT)    # for now
     #
     c0 = GcCache(False)
     c1 = GcCache(True)
@@ -72,14 +77,17 @@
         descr_y = get_field_descr(c2, S, 'y')
         descr_z = get_field_descr(c2, S, 'z')
         descr_f = get_field_descr(c2, S, 'f')
+        descr_s = get_field_descr(c2, S, 's')
         assert descr_x.__class__ is cls
         assert descr_y.__class__ is GcPtrFieldDescr
         assert descr_z.__class__ is NonGcPtrFieldDescr
         assert descr_f.__class__ is clsf
+        assert descr_s.__class__ is clss
         assert descr_x.name == 'S.x'
         assert descr_y.name == 'S.y'
         assert descr_z.name == 'S.z'
         assert descr_f.name == 'S.f'
+        assert descr_s.name == 'S.s'
         if not tsc:
             assert descr_x.offset < descr_y.offset < descr_z.offset
             assert descr_x.sort_key() < descr_y.sort_key() < descr_z.sort_key()
@@ -87,23 +95,29 @@
             assert descr_y.get_field_size(False) == rffi.sizeof(lltype.Ptr(T))
             assert descr_z.get_field_size(False) == rffi.sizeof(lltype.Ptr(U))
             assert descr_f.get_field_size(False) == rffi.sizeof(lltype.Float)
+            assert descr_s.get_field_size(False) == rffi.sizeof(
+                                                            lltype.SingleFloat)
         else:
             assert isinstance(descr_x.offset, Symbolic)
             assert isinstance(descr_y.offset, Symbolic)
             assert isinstance(descr_z.offset, Symbolic)
             assert isinstance(descr_f.offset, Symbolic)
+            assert isinstance(descr_s.offset, Symbolic)
             assert isinstance(descr_x.get_field_size(True), Symbolic)
             assert isinstance(descr_y.get_field_size(True), Symbolic)
             assert isinstance(descr_z.get_field_size(True), Symbolic)
             assert isinstance(descr_f.get_field_size(True), Symbolic)
+            assert isinstance(descr_s.get_field_size(True), Symbolic)
         assert not descr_x.is_pointer_field()
         assert     descr_y.is_pointer_field()
         assert not descr_z.is_pointer_field()
         assert not descr_f.is_pointer_field()
+        assert not descr_s.is_pointer_field()
         assert not descr_x.is_float_field()
         assert not descr_y.is_float_field()
         assert not descr_z.is_float_field()
         assert     descr_f.is_float_field()
+        assert not descr_s.is_float_field()
 
 
 def test_get_field_descr_sign():
@@ -135,6 +149,7 @@
     A2 = lltype.GcArray(lltype.Ptr(T))
     A3 = lltype.GcArray(lltype.Ptr(U))
     A4 = lltype.GcArray(lltype.Float)
+    A5 = lltype.GcArray(lltype.SingleFloat)
     assert getArrayDescrClass(A2) is GcPtrArrayDescr
     assert getArrayDescrClass(A3) is NonGcPtrArrayDescr
     cls = getArrayDescrClass(A1)
@@ -143,25 +158,32 @@
     clsf = getArrayDescrClass(A4)
     assert clsf != cls
     assert clsf == getArrayDescrClass(lltype.GcArray(lltype.Float))
+    clss = getArrayDescrClass(A5)
+    assert clss not in (clsf, cls)
+    assert clss == getArrayDescrClass(lltype.GcArray(rffi.UINT))
     #
     c0 = GcCache(False)
     descr1 = get_array_descr(c0, A1)
     descr2 = get_array_descr(c0, A2)
     descr3 = get_array_descr(c0, A3)
     descr4 = get_array_descr(c0, A4)
+    descr5 = get_array_descr(c0, A5)
     assert descr1.__class__ is cls
     assert descr2.__class__ is GcPtrArrayDescr
     assert descr3.__class__ is NonGcPtrArrayDescr
     assert descr4.__class__ is clsf
+    assert descr5.__class__ is clss
     assert descr1 == get_array_descr(c0, lltype.GcArray(lltype.Char))
     assert not descr1.is_array_of_pointers()
     assert     descr2.is_array_of_pointers()
     assert not descr3.is_array_of_pointers()
     assert not descr4.is_array_of_pointers()
+    assert not descr5.is_array_of_pointers()
     assert not descr1.is_array_of_floats()
     assert not descr2.is_array_of_floats()
     assert not descr3.is_array_of_floats()
     assert     descr4.is_array_of_floats()
+    assert not descr5.is_array_of_floats()
     #
     def get_alignment(code):
         # Retrieve default alignment for the compiler/platform
@@ -170,27 +192,33 @@
     assert descr2.get_base_size(False) == get_alignment('p')
     assert descr3.get_base_size(False) == get_alignment('p')
     assert descr4.get_base_size(False) == get_alignment('d')
+    assert descr5.get_base_size(False) == get_alignment('f')
     assert descr1.get_ofs_length(False) == 0
     assert descr2.get_ofs_length(False) == 0
     assert descr3.get_ofs_length(False) == 0
     assert descr4.get_ofs_length(False) == 0
+    assert descr5.get_ofs_length(False) == 0
     assert descr1.get_item_size(False) == rffi.sizeof(lltype.Char)
     assert descr2.get_item_size(False) == rffi.sizeof(lltype.Ptr(T))
     assert descr3.get_item_size(False) == rffi.sizeof(lltype.Ptr(U))
     assert descr4.get_item_size(False) == rffi.sizeof(lltype.Float)
+    assert descr5.get_item_size(False) == rffi.sizeof(lltype.SingleFloat)
     #
     assert isinstance(descr1.get_base_size(True), Symbolic)
     assert isinstance(descr2.get_base_size(True), Symbolic)
     assert isinstance(descr3.get_base_size(True), Symbolic)
     assert isinstance(descr4.get_base_size(True), Symbolic)
+    assert isinstance(descr5.get_base_size(True), Symbolic)
     assert isinstance(descr1.get_ofs_length(True), Symbolic)
     assert isinstance(descr2.get_ofs_length(True), Symbolic)
     assert isinstance(descr3.get_ofs_length(True), Symbolic)
     assert isinstance(descr4.get_ofs_length(True), Symbolic)
+    assert isinstance(descr5.get_ofs_length(True), Symbolic)
     assert isinstance(descr1.get_item_size(True), Symbolic)
     assert isinstance(descr2.get_item_size(True), Symbolic)
     assert isinstance(descr3.get_item_size(True), Symbolic)
     assert isinstance(descr4.get_item_size(True), Symbolic)
+    assert isinstance(descr5.get_item_size(True), Symbolic)
     CA = rffi.CArray(lltype.Signed)
     descr = get_array_descr(c0, CA)
     assert not descr.is_array_of_floats()
@@ -210,6 +238,11 @@
     assert descr.is_array_of_floats()
     assert descr.get_base_size(False) == 0
     assert descr.get_ofs_length(False) == -1
+    CA = rffi.CArray(rffi.FLOAT)
+    descr = get_array_descr(c0, CA)
+    assert not descr.is_array_of_floats()
+    assert descr.get_base_size(False) == 0
+    assert descr.get_ofs_length(False) == -1
 
 
 def test_get_array_descr_sign():
@@ -257,6 +290,11 @@
     assert descr4.get_result_size(False) == rffi.sizeof(lltype.Float)
     assert descr4.get_return_type() == history.FLOAT
     assert descr4.arg_classes == "ff"
+    #
+    descr5 = get_call_descr(c0, [lltype.SingleFloat], lltype.SingleFloat)
+    assert descr5.get_result_size(False) == rffi.sizeof(lltype.SingleFloat)
+    assert descr5.get_return_type() == "S"
+    assert descr5.arg_classes == "S"
 
 def test_get_call_descr_not_translated_longlong():
     if sys.maxint > 2147483647:
@@ -286,6 +324,11 @@
     assert isinstance(descr4.get_result_size(True), Symbolic)
     assert descr4.get_return_type() == history.FLOAT
     assert descr4.arg_classes == "ff"
+    #
+    descr5 = get_call_descr(c1, [lltype.SingleFloat], lltype.SingleFloat)
+    assert isinstance(descr5.get_result_size(True), Symbolic)
+    assert descr5.get_return_type() == "S"
+    assert descr5.arg_classes == "S"
 
 def test_call_descr_extra_info():
     c1 = GcCache(True)
@@ -345,8 +388,11 @@
     #
     descr4f = get_call_descr(c0, [lltype.Char, lltype.Ptr(S)], lltype.Float)
     assert 'FloatCallDescr' in descr4f.repr_of_descr()
+    #
+    descr5f = get_call_descr(c0, [lltype.Char], lltype.SingleFloat)
+    assert 'SingleFloatCallDescr' in descr5f.repr_of_descr()
 
-def test_call_stubs():
+def test_call_stubs_1():
     c0 = GcCache(False)
     ARGS = [lltype.Char, lltype.Signed]
     RES = lltype.Char
@@ -360,6 +406,8 @@
     res = call_stub(rffi.cast(lltype.Signed, fnptr), [1, 2], None, None)
     assert res == ord('c')
 
+def test_call_stubs_2():
+    c0 = GcCache(False)
     ARRAY = lltype.GcArray(lltype.Signed)
     ARGS = [lltype.Float, lltype.Ptr(ARRAY)]
     RES = lltype.Float
@@ -375,3 +423,27 @@
     res = descr2.call_stub(rffi.cast(lltype.Signed, fnptr),
                            [], [opaquea], [longlong.getfloatstorage(3.5)])
     assert longlong.getrealfloat(res) == 4.5
+
+def test_call_stubs_single_float():
+    from pypy.rlib.longlong2float import uint2singlefloat, singlefloat2uint
+    from pypy.rlib.rarithmetic import r_singlefloat, intmask
+    #
+    c0 = GcCache(False)
+    ARGS = [lltype.SingleFloat, lltype.SingleFloat, lltype.SingleFloat]
+    RES = lltype.SingleFloat
+
+    def f(a, b, c):
+        a = float(a)
+        b = float(b)
+        c = float(c)
+        x = a - (b / c)
+        return r_singlefloat(x)
+
+    fnptr = llhelper(lltype.Ptr(lltype.FuncType(ARGS, RES)), f)
+    descr2 = get_call_descr(c0, ARGS, RES)
+    a = intmask(singlefloat2uint(r_singlefloat(-10.0)))
+    b = intmask(singlefloat2uint(r_singlefloat(3.0)))
+    c = intmask(singlefloat2uint(r_singlefloat(2.0)))
+    res = descr2.call_stub(rffi.cast(lltype.Signed, fnptr),
+                           [a, b, c], [], [])
+    assert float(uint2singlefloat(rffi.r_uint(res))) == -11.5
diff --git a/pypy/jit/backend/llsupport/test/test_ffisupport.py b/pypy/jit/backend/llsupport/test/test_ffisupport.py
--- a/pypy/jit/backend/llsupport/test/test_ffisupport.py
+++ b/pypy/jit/backend/llsupport/test/test_ffisupport.py
@@ -1,24 +1,52 @@
 from pypy.rlib.libffi import types
-from pypy.jit.backend.llsupport.ffisupport import get_call_descr_dynamic, \
-    VoidCallDescr, DynamicIntCallDescr
-    
+from pypy.jit.codewriter.longlong import is_64_bit
+from pypy.jit.backend.llsupport.ffisupport import *
+
+
+class FakeCPU:
+    def __init__(self, supports_floats=False, supports_longlong=False,
+                 supports_singlefloats=False):
+        self.supports_floats = supports_floats
+        self.supports_longlong = supports_longlong
+        self.supports_singlefloats = supports_singlefloats
+
+
 def test_call_descr_dynamic():
+    args = [types.sint, types.pointer]
+    descr = get_call_descr_dynamic(FakeCPU(), args, types.sint)
+    assert isinstance(descr, DynamicIntCallDescr)
+    assert descr.arg_classes == 'ii'
 
     args = [types.sint, types.double, types.pointer]
-    descr = get_call_descr_dynamic(args, types.void)
+    descr = get_call_descr_dynamic(FakeCPU(), args, types.void)
+    assert descr is None    # missing floats
+    descr = get_call_descr_dynamic(FakeCPU(supports_floats=True),
+                                   args, types.void)
     assert isinstance(descr, VoidCallDescr)
     assert descr.arg_classes == 'ifi'
 
-    descr = get_call_descr_dynamic([], types.sint8)
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.sint8)
     assert isinstance(descr, DynamicIntCallDescr)
     assert descr.get_result_size(False) == 1
     assert descr.is_result_signed() == True
 
-    descr = get_call_descr_dynamic([], types.uint8)
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.uint8)
     assert isinstance(descr, DynamicIntCallDescr)
     assert descr.get_result_size(False) == 1
     assert descr.is_result_signed() == False
 
-    descr = get_call_descr_dynamic([], types.float)
-    assert descr is None # single floats are not supported so far
-    
+    if not is_64_bit:
+        descr = get_call_descr_dynamic(FakeCPU(), [], types.slonglong)
+        assert descr is None   # missing longlongs
+        descr = get_call_descr_dynamic(FakeCPU(supports_longlong=True),
+                                       [], types.slonglong)
+        assert isinstance(descr, LongLongCallDescr)
+    else:
+        assert types.slonglong is types.slong
+
+    descr = get_call_descr_dynamic(FakeCPU(), [], types.float)
+    assert descr is None   # missing singlefloats
+    descr = get_call_descr_dynamic(FakeCPU(supports_singlefloats=True),
+                                   [], types.float)
+    SingleFloatCallDescr = getCallDescrClass(rffi.FLOAT)
+    assert isinstance(descr, SingleFloatCallDescr)
diff --git a/pypy/jit/backend/model.py b/pypy/jit/backend/model.py
--- a/pypy/jit/backend/model.py
+++ b/pypy/jit/backend/model.py
@@ -8,12 +8,13 @@
     # ^^^ This is only useful on 32-bit platforms.  If True,
     # longlongs are supported by the JIT, but stored as doubles.
     # Boxes and Consts are BoxFloats and ConstFloats.
+    supports_singlefloats = False
 
     done_with_this_frame_void_v = -1
     done_with_this_frame_int_v = -1
     done_with_this_frame_ref_v = -1
     done_with_this_frame_float_v = -1
-    exit_frame_with_exception_v = -1
+    propagate_exception_v = -1
     total_compiled_loops = 0
     total_compiled_bridges = 0
     total_freed_loops = 0
diff --git a/pypy/jit/backend/test/calling_convention_test.py b/pypy/jit/backend/test/calling_convention_test.py
--- a/pypy/jit/backend/test/calling_convention_test.py
+++ b/pypy/jit/backend/test/calling_convention_test.py
@@ -290,3 +290,58 @@
                 assert abs(x - expected_result) < 0.0001
             finally:
                 del self.cpu.done_with_this_frame_float_v
+
+    def test_call_with_singlefloats(self):
+        cpu = self.cpu
+        if not cpu.supports_floats or not cpu.supports_singlefloats:
+            py.test.skip('requires floats and singlefloats')
+
+        import random
+        from pypy.rlib.libffi import types
+        from pypy.rlib.rarithmetic import r_singlefloat
+
+        def func(*args):
+            res = 0.0
+            for i, x in enumerate(args):
+                res += (i + 1.1) * float(x)
+            return res
+
+        F = lltype.Float
+        S = lltype.SingleFloat
+        I = lltype.Signed
+        floats = [random.random() - 0.5 for i in range(8)]
+        singlefloats = [r_singlefloat(random.random() - 0.5) for i in range(8)]
+        ints = [random.randrange(-99, 99) for i in range(8)]
+        for repeat in range(100):
+            args = []
+            argvalues = []
+            argslist = []
+            local_floats = list(floats)
+            local_singlefloats = list(singlefloats)
+            local_ints = list(ints)
+            for i in range(8):
+                case = random.randrange(0, 3)
+                if case == 0:
+                    args.append(F)
+                    arg = local_floats.pop()
+                    argslist.append(boxfloat(arg))
+                elif case == 1:
+                    args.append(S)
+                    arg = local_singlefloats.pop()
+                    argslist.append(BoxInt(longlong.singlefloat2int(arg)))
+                else:
+                    args.append(I)
+                    arg = local_ints.pop()
+                    argslist.append(BoxInt(arg))
+                argvalues.append(arg)
+            FUNC = self.FuncType(args, F)
+            FPTR = self.Ptr(FUNC)
+            func_ptr = llhelper(FPTR, func)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+
+            res = self.execute_operation(rop.CALL,
+                                         [funcbox] + argslist,
+                                         'float', descr=calldescr)
+            expected = func(*argvalues)
+            assert abs(res.getfloat() - expected) < 0.0001
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -2734,6 +2734,65 @@
                                      'float', descr=calldescr)
         assert res.getfloatstorage() == expected
 
+    def test_singlefloat_result_of_call_direct(self):
+        if not self.cpu.supports_singlefloats:
+            py.test.skip("singlefloat test")
+        from pypy.translator.tool.cbuild import ExternalCompilationInfo
+        from pypy.rlib.rarithmetic import r_singlefloat
+        eci = ExternalCompilationInfo(
+            separate_module_sources=["""
+            float fn_test_result_of_call(float x)
+            {
+                return x / 2.0f;
+            }
+            """],
+            export_symbols=['fn_test_result_of_call'])
+        f = rffi.llexternal('fn_test_result_of_call', [lltype.SingleFloat],
+                            lltype.SingleFloat,
+                            compilation_info=eci, _nowrapper=True)
+        value = r_singlefloat(-42.5)
+        expected = r_singlefloat(-21.25)
+        assert f(value) == expected
+        #
+        FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
+        FPTR = self.Ptr(FUNC)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        ivalue = longlong.singlefloat2int(value)
+        iexpected = longlong.singlefloat2int(expected)
+        x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
+                               calldescr, [ivalue], None, None)
+        assert x == iexpected
+
+    def test_singlefloat_result_of_call_compiled(self):
+        if not self.cpu.supports_singlefloats:
+            py.test.skip("test of singlefloat result")
+        from pypy.translator.tool.cbuild import ExternalCompilationInfo
+        from pypy.rlib.rarithmetic import r_singlefloat
+        eci = ExternalCompilationInfo(
+            separate_module_sources=["""
+            float fn_test_result_of_call(float x)
+            {
+                return x / 2.0f;
+            }
+            """],
+            export_symbols=['fn_test_result_of_call'])
+        f = rffi.llexternal('fn_test_result_of_call', [lltype.SingleFloat],
+                            lltype.SingleFloat,
+                            compilation_info=eci, _nowrapper=True)
+        value = r_singlefloat(-42.5)
+        expected = r_singlefloat(-21.25)
+        assert f(value) == expected
+        #
+        FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
+        FPTR = self.Ptr(FUNC)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        funcbox = self.get_funcbox(self.cpu, f)
+        ivalue = longlong.singlefloat2int(value)
+        iexpected = longlong.singlefloat2int(expected)
+        res = self.execute_operation(rop.CALL, [funcbox, BoxInt(ivalue)],
+                                     'int', descr=calldescr)
+        assert res.value == iexpected
+
     def test_free_loop_and_bridges(self):
         from pypy.jit.backend.llsupport.llmodel import AbstractLLCPU
         if not isinstance(self.cpu, AbstractLLCPU):
@@ -2748,6 +2807,26 @@
         assert mem2 < mem1
         assert mem2 == mem0
 
+    def test_memoryerror(self):
+        excdescr = BasicFailDescr(666)   # descr expected back on failure
+        self.cpu.propagate_exception_v = self.cpu.get_fail_descr_number(
+            excdescr)
+        self.cpu.setup_once()    # XXX redo it, because we just set
+                                 # propagate_exception_v
+        i0 = BoxInt()
+        p0 = BoxPtr()
+        operations = [
+            ResOperation(rop.NEWUNICODE, [i0], p0),
+            ResOperation(rop.FINISH, [p0], None, descr=BasicFailDescr(1))
+            ]
+        inputargs = [i0]
+        looptoken = LoopToken()
+        self.cpu.compile_loop(inputargs, operations, looptoken)
+        # overflowing value (presumably too large a length to allocate):
+        self.cpu.set_future_value_int(0, sys.maxint // 4 + 1)
+        fail = self.cpu.execute_token(looptoken)
+        assert fail.identifier == excdescr.identifier   # not the FINISH descr
+
 
 class OOtypeBackendTest(BaseBackendTest):
 
diff --git a/pypy/jit/backend/x86/arch.py b/pypy/jit/backend/x86/arch.py
--- a/pypy/jit/backend/x86/arch.py
+++ b/pypy/jit/backend/x86/arch.py
@@ -27,3 +27,6 @@
 # which are used in the malloc itself.  They are:
 #   ecx, ebx, esi, edi               [32 and 64 bits]
 #   r8, r9, r10, r12, r13, r14, r15    [64 bits only]
+#
+# Note that with asmgcc, the locations corresponding to callee-save registers
+# are never used.
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -56,7 +56,9 @@
         self.exc = exc
         self.is_guard_not_invalidated = is_guard_not_invalidated
 
-DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed))
+DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
+                              ('bridge', lltype.Signed), # 0 or 1
+                              ('number', lltype.Signed))
 
 class Assembler386(object):
     _regalloc = None
@@ -89,6 +91,7 @@
         self._current_depths_cache = (0, 0)
         self.datablockwrapper = None
         self.stack_check_slowpath = 0
+        self.propagate_exception_path = 0
         self.teardown()
 
     def leave_jitted_hook(self):
@@ -125,6 +128,7 @@
             self._build_failure_recovery(True, withfloats=True)
             support.ensure_sse2_floats()
             self._build_float_constants()
+        self._build_propagate_exception_path()
         if gc_ll_descr.get_malloc_slowpath_addr is not None:
             self._build_malloc_slowpath()
         self._build_stack_check_slowpath()
@@ -138,6 +142,9 @@
         assert self.memcpy_addr != 0, "setup_once() not called?"
         self.current_clt = looptoken.compiled_loop_token
         self.pending_guard_tokens = []
+        if WORD == 8:
+            self.pending_memoryerror_trampoline_from = []
+            self.error_trampoline_64 = 0
         self.mc = codebuf.MachineCodeBlockWrapper()
         #assert self.datablockwrapper is None --- but obscure case
         # possible, e.g. getting MemoryError and continuing
@@ -147,6 +154,8 @@
 
     def teardown(self):
         self.pending_guard_tokens = None
+        if WORD == 8:
+            self.pending_memoryerror_trampoline_from = None
         self.mc = None
         self.looppos = -1
         self.currently_compiling_loop = None
@@ -155,9 +164,12 @@
     def finish_once(self):
         if self._debug:
             debug_start('jit-backend-counts')
-            for i in range(len(self.loop_run_counters)):
-                struct = self.loop_run_counters[i]
-                debug_print(str(i) + ':' + str(struct.i))
+            for struct in self.loop_run_counters:
+                if struct.bridge:
+                    prefix = 'bridge '
+                else:
+                    prefix = 'loop '
+                debug_print(prefix + str(struct.number) + ':' + str(struct.i))
             debug_stop('jit-backend-counts')
 
     def _build_float_constants(self):
@@ -181,6 +193,7 @@
         # instructions in assembler, with a mark_gc_roots in between.
         # With shadowstack, this is not needed, so we produce a single helper.
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
         #
         # ---------- first helper for the slow path of malloc ----------
         mc = codebuf.MachineCodeBlockWrapper()
@@ -190,10 +203,19 @@
         mc.SUB_rr(edx.value, eax.value)       # compute the size we want
         addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
         #
-        if gcrootmap is not None and gcrootmap.is_shadow_stack:
+        # The registers to save in the copy area: with shadowstack, most
+        # registers need to be saved.  With asmgcc, the callee-saved registers
+        # don't need to.
+        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
+        if not shadow_stack:
+            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
+                   if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
+        #
+        for reg, ofs in save_in_copy_area:
+            mc.MOV_br(ofs, reg.value)
+        #
+        if shadow_stack:
             # ---- shadowstack ----
-            for reg, ofs in gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items():
-                mc.MOV_br(ofs, reg.value)
             mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
             if IS_X86_32:
                 mc.MOV_sr(0, edx.value)          # push argument
@@ -201,15 +223,13 @@
                 mc.MOV_rr(edi.value, edx.value)
             mc.CALL(imm(addr))
             mc.ADD_ri(esp.value, 16 - WORD)
-            for reg, ofs in gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items():
-                mc.MOV_rb(reg.value, ofs)
         else:
             # ---- asmgcc ----
             if IS_X86_32:
                 mc.MOV_sr(WORD, edx.value)       # save it as the new argument
             elif IS_X86_64:
-                # rdi can be clobbered: its content was forced to the stack
-                # by _fastpath_malloc(), like all other save_around_call_regs.
+                # rdi can be clobbered: its content was saved in the
+                # copy area of the stack
                 mc.MOV_rr(edi.value, edx.value)
             mc.JMP(imm(addr))                    # tail call to the real malloc
             rawstart = mc.materialize(self.cpu.asmmemmgr, [])
@@ -217,18 +237,54 @@
             # ---------- second helper for the slow path of malloc ----------
             mc = codebuf.MachineCodeBlockWrapper()
         #
+        for reg, ofs in save_in_copy_area:
+            mc.MOV_rb(reg.value, ofs)
+            assert reg is not eax and reg is not edx
+        #
         if self.cpu.supports_floats:          # restore the XMM registers
             for i in range(self.cpu.NUM_REGS):# from where they were saved
                 mc.MOVSD_xs(i, (WORD*2)+8*i)
+        #
+        # Note: we check this after the code above, just because the code
+        # above is more than 127 bytes on 64-bits...
+        mc.TEST_rr(eax.value, eax.value)
+        mc.J_il8(rx86.Conditions['Z'], 0) # patched later
+        jz_location = mc.get_relative_pos()
+        #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
         mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
         mc.RET()
+        #
+        # If the slowpath malloc failed, we raise a MemoryError that
+        # always interrupts the current loop, as a "good enough"
+        # approximation.  Also note that we didn't RET from this helper;
+        # but the code we jump to will actually restore the stack
+        # position based on EBP, which will get us out of here for free.
+        offset = mc.get_relative_pos() - jz_location
+        assert 0 < offset <= 127
+        mc.overwrite(jz_location-1, chr(offset))
+        mc.JMP(imm(self.propagate_exception_path))
+        #
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
         self.malloc_slowpath2 = rawstart
 
+    def _build_propagate_exception_path(self):
+        if self.cpu.propagate_exception_v < 0:
+            return      # not supported (for tests, or non-translated)
+        # Build the stub that the malloc-returned-NULL paths jump to.
+        self.mc = codebuf.MachineCodeBlockWrapper()   # temporary, reset below
+        # call on_leave_jitted_save_exc()
+        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
+        self.mc.CALL(imm(addr))
+        self.mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
+        self._call_footer()
+        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
+        self.propagate_exception_path = rawstart      # address of the stub
+        self.mc = None
+
     def _build_stack_check_slowpath(self):
         _, _, slowpathaddr = self.cpu.insert_stack_check()
-        if slowpathaddr == 0 or self.cpu.exit_frame_with_exception_v < 0:
+        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
             return      # no stack check (for tests, or non-translated)
         #
         # make a "function" that is called immediately at the start of
@@ -284,19 +340,11 @@
         offset = mc.get_relative_pos() - jnz_location
         assert 0 < offset <= 127
         mc.overwrite(jnz_location-1, chr(offset))
-        # clear the exception from the global position
-        mc.MOV(eax, heap(self.cpu.pos_exc_value()))
-        mc.MOV(heap(self.cpu.pos_exception()), imm0)
-        mc.MOV(heap(self.cpu.pos_exc_value()), imm0)
-        # save the current exception instance into fail_boxes_ptr[0]
-        adr = self.fail_boxes_ptr.get_addr_for_num(0)
-        mc.MOV(heap(adr), eax)
-        # call the helper function to set the GC flag on the fail_boxes_ptr
-        # array (note that there is no exception any more here)
-        addr = self.cpu.get_on_leave_jitted_int(save_exception=False)
+        # call on_leave_jitted_save_exc()
+        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
         mc.CALL(imm(addr))
         #
-        mc.MOV_ri(eax.value, self.cpu.exit_frame_with_exception_v)
+        mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
         #
         # footer -- note the ADD, which skips the return address of this
         # function, and will instead return to the caller's caller.  Note
@@ -392,7 +440,7 @@
         self.setup(looptoken)
         self.currently_compiling_loop = looptoken
         if log:
-            self._register_counter()
+            self._register_counter(False, looptoken.number)
             operations = self._inject_debugging_code(looptoken, operations)
 
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
@@ -461,7 +509,7 @@
 
         self.setup(original_loop_token)
         if log:
-            self._register_counter()
+            self._register_counter(True, descr_number)
             operations = self._inject_debugging_code(faildescr, operations)
 
         arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
@@ -508,6 +556,8 @@
         # at the end of self.mc.
         for tok in self.pending_guard_tokens:
             tok.pos_recovery_stub = self.generate_quick_failure(tok)
+        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
+            self.error_trampoline_64 = self.generate_propagate_error_64()
 
     def patch_pending_failure_recoveries(self, rawstart):
         # after we wrote the assembler to raw memory, set up
@@ -544,6 +594,12 @@
                 # less, we would run into the issue that overwriting the
                 # 5 bytes here might get a few nonsense bytes at the
                 # return address of the following CALL.
+        if WORD == 8:
+            for pos_after_jz in self.pending_memoryerror_trampoline_from:
+                assert self.error_trampoline_64 != 0     # only if non-empty
+                mc = codebuf.MachineCodeBlockWrapper()
+                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
+                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)
 
     def get_asmmemmgr_blocks(self, looptoken):
         clt = looptoken.compiled_loop_token
@@ -558,7 +614,7 @@
         return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                    self.cpu.gc_ll_descr.gcrootmap)
 
-    def _register_counter(self):
+    def _register_counter(self, bridge, number):
         if self._debug:
             # YYY very minor leak -- we need the counters to stay alive
             # forever, just because we want to report them at the end
@@ -566,6 +622,8 @@
             struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                                    track_allocation=False)
             struct.i = 0
+            struct.bridge = int(bridge)
+            struct.number = number
             self.loop_run_counters.append(struct)
 
     def _find_failure_recovery_bytecode(self, faildescr):
@@ -1056,9 +1114,10 @@
                     self.implement_guard(guard_token, checkfalsecond)
         return genop_cmp_guard_float
 
-    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax):
+    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
+                   argtypes=None):
         if IS_X86_64:
-            return self._emit_call_64(force_index, x, arglocs, start)
+            return self._emit_call_64(force_index, x, arglocs, start, argtypes)
 
         p = 0
         n = len(arglocs)
@@ -1086,12 +1145,13 @@
         self.mc.CALL(x)
         self.mark_gc_roots(force_index)
 
-    def _emit_call_64(self, force_index, x, arglocs, start):
+    def _emit_call_64(self, force_index, x, arglocs, start, argtypes):
         src_locs = []
         dst_locs = []
         xmm_src_locs = []
         xmm_dst_locs = []
         pass_on_stack = []
+        singlefloats = None
 
         # In reverse order for use with pop()
         unused_gpr = [r9, r8, ecx, edx, esi, edi]
@@ -1111,6 +1171,11 @@
                     xmm_dst_locs.append(unused_xmm.pop())
                 else:
                     pass_on_stack.append(loc)
+            elif (argtypes is not None and argtypes[i-start] == 'S' and
+                  len(unused_xmm) > 0):
+                # Singlefloat argument
+                if singlefloats is None: singlefloats = []
+                singlefloats.append((loc, unused_xmm.pop()))
             else:
                 if len(unused_gpr) > 0:
                     src_locs.append(loc)
@@ -1138,9 +1203,15 @@
                 else:
                     self.mc.MOV_sr(i*WORD, loc.value)
 
-        # Handle register arguments
+        # Handle register arguments: first remap the xmm arguments
+        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
+                           X86_64_XMM_SCRATCH_REG)
+        # Load the singlefloat arguments from main regs or stack to xmm regs
+        if singlefloats is not None:
+            for src, dst in singlefloats:
+                self.mc.MOVD(dst, src)
+        # Finally remap the arguments in the main regs
         remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs, X86_64_XMM_SCRATCH_REG)
 
         self._regalloc.reserve_param(len(pass_on_stack))
         self.mc.CALL(x)
@@ -1255,6 +1326,20 @@
     def genop_cast_int_to_float(self, op, arglocs, resloc):
         self.mc.CVTSI2SD(resloc, arglocs[0])
 
+    def genop_cast_float_to_singlefloat(self, op, arglocs, resloc):
+        loc0, loctmp = arglocs           # source and scratch locations
+        self.mc.CVTSD2SS(loctmp, loc0)   # double -> single, into the scratch
+        assert isinstance(resloc, RegLoc)
+        assert isinstance(loctmp, RegLoc)
+        self.mc.MOVD_rx(resloc.value, loctmp.value)   # xmm -> result register
+
+    def genop_cast_singlefloat_to_float(self, op, arglocs, resloc):
+        loc0, = arglocs                  # register holding the singlefloat
+        assert isinstance(resloc, RegLoc)
+        assert isinstance(loc0, RegLoc)
+        self.mc.MOVD_xr(resloc.value, loc0.value)     # register -> xmm
+        self.mc.CVTSS2SD_xx(resloc.value, resloc.value)   # single -> double
+
     def genop_guard_int_is_true(self, op, guard_op, guard_token, arglocs, resloc):
         guard_opnum = guard_op.getopnum()
         self.mc.CMP(arglocs[0], imm0)
@@ -1376,7 +1461,7 @@
         assert isinstance(loc_vtable, ImmedLoc)
         arglocs = arglocs[:-1]
         self.call(self.malloc_func_addr, arglocs, eax)
-        # xxx ignore NULL returns for now
+        self.propagate_memoryerror_if_eax_is_null()
         self.set_vtable(eax, loc_vtable)
 
     def set_vtable(self, loc, loc_vtable):
@@ -1395,18 +1480,35 @@
     def genop_new(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_new_array(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_array_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_newstr(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_str_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
 
     def genop_newunicode(self, op, arglocs, result_loc):
         assert result_loc is eax
         self.call(self.malloc_unicode_func_addr, arglocs, eax)
+        self.propagate_memoryerror_if_eax_is_null()
+
+    def propagate_memoryerror_if_eax_is_null(self):
+        # Jump to the propagate-exception path if eax == 0 (failed malloc).
+        # If propagate_exception_path == 0 (tests), this may jump to 0 and
+        # segfault; continuing anyway with eax == 0 would segfault too.
+        self.mc.TEST_rr(eax.value, eax.value)
+        if WORD == 4:
+            self.mc.J_il(rx86.Conditions['Z'], self.propagate_exception_path)
+            self.mc.add_pending_relocation()
+        elif WORD == 8:
+            self.mc.J_il(rx86.Conditions['Z'], 0)    # rel32 patched later
+            pos = self.mc.get_relative_pos()         # position after the JZ
+            self.pending_memoryerror_trampoline_from.append(pos)
 
     # ----------
 
@@ -1678,6 +1780,12 @@
         return GuardToken(faildescr, failargs, fail_locs, exc,
                           is_guard_not_invalidated)
 
+    def generate_propagate_error_64(self):
+        assert WORD == 8       # callers only use this trampoline on 64-bit
+        startpos = self.mc.get_relative_pos()    # start of the trampoline
+        self.mc.JMP(imm(self.propagate_exception_path))
+        return startpos        # the pending JZs are patched to land here
+
     def generate_quick_failure(self, guardtok):
         """Generate the initial code for handling a failure.  We try to
         keep it as compact as possible.
@@ -2013,7 +2121,8 @@
         else:
             tmp = eax
 
-        self._emit_call(force_index, x, arglocs, 3, tmp=tmp)
+        self._emit_call(force_index, x, arglocs, 3, tmp=tmp,
+                        argtypes=op.getdescr().get_arg_types())
 
         if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.width == 8:
             # a float or a long long return
@@ -2025,7 +2134,19 @@
                 #     and this way is simpler also because the result loc
                 #     can just be always a stack location
             else:
-                self.mc.FSTP_b(resloc.value)   # float return
+                self.mc.FSTPL_b(resloc.value)   # float return
+        elif op.getdescr().get_return_type() == 'S':
+            # singlefloat return
+            assert resloc is eax
+            if IS_X86_32:
+                # must convert ST(0) to a 32-bit singlefloat and load it into EAX
+                # mess mess mess
+                self.mc.SUB_ri(esp.value, 4)
+                self.mc.FSTPS_s(0)
+                self.mc.POP_r(eax.value)
+            elif IS_X86_64:
+                # must copy from the lower 32 bits of XMM0 into eax
+                self.mc.MOVD_rx(eax.value, xmm0.value)
         elif size == WORD:
             assert resloc is eax or resloc is xmm0    # a full word
         elif size == 0:
@@ -2183,7 +2304,7 @@
         self._emit_call(fail_index, imm(asm_helper_adr), [eax, arglocs[1]], 0,
                         tmp=ecx)
         if IS_X86_32 and isinstance(result_loc, StackLoc) and result_loc.type == FLOAT:
-            self.mc.FSTP_b(result_loc.value)
+            self.mc.FSTPL_b(result_loc.value)
         #else: result_loc is already either eax or None, checked below
         self.mc.JMP_l8(0) # jump to done, patched later
         jmp_location = self.mc.get_relative_pos()
@@ -2424,8 +2545,7 @@
             # there are two helpers to call only with asmgcc
             slowpath_addr1 = self.malloc_slowpath1
             self.mc.CALL(imm(slowpath_addr1))
-        self.mark_gc_roots(self.write_new_force_index(),
-                           use_copy_area=shadow_stack)
+        self.mark_gc_roots(self.write_new_force_index(), use_copy_area=True)
         slowpath_addr2 = self.malloc_slowpath2
         self.mc.CALL(imm(slowpath_addr2))
 
diff --git a/pypy/jit/backend/x86/codebuf.py b/pypy/jit/backend/x86/codebuf.py
--- a/pypy/jit/backend/x86/codebuf.py
+++ b/pypy/jit/backend/x86/codebuf.py
@@ -25,8 +25,11 @@
         self.init_block_builder()
         # a list of relative positions; for each position p, the bytes
         # at [p-4:p] encode an absolute address that will need to be
-        # made relative.
-        self.relocations = []
+        # made relative.  Only works on 32-bit!
+        if WORD == 4:
+            self.relocations = []
+        else:
+            self.relocations = None
         #
         # ResOperation --> offset in the assembly.
         # ops_offset[None] represents the beginning of the code after the last op
@@ -42,9 +45,10 @@
 
     def copy_to_raw_memory(self, addr):
         self._copy_to_raw_memory(addr)
-        for reloc in self.relocations:
-            p = addr + reloc
-            adr = rffi.cast(rffi.LONGP, p - WORD)
-            adr[0] = intmask(adr[0] - p)
+        if self.relocations is not None:
+            for reloc in self.relocations:
+                p = addr + reloc
+                adr = rffi.cast(rffi.LONGP, p - WORD)
+                adr[0] = intmask(adr[0] - p)
         valgrind.discard_translations(addr, self.get_relative_pos())
         self._dump(addr, "jit-backend-dump", backend_name)
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -705,6 +705,17 @@
         self.Perform(op, [loc0], loc1)
         self.rm.possibly_free_var(op.getarg(0))
 
+    def consider_cast_float_to_singlefloat(self, op):
+        loc0 = self.xrm.make_sure_var_in_reg(op.getarg(0))
+        loc1 = self.rm.force_allocate_reg(op.result)   # result location
+        self.xrm.possibly_free_var(op.getarg(0))   # freed before the temp
+        tmpxvar = TempBox()                        # scratch for the convert
+        loctmp = self.xrm.force_allocate_reg(tmpxvar)   # may be equal to loc0
+        self.xrm.possibly_free_var(tmpxvar)
+        self.Perform(op, [loc0, loctmp], loc1)
+
+    consider_cast_singlefloat_to_float = consider_cast_int_to_float
+
     def _consider_llong_binop_xx(self, op):
         # must force both arguments into xmm registers, because we don't
         # know if they will be suitably aligned.  Exception: if the second
@@ -921,27 +932,13 @@
     def _do_fastpath_malloc(self, op, size, tid):
         gc_ll_descr = self.assembler.cpu.gc_ll_descr
         self.rm.force_allocate_reg(op.result, selected_reg=eax)
-
-        if gc_ll_descr.gcrootmap and gc_ll_descr.gcrootmap.is_shadow_stack:
-            # ---- shadowstack ----
-            # We need edx as a temporary, but otherwise don't save any more
-            # register.  See comments in _build_malloc_slowpath().
-            tmp_box = TempBox()
-            self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
-            self.rm.possibly_free_var(tmp_box)
-        else:
-            # ---- asmgcc ----
-            # We need to force-allocate each of save_around_call_regs now.
-            # The alternative would be to save and restore them around the
-            # actual call to malloc(), in the rare case where we need to do
-            # it; however, mark_gc_roots() would need to be adapted to know
-            # where the variables end up being saved.  Messy.
-            for reg in self.rm.save_around_call_regs:
-                if reg is not eax:
-                    tmp_box = TempBox()
-                    self.rm.force_allocate_reg(tmp_box, selected_reg=reg)
-                    self.rm.possibly_free_var(tmp_box)
-
+        #
+        # We need edx as a temporary, but otherwise don't save any more
+        # register.  See comments in _build_malloc_slowpath().
+        tmp_box = TempBox()
+        self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
+        self.rm.possibly_free_var(tmp_box)
+        #
         self.assembler.malloc_cond(
             gc_ll_descr.get_nursery_free_addr(),
             gc_ll_descr.get_nursery_top_addr(),
@@ -1337,14 +1334,26 @@
             if reg is eax:
                 continue      # ok to ignore this one
             if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
-                if use_copy_area:
-                    assert reg in self.rm.REGLOC_TO_COPY_AREA_OFS
-                    area_offset = self.rm.REGLOC_TO_COPY_AREA_OFS[reg]
-                    gcrootmap.add_frame_offset(shape, area_offset)
-                else:
-                    assert reg in self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX
-                    gcrootmap.add_callee_save_reg(
-                        shape, self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX[reg])
+                #
+                # The register 'reg' is alive across this call.
+                gcrootmap = self.assembler.cpu.gc_ll_descr.gcrootmap
+                if gcrootmap is None or not gcrootmap.is_shadow_stack:
+                    #
+                    # Asmgcc: if reg is a callee-save register, we can
+                    # explicitly mark it as containing a BoxPtr.
+                    if reg in self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX:
+                        gcrootmap.add_callee_save_reg(
+                            shape, self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX[reg])
+                        continue
+                #
+                # Else, 'use_copy_area' must be True (otherwise this BoxPtr
+                # should not be in a register).  The copy area contains the
+                # real value of the register.
+                assert use_copy_area
+                assert reg in self.rm.REGLOC_TO_COPY_AREA_OFS
+                area_offset = self.rm.REGLOC_TO_COPY_AREA_OFS[reg]
+                gcrootmap.add_frame_offset(shape, area_offset)
+        #
         return gcrootmap.compress_callshape(shape,
                                             self.assembler.datablockwrapper)
 
diff --git a/pypy/jit/backend/x86/regloc.py b/pypy/jit/backend/x86/regloc.py
--- a/pypy/jit/backend/x86/regloc.py
+++ b/pypy/jit/backend/x86/regloc.py
@@ -521,6 +521,8 @@
     UCOMISD = _binaryop('UCOMISD')
     CVTSI2SD = _binaryop('CVTSI2SD')
     CVTTSD2SI = _binaryop('CVTTSD2SI')
+    CVTSD2SS = _binaryop('CVTSD2SS')
+    CVTSS2SD = _binaryop('CVTSS2SD')
     
     SQRTSD = _binaryop('SQRTSD')
 
@@ -534,6 +536,8 @@
     PXOR  = _binaryop('PXOR')
     PCMPEQD = _binaryop('PCMPEQD')
 
+    MOVD = _binaryop('MOVD')
+
     CALL = _relative_unaryop('CALL')
     JMP = _relative_unaryop('JMP')
 
diff --git a/pypy/jit/backend/x86/runner.py b/pypy/jit/backend/x86/runner.py
--- a/pypy/jit/backend/x86/runner.py
+++ b/pypy/jit/backend/x86/runner.py
@@ -19,6 +19,7 @@
 class AbstractX86CPU(AbstractLLCPU):
     debug = True
     supports_floats = True
+    supports_singlefloats = True
 
     BOOTSTRAP_TP = lltype.FuncType([], lltype.Signed)
     dont_keepalive_stuff = False # for tests
diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -573,7 +573,8 @@
     BTS_jr = insn(rex_w, '\x0F\xAB', register(2,8), abs_, immediate(1))
 
     # x87 instructions
-    FSTP_b = insn('\xDD', orbyte(3<<3), stack_bp(1))
+    FSTPL_b = insn('\xDD', orbyte(3<<3), stack_bp(1)) # rffi.DOUBLE ('as' wants L??)
+    FSTPS_s = insn('\xD9', orbyte(3<<3), stack_sp(1)) # lltype.SingleFloat
 
     # ------------------------------ Random mess -----------------------
     RDTSC = insn('\x0F\x31')
@@ -590,8 +591,18 @@
     CVTTSD2SI_rx = xmminsn('\xF2', rex_w, '\x0F\x2C', register(1, 8), register(2), '\xC0')
     CVTTSD2SI_rb = xmminsn('\xF2', rex_w, '\x0F\x2C', register(1, 8), stack_bp(2))
 
-    MOVD_rx = xmminsn('\x66', rex_w, '\x0F\x7E', register(2, 8), register(1), '\xC0')
-    MOVD_xr = xmminsn('\x66', rex_w, '\x0F\x6E', register(1, 8), register(2), '\xC0')
+    CVTSD2SS_xx = xmminsn('\xF2', rex_nw, '\x0F\x5A',
+                          register(1, 8), register(2), '\xC0')
+    CVTSD2SS_xb = xmminsn('\xF2', rex_nw, '\x0F\x5A',
+                          register(1, 8), stack_bp(2))
+    CVTSS2SD_xx = xmminsn('\xF3', rex_nw, '\x0F\x5A',
+                          register(1, 8), register(2), '\xC0')
+    CVTSS2SD_xb = xmminsn('\xF3', rex_nw, '\x0F\x5A',
+                          register(1, 8), stack_bp(2))
+
+    MOVD_rx = xmminsn('\x66', rex_nw, '\x0F\x7E', register(2, 8), register(1), '\xC0')
+    MOVD_xr = xmminsn('\x66', rex_nw, '\x0F\x6E', register(1, 8), register(2), '\xC0')
+    MOVD_xb = xmminsn('\x66', rex_nw, '\x0F\x6E', register(1, 8), stack_bp(2))
 
     PSRAD_xi = xmminsn('\x66', rex_nw, '\x0F\x72', register(1), '\xE0', immediate(2, 'b'))
 
diff --git a/pypy/jit/backend/x86/test/test_regloc.py b/pypy/jit/backend/x86/test/test_regloc.py
--- a/pypy/jit/backend/x86/test/test_regloc.py
+++ b/pypy/jit/backend/x86/test/test_regloc.py
@@ -62,7 +62,7 @@
             assert mc.relocations == [5]
             expected = "\xE8" + struct.pack('<i', target - (rawstart + 5))
         elif IS_X86_64:
-            assert mc.relocations == []
+            assert mc.relocations is None
             if 0 <= target <= 0xffffffff:
                 assert length == 9
                 expected = (
diff --git a/pypy/jit/backend/x86/test/test_runner.py b/pypy/jit/backend/x86/test/test_runner.py
--- a/pypy/jit/backend/x86/test/test_runner.py
+++ b/pypy/jit/backend/x86/test/test_runner.py
@@ -463,7 +463,7 @@
             self.cpu.finish_once()
         finally:
             debug._log = None
-        assert ('jit-backend-counts', [('debug_print', '0:10')]) in dlog
+        assert ('jit-backend-counts', [('debug_print', 'loop -1:10')]) in dlog
 
     def test_debugger_checksum(self):
         loop = """
diff --git a/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py b/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py
--- a/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py
+++ b/pypy/jit/backend/x86/test/test_rx86_32_auto_encoding.py
@@ -36,6 +36,14 @@
 def hexdump(s):
     return ' '.join(["%02X" % ord(c) for c in s])
 
+def reduce_to_32bit(s):
+    if s[:2] != '%r':
+        return s                 # does not start with '%r': unchanged
+    if s[2:].isdigit():
+        return s + 'd'           # numbered register: '%r8' -> '%r8d'
+    else:
+        return '%e' + s[2:]      # named register: '%rax' -> '%eax'
+
 # ____________________________________________________________
 
 COUNT1 = 15
@@ -180,12 +188,14 @@
     ##        for m, extra in args:
     ##            if m in (i386.MODRM, i386.MODRM8) or all:
     ##                suffix = suffixes[sizes[m]] + suffix
-            if argmodes and not self.is_xmm_insn:
+            if (argmodes and not self.is_xmm_insn
+                         and not instrname.startswith('FSTP')):
                 suffix = suffixes[self.WORD]
             # Special case: On 64-bit CPUs, rx86 assumes 64-bit integer
             # operands when converting to/from floating point, so we need to
             # indicate that with a suffix
-            if (self.WORD == 8) and instrname.startswith('CVT'):
+            if (self.WORD == 8) and (instrname.startswith('CVT') and
+                                     'SI' in instrname):
                 suffix = suffixes[self.WORD]
 
             if instr_suffix is not None:
@@ -218,10 +228,10 @@
                 and ops[1].startswith('%r')):
                 # movq $xxx, %rax => movl $xxx, %eax
                 suffix = 'l'
-                if ops[1][2:].isdigit():
-                    ops[1] += 'd'
-                else:
-                    ops[1] = '%e' + ops[1][2:]
+                ops[1] = reduce_to_32bit(ops[1])
+            if instrname.lower() == 'movd':
+                ops[0] = reduce_to_32bit(ops[0])
+                ops[1] = reduce_to_32bit(ops[1])
             #
             op = '\t%s%s %s%s' % (instrname.lower(), suffix,
                                   ', '.join(ops), following)
diff --git a/pypy/jit/codewriter/assembler.py b/pypy/jit/codewriter/assembler.py
--- a/pypy/jit/codewriter/assembler.py
+++ b/pypy/jit/codewriter/assembler.py
@@ -76,6 +76,8 @@
                 TYPE = llmemory.Address
             if TYPE == llmemory.Address:
                 value = heaptracker.adr2int(value)
+            if TYPE is lltype.SingleFloat:
+                value = longlong.singlefloat2int(value)
             if not isinstance(value, (llmemory.AddressAsInt,
                                       ComputedIntSymbolic)):
                 value = lltype.cast_primitive(lltype.Signed, value)
diff --git a/pypy/jit/codewriter/call.py b/pypy/jit/codewriter/call.py
--- a/pypy/jit/codewriter/call.py
+++ b/pypy/jit/codewriter/call.py
@@ -228,8 +228,10 @@
             elif loopinvariant:
                 extraeffect = EffectInfo.EF_LOOPINVARIANT
             elif elidable:
-                # XXX check what to do about exceptions (also MemoryError?)
-                extraeffect = EffectInfo.EF_ELIDABLE
+                if self._canraise(op):
+                    extraeffect = EffectInfo.EF_ELIDABLE_CAN_RAISE
+                else:
+                    extraeffect = EffectInfo.EF_ELIDABLE_CANNOT_RAISE
             elif self._canraise(op):
                 extraeffect = EffectInfo.EF_CAN_RAISE
             else:
@@ -263,7 +265,7 @@
     def calldescr_canraise(self, calldescr):
         effectinfo = calldescr.get_extra_info()
         return (effectinfo is None or
-                effectinfo.extraeffect >= EffectInfo.EF_CAN_RAISE)
+                effectinfo.extraeffect > EffectInfo.EF_CANNOT_RAISE)
 
     def jitdriver_sd_from_portal_graph(self, graph):
         for jd in self.jitdrivers_sd:
diff --git a/pypy/jit/codewriter/effectinfo.py b/pypy/jit/codewriter/effectinfo.py
--- a/pypy/jit/codewriter/effectinfo.py
+++ b/pypy/jit/codewriter/effectinfo.py
@@ -9,10 +9,11 @@
     _cache = {}
 
     # the 'extraeffect' field is one of the following values:
-    EF_ELIDABLE                        = 0 #elidable function (and cannot raise)
+    EF_ELIDABLE_CANNOT_RAISE           = 0 #elidable function (and cannot raise)
     EF_LOOPINVARIANT                   = 1 #special: call it only once per loop
     EF_CANNOT_RAISE                    = 2 #a function which cannot raise
-    EF_CAN_RAISE                       = 3 #normal function (can raise)
+    EF_ELIDABLE_CAN_RAISE              = 3 #elidable function (but can raise)
+    EF_CAN_RAISE                       = 4 #normal function (can raise)
     EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE = 5 #can raise and force virtualizables
 
     # the 'oopspecindex' field is one of the following values:
@@ -94,7 +95,8 @@
         result.readonly_descrs_fields = readonly_descrs_fields
         result.readonly_descrs_arrays = readonly_descrs_arrays
         if extraeffect == EffectInfo.EF_LOOPINVARIANT or \
-           extraeffect == EffectInfo.EF_ELIDABLE:
+           extraeffect == EffectInfo.EF_ELIDABLE_CANNOT_RAISE or \
+           extraeffect == EffectInfo.EF_ELIDABLE_CAN_RAISE:
             result.write_descrs_fields = []
             result.write_descrs_arrays = []
         else:
diff --git a/pypy/jit/codewriter/jitcode.py b/pypy/jit/codewriter/jitcode.py
--- a/pypy/jit/codewriter/jitcode.py
+++ b/pypy/jit/codewriter/jitcode.py
@@ -1,7 +1,6 @@
 from pypy.jit.metainterp.history import AbstractDescr
 from pypy.jit.codewriter import heaptracker
 from pypy.rlib.objectmodel import we_are_translated
-from pypy.rpython.lltypesystem import llmemory
 
 
 class JitCode(AbstractDescr):
@@ -102,7 +101,7 @@
 
     def _clone_if_mutable(self):
         raise NotImplementedError
-    
+
 class MissingLiveness(Exception):
     pass
 
diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -1,18 +1,16 @@
-import py, sys
-from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rclass
-from pypy.rpython import rlist
-from pypy.jit.metainterp.history import getkind
-from pypy.objspace.flow.model import SpaceOperation, Variable, Constant
-from pypy.objspace.flow.model import Block, Link, c_last_exception
-from pypy.jit.codewriter.flatten import ListOfKind, IndirectCallTargets
+import py
 from pypy.jit.codewriter import support, heaptracker, longlong
 from pypy.jit.codewriter.effectinfo import EffectInfo
+from pypy.jit.codewriter.flatten import ListOfKind, IndirectCallTargets
 from pypy.jit.codewriter.policy import log
+from pypy.jit.metainterp import quasiimmut
+from pypy.jit.metainterp.history import getkind
 from pypy.jit.metainterp.typesystem import deref, arrayItem
-from pypy.jit.metainterp import quasiimmut
-from pypy.rpython.rclass import IR_QUASIIMMUTABLE, IR_QUASIIMMUTABLE_ARRAY
+from pypy.objspace.flow.model import SpaceOperation, Variable, Constant, c_last_exception
 from pypy.rlib import objectmodel
 from pypy.rlib.jit import _we_are_jitted
+from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rclass, rffi
+from pypy.rpython.rclass import IR_QUASIIMMUTABLE, IR_QUASIIMMUTABLE_ARRAY
 from pypy.translator.simplify import get_funcobj
 from pypy.translator.unsimplify import varoftype
 
@@ -200,7 +198,6 @@
             self.vable_array_vars[op.result]= self.vable_array_vars[op.args[0]]
 
     rewrite_op_cast_pointer = rewrite_op_same_as
-    rewrite_op_cast_opaque_ptr = rewrite_op_same_as   # rlib.rerased
     def rewrite_op_cast_bool_to_int(self, op): pass
     def rewrite_op_cast_bool_to_uint(self, op): pass
     def rewrite_op_cast_char_to_int(self, op): pass
@@ -787,7 +784,6 @@
             op2.result = op.result
             return op2
         elif toll:
-            from pypy.rpython.lltypesystem import rffi
             size, unsigned = rffi.size_and_sign(op.args[0].concretetype)
             if unsigned:
                 INTERMEDIATE = lltype.Unsigned
@@ -809,21 +805,27 @@
             return self.force_cast_without_longlong(op.args[0], op.result)
 
     def force_cast_without_longlong(self, v_arg, v_result):
-        from pypy.rpython.lltypesystem.rffi import size_and_sign, sizeof, FLOAT
-        from pypy.rlib.rarithmetic import intmask
-        #
-        if (v_result.concretetype in (FLOAT, lltype.Float) or
-            v_arg.concretetype in (FLOAT, lltype.Float)):
-            assert (v_result.concretetype == lltype.Float and
-                    v_arg.concretetype == lltype.Float), "xxx unsupported cast"
+        if v_result.concretetype == v_arg.concretetype:
             return
-        #
-        size2, unsigned2 = size_and_sign(v_result.concretetype)
-        assert size2 <= sizeof(lltype.Signed)
-        if size2 == sizeof(lltype.Signed):
+        if v_arg.concretetype == rffi.FLOAT:
+            assert v_result.concretetype == lltype.Float, "cast %s -> %s" % (
+                v_arg.concretetype, v_result.concretetype)
+            return SpaceOperation('cast_singlefloat_to_float', [v_arg],
+                                  v_result)
+        if v_result.concretetype == rffi.FLOAT:
+            assert v_arg.concretetype == lltype.Float, "cast %s -> %s" % (
+                v_arg.concretetype, v_result.concretetype)
+            return SpaceOperation('cast_float_to_singlefloat', [v_arg],
+                                  v_result)
+        return self.force_cast_without_singlefloat(v_arg, v_result)
+
+    def force_cast_without_singlefloat(self, v_arg, v_result):
+        size2, unsigned2 = rffi.size_and_sign(v_result.concretetype)
+        assert size2 <= rffi.sizeof(lltype.Signed)
+        if size2 == rffi.sizeof(lltype.Signed):
             return     # the target type is LONG or ULONG
-        size1, unsigned1 = size_and_sign(v_arg.concretetype)
-        assert size1 <= sizeof(lltype.Signed)
+        size1, unsigned1 = rffi.size_and_sign(v_arg.concretetype)
+        assert size1 <= rffi.sizeof(lltype.Signed)
         #
         def bounds(size, unsigned):
             if unsigned:
@@ -852,7 +854,6 @@
         return result
 
     def rewrite_op_direct_ptradd(self, op):
-        from pypy.rpython.lltypesystem import rffi
         # xxx otherwise, not implemented:
         assert op.args[0].concretetype == rffi.CCHARP
         #
@@ -905,7 +906,7 @@
                 op1 = self.prepare_builtin_call(op, "llong_%s", args)
                 op2 = self._handle_oopspec_call(op1, args,
                                                 EffectInfo.OS_LLONG_%s,
-                                                EffectInfo.EF_ELIDABLE)
+                                           EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
                 if %r == "TO_INT":
                     assert op2.result.concretetype == lltype.Signed
                 return op2
@@ -1366,15 +1367,15 @@
                     otherindex += EffectInfo._OS_offset_uni
                 self._register_extra_helper(otherindex, othername,
                                             argtypes, resulttype,
-                                            EffectInfo.EF_ELIDABLE)
+                                           EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
         #
         return self._handle_oopspec_call(op, args, dict[oopspec_name],
-                                         EffectInfo.EF_ELIDABLE)
+                                         EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
 
     def _handle_str2unicode_call(self, op, oopspec_name, args):
-        # ll_str2unicode is not EF_ELIDABLE, because it can raise
-        # UnicodeDecodeError...
-        return self._handle_oopspec_call(op, args, EffectInfo.OS_STR2UNICODE)
+        # ll_str2unicode can raise UnicodeDecodeError
+        return self._handle_oopspec_call(op, args, EffectInfo.OS_STR2UNICODE,
+                                         EffectInfo.EF_ELIDABLE_CAN_RAISE)
 
     # ----------
     # VirtualRefs.
@@ -1412,13 +1413,13 @@
         assert vinfo is not None
         self.vable_flags[op.args[0]] = op.args[2].value
         return []
-        
+
     # ---------
     # ll_math.sqrt_nonneg()
-    
+
     def _handle_math_sqrt_call(self, op, oopspec_name, args):
         return self._handle_oopspec_call(op, args, EffectInfo.OS_MATH_SQRT,
-                                         EffectInfo.EF_ELIDABLE)
+                                         EffectInfo.EF_ELIDABLE_CANNOT_RAISE)
 
     def rewrite_op_jit_force_quasi_immutable(self, op):
         v_inst, c_fieldname = op.args
diff --git a/pypy/jit/codewriter/longlong.py b/pypy/jit/codewriter/longlong.py
--- a/pypy/jit/codewriter/longlong.py
+++ b/pypy/jit/codewriter/longlong.py
@@ -7,7 +7,8 @@
 """
 
 import sys
-from pypy.rpython.lltypesystem import lltype
+from pypy.rpython.lltypesystem import lltype, rffi
+from pypy.rlib import rarithmetic, longlong2float
 
 
 if sys.maxint > 2147483647:
@@ -31,8 +32,6 @@
     # ---------- 32-bit platform ----------
     # the type FloatStorage is r_longlong, and conversion is needed
 
-    from pypy.rlib import rarithmetic, longlong2float
-
     is_64_bit = False
     supports_longlong = True
     r_float_storage = rarithmetic.r_longlong
@@ -41,9 +40,19 @@
     getfloatstorage = longlong2float.float2longlong
     getrealfloat    = longlong2float.longlong2float
     gethash         = lambda xll: rarithmetic.intmask(xll - (xll >> 32))
-    is_longlong     = lambda TYPE: (TYPE == lltype.SignedLongLong or
-                                    TYPE == lltype.UnsignedLongLong)
+    is_longlong     = lambda TYPE: (TYPE is lltype.SignedLongLong or
+                                    TYPE is lltype.UnsignedLongLong)
 
     # -------------------------------------
 
 ZEROF = getfloatstorage(0.0)
+
+# ____________________________________________________________
+
+def int2singlefloat(x):
+    x = rffi.r_uint(x)
+    return longlong2float.uint2singlefloat(x)
+
+def singlefloat2int(x):
+    x = longlong2float.singlefloat2uint(x)
+    return rffi.cast(lltype.Signed, x)
diff --git a/pypy/jit/codewriter/policy.py b/pypy/jit/codewriter/policy.py
--- a/pypy/jit/codewriter/policy.py
+++ b/pypy/jit/codewriter/policy.py
@@ -1,9 +1,7 @@
-from pypy.translator.simplify import get_funcobj
 from pypy.jit.metainterp import history
-from pypy.rpython.lltypesystem import lltype, rclass
 from pypy.tool.udir import udir
 
-import py, sys
+import py
 from pypy.tool.ansi_print import ansi_log
 log = py.log.Producer('jitcodewriter')
 py.log.setconsumer('jitcodewriter', ansi_log)
@@ -14,6 +12,7 @@
         self.unsafe_loopy_graphs = set()
         self.supports_floats = False
         self.supports_longlong = False
+        self.supports_singlefloats = False
 
     def set_supports_floats(self, flag):
         self.supports_floats = flag
@@ -21,6 +20,9 @@
     def set_supports_longlong(self, flag):
         self.supports_longlong = flag
 
+    def set_supports_singlefloats(self, flag):
+        self.supports_singlefloats = flag
+
     def dump_unsafe_loops(self):
         f = udir.join("unsafe-loops.txt").open('w')
         strs = [str(graph) for graph in self.unsafe_loopy_graphs]
@@ -60,8 +62,9 @@
                     func, '_jit_unroll_safe_', False)
 
         unsupported = contains_unsupported_variable_type(graph,
-                                                         self.supports_floats,
-                                                         self.supports_longlong)
+                            self.supports_floats,
+                            self.supports_longlong,
+                            self.supports_singlefloats)
         res = see_function and not unsupported
         if res and contains_loop:
             self.unsafe_loopy_graphs.add(graph)
@@ -82,17 +85,24 @@
         return res
 
 def contains_unsupported_variable_type(graph, supports_floats,
-                                       supports_longlong):
+                                              supports_longlong,
+                                              supports_singlefloats):
     getkind = history.getkind
     try:
         for block in graph.iterblocks():
             for v in block.inputargs:
-                getkind(v.concretetype, supports_floats, supports_longlong)
+                getkind(v.concretetype, supports_floats,
+                                        supports_longlong,
+                                        supports_singlefloats)
             for op in block.operations:
                 for v in op.args:
-                    getkind(v.concretetype, supports_floats, supports_longlong)
+                    getkind(v.concretetype, supports_floats,
+                                            supports_longlong,
+                                            supports_singlefloats)
                 v = op.result
-                getkind(v.concretetype, supports_floats, supports_longlong)
+                getkind(v.concretetype, supports_floats,
+                                        supports_longlong,
+                                        supports_singlefloats)
     except NotImplementedError, e:
         log.WARNING('%s, ignoring graph' % (e,))
         log.WARNING('  %s' % (graph,))
diff --git a/pypy/jit/codewriter/regalloc.py b/pypy/jit/codewriter/regalloc.py
--- a/pypy/jit/codewriter/regalloc.py
+++ b/pypy/jit/codewriter/regalloc.py
@@ -1,129 +1,8 @@
-import sys
-from pypy.objspace.flow.model import Variable
-from pypy.tool.algo.color import DependencyGraph
-from pypy.tool.algo.unionfind import UnionFind
+from pypy.tool.algo import regalloc
 from pypy.jit.metainterp.history import getkind
 from pypy.jit.codewriter.flatten import ListOfKind
 
+
 def perform_register_allocation(graph, kind):
-    """Perform register allocation for the Variables of the given 'kind'
-    in the 'graph'."""
-    regalloc = RegAllocator(graph, kind)
-    regalloc.make_dependencies()
-    regalloc.coalesce_variables()
-    regalloc.find_node_coloring()
-    return regalloc
-
-
-class RegAllocator(object):
-    DEBUG_REGALLOC = False
-
-    def __init__(self, graph, kind):
-        self.graph = graph
-        self.kind = kind
-
-    def make_dependencies(self):
-        dg = DependencyGraph()
-        for block in self.graph.iterblocks():
-            # Compute die_at = {Variable: index_of_operation_with_last_usage}
-            die_at = dict.fromkeys(block.inputargs, 0)
-            for i, op in enumerate(block.operations):
-                for v in op.args:
-                    if isinstance(v, Variable):
-                        die_at[v] = i
-                    elif isinstance(v, ListOfKind):
-                        for v1 in v:
-                            if isinstance(v1, Variable):
-                                die_at[v1] = i
-                if op.result is not None:
-                    die_at[op.result] = i + 1
-            if isinstance(block.exitswitch, tuple):
-                for x in block.exitswitch:
-                    die_at.pop(x, None)
-            else:
-                die_at.pop(block.exitswitch, None)
-            for link in block.exits:
-                for v in link.args:
-                    die_at.pop(v, None)
-            die_at = [(value, key) for (key, value) in die_at.items()]
-            die_at.sort()
-            die_at.append((sys.maxint,))
-            # Done.  XXX the code above this line runs 3 times
-            # (for kind in KINDS) to produce the same result...
-            livevars = [v for v in block.inputargs
-                          if getkind(v.concretetype) == self.kind]
-            # Add the variables of this block to the dependency graph
-            for i, v in enumerate(livevars):
-                dg.add_node(v)
-                for j in range(i):
-                    dg.add_edge(livevars[j], v)
-            livevars = set(livevars)
-            die_index = 0
-            for i, op in enumerate(block.operations):
-                while die_at[die_index][0] == i:
-                    try:
-                        livevars.remove(die_at[die_index][1])
-                    except KeyError:
-                        pass
-                    die_index += 1
-                if (op.result is not None and
-                    getkind(op.result.concretetype) == self.kind):
-                    dg.add_node(op.result)
-                    for v in livevars:
-                        if getkind(v.concretetype) == self.kind:
-                            dg.add_edge(v, op.result)
-                    livevars.add(op.result)
-        self._depgraph = dg
-
-    def coalesce_variables(self):
-        self._unionfind = UnionFind()
-        pendingblocks = list(self.graph.iterblocks())
-        while pendingblocks:
-            block = pendingblocks.pop()
-            # Aggressively try to coalesce each source variable with its
-            # target.  We start from the end of the graph instead of
-            # from the beginning.  This is a bit arbitrary, but the idea
-            # is that the end of the graph runs typically more often
-            # than the start, given that we resume execution from the
-            # middle during blackholing.
-            for link in block.exits:
-                if link.last_exception is not None:
-                    self._depgraph.add_node(link.last_exception)
-                if link.last_exc_value is not None:
-                    self._depgraph.add_node(link.last_exc_value)
-                for i, v in enumerate(link.args):
-                    self._try_coalesce(v, link.target.inputargs[i])
-
-    def _try_coalesce(self, v, w):
-        if isinstance(v, Variable) and getkind(v.concretetype) == self.kind:
-            assert getkind(w.concretetype) == self.kind
-            dg = self._depgraph
-            uf = self._unionfind
-            v0 = uf.find_rep(v)
-            w0 = uf.find_rep(w)
-            if v0 is not w0 and v0 not in dg.neighbours[w0]:
-                _, rep, _ = uf.union(v0, w0)
-                assert uf.find_rep(v0) is uf.find_rep(w0) is rep
-                if rep is v0:
-                    dg.coalesce(w0, v0)
-                else:
-                    assert rep is w0
-                    dg.coalesce(v0, w0)
-
-    def find_node_coloring(self):
-        self._coloring = self._depgraph.find_node_coloring()
-        if self.DEBUG_REGALLOC:
-            for block in self.graph.iterblocks():
-                print block
-                for v in block.getvariables():
-                    print '\t', v, '\t', self.getcolor(v)
-
-    def getcolor(self, v):
-        return self._coloring[self._unionfind.find_rep(v)]
-
-    def swapcolors(self, col1, col2):
-        for key, value in self._coloring.items():
-            if value == col1:
-                self._coloring[key] = col2
-            elif value == col2:
-                self._coloring[key] = col1
+    checkkind = lambda v: getkind(v.concretetype) == kind
+    return regalloc.perform_register_allocation(graph, checkkind, ListOfKind)
diff --git a/pypy/jit/codewriter/support.py b/pypy/jit/codewriter/support.py
--- a/pypy/jit/codewriter/support.py
+++ b/pypy/jit/codewriter/support.py
@@ -20,6 +20,7 @@
 from pypy.rpython.annlowlevel import MixLevelHelperAnnotator
 from pypy.jit.metainterp.typesystem import deref
 from pypy.rlib import rgc
+from pypy.rlib.jit import elidable
 from pypy.rlib.rarithmetic import r_longlong, r_ulonglong, r_uint, intmask
 
 def getargtypes(annotator, values):
@@ -167,9 +168,14 @@
 
 _ll_5_list_ll_arraycopy = rgc.ll_arraycopy
 
+@elidable
 def _ll_1_gc_identityhash(x):
     return lltype.identityhash(x)
 
+# the following function should not be "@elidable": I can think of
+# a corner case in which id(const) is constant-folded, and then 'const'
+# disappears and is collected too early (possibly causing another object
+# with the same id() to appear).
 def _ll_1_gc_id(ptr):
     return llop.gc_id(lltype.Signed, ptr)
 
@@ -420,10 +426,6 @@
     _ll_1_dict_values.need_result_type = True
     _ll_1_dict_items .need_result_type = True
 
-    def _ll_1_newdictiter(ITER, d):
-        return ll_rdict.ll_dictiter(lltype.Ptr(ITER), d)
-    _ll_1_newdictiter.need_result_type = True
-
     _dictnext_keys   = staticmethod(ll_rdict.ll_dictnext_group['keys'])
     _dictnext_values = staticmethod(ll_rdict.ll_dictnext_group['values'])
     _dictnext_items  = staticmethod(ll_rdict.ll_dictnext_group['items'])
@@ -574,10 +576,6 @@
     _ll_1_dict_values.need_result_type = True
     _ll_1_dict_items .need_result_type = True
 
-    def _ll_1_newdictiter(ITER, d):
-        return oo_rdict.ll_dictiter(ITER, d)
-    _ll_1_newdictiter.need_result_type = True
-
     _dictnext_keys   = staticmethod(oo_rdict.ll_dictnext_group['keys'])
     _dictnext_values = staticmethod(oo_rdict.ll_dictnext_group['values'])
     _dictnext_items  = staticmethod(oo_rdict.ll_dictnext_group['items'])
diff --git a/pypy/jit/codewriter/test/test_jtransform.py b/pypy/jit/codewriter/test/test_jtransform.py
--- a/pypy/jit/codewriter/test/test_jtransform.py
+++ b/pypy/jit/codewriter/test/test_jtransform.py
@@ -120,9 +120,9 @@
             assert argtypes[0] == [v.concretetype for v in op.args[1:]]
             assert argtypes[1] == op.result.concretetype
             if oopspecindex == EI.OS_STR2UNICODE:
-                assert extraeffect == None    # not pure, can raise!
+                assert extraeffect == EI.EF_ELIDABLE_CAN_RAISE
             else:
-                assert extraeffect == EI.EF_ELIDABLE
+                assert extraeffect == EI.EF_ELIDABLE_CANNOT_RAISE
         return 'calldescr-%d' % oopspecindex
     def calldescr_canraise(self, calldescr):
         return False
@@ -769,7 +769,7 @@
         def get_vinfo(self, v):
             return None
         def could_be_green_field(self, S1, name1):
-            assert S1 is S
+            assert S1 == S
             assert name1 == 'x'
             return True
     S = lltype.GcStruct('S', ('x', lltype.Char),
diff --git a/pypy/jit/codewriter/test/test_longlong.py b/pypy/jit/codewriter/test/test_longlong.py
--- a/pypy/jit/codewriter/test/test_longlong.py
+++ b/pypy/jit/codewriter/test/test_longlong.py
@@ -230,3 +230,18 @@
             assert list(op1.args[3]) == []
             assert list(op1.args[4]) == vlist
             assert op1.result == v_result
+
+
+##def test_singlefloat_constants():
+##    v_x = varoftype(TYPE)
+##    vlist = [v_x, const(rffi.cast(TYPE, 7))]
+##    v_result = varoftype(TYPE)
+##    op = SpaceOperation('llong_add', vlist, v_result)
+##    tr = Transformer(FakeCPU(), FakeBuiltinCallControl())
+##    op1 = tr.rewrite_operation(op)
+##    #
+##    assert op1.opname == 'residual_call_irf_f'
+##    assert list(op1.args[2]) == []
+##    assert list(op1.args[3]) == []
+##    assert list(op1.args[4]) == vlist
+##    assert op1.result == v_result
diff --git a/pypy/jit/codewriter/test/test_policy.py b/pypy/jit/codewriter/test/test_policy.py
--- a/pypy/jit/codewriter/test/test_policy.py
+++ b/pypy/jit/codewriter/test/test_policy.py
@@ -12,24 +12,30 @@
     graph = support.getgraph(f, [5])
     for sf in [False, True]:
         for sll in [False, True]:
-            assert not contains_unsupported_variable_type(graph, sf, sll)
+            for ssf in [False, True]:
+                assert not contains_unsupported_variable_type(graph, sf,
+                                                              sll, ssf)
     #
     graph = support.getgraph(f, [5.5])
     for sf in [False, True]:
         for sll in [False, True]:
-            res = contains_unsupported_variable_type(graph, sf, sll)
-            assert res is not sf
+            for ssf in [False, True]:
+                res = contains_unsupported_variable_type(graph, sf, sll, ssf)
+                assert res is not sf
     #
     graph = support.getgraph(f, [r_singlefloat(5.5)])
     for sf in [False, True]:
         for sll in [False, True]:
-            assert contains_unsupported_variable_type(graph, sf, sll)
+            for ssf in [False, True]:
+                res = contains_unsupported_variable_type(graph, sf, sll, ssf)
+                assert res == (not ssf)
     #
     graph = support.getgraph(f, [r_longlong(5)])
     for sf in [False, True]:
         for sll in [False, True]:
-            res = contains_unsupported_variable_type(graph, sf, sll)
-            assert res == (sys.maxint == 2147483647 and not sll)
+            for ssf in [False, True]:
+                res = contains_unsupported_variable_type(graph, sf, sll, ssf)
+                assert res == (sys.maxint == 2147483647 and not sll)
 
 
 def test_regular_function():
diff --git a/pypy/jit/metainterp/blackhole.py b/pypy/jit/metainterp/blackhole.py
--- a/pypy/jit/metainterp/blackhole.py
+++ b/pypy/jit/metainterp/blackhole.py
@@ -500,6 +500,9 @@
     @arguments("r", returns="i")
     def bhimpl_ptr_nonzero(a):
         return bool(a)
+    @arguments("r", returns="r")
+    def bhimpl_cast_opaque_ptr(a):
+        return a
 
     @arguments("i", returns="i")
     def bhimpl_int_copy(a):
@@ -623,6 +626,19 @@
         x = float(a)
         return longlong.getfloatstorage(x)
 
+    @arguments("f", returns="i")
+    def bhimpl_cast_float_to_singlefloat(a):
+        from pypy.rlib.rarithmetic import r_singlefloat
+        a = longlong.getrealfloat(a)
+        a = r_singlefloat(a)
+        return longlong.singlefloat2int(a)
+
+    @arguments("i", returns="f")
+    def bhimpl_cast_singlefloat_to_float(a):
+        a = longlong.int2singlefloat(a)
+        a = float(a)
+        return longlong.getfloatstorage(a)
+
     # ----------
     # control flow operations
 
diff --git a/pypy/jit/metainterp/compile.py b/pypy/jit/metainterp/compile.py
--- a/pypy/jit/metainterp/compile.py
+++ b/pypy/jit/metainterp/compile.py
@@ -673,10 +673,9 @@
     def handle_fail(self, metainterp_sd, jitdriver_sd):
         cpu = metainterp_sd.cpu
         exception = cpu.grab_exc_value()
+        assert exception, "PropagateExceptionDescr: no exception??"
         raise metainterp_sd.ExitFrameWithExceptionRef(cpu, exception)
 
-propagate_exception_descr = PropagateExceptionDescr()
-
 def compile_tmp_callback(cpu, jitdriver_sd, greenboxes, redboxes,
                          memory_manager=None):
     """Make a LoopToken that corresponds to assembler code that just
@@ -710,7 +709,7 @@
         finishargs = []
     #
     jd = jitdriver_sd
-    faildescr = propagate_exception_descr
+    faildescr = PropagateExceptionDescr()
     operations = [
         ResOperation(rop.CALL, callargs, result, descr=jd.portal_calldescr),
         ResOperation(rop.GUARD_NO_EXCEPTION, [], None, descr=faildescr),
diff --git a/pypy/jit/metainterp/executor.py b/pypy/jit/metainterp/executor.py
--- a/pypy/jit/metainterp/executor.py
+++ b/pypy/jit/metainterp/executor.py
@@ -50,7 +50,7 @@
     func = argboxes[0].getint()
     # do the call using the correct function from the cpu
     rettype = descr.get_return_type()
-    if rettype == INT:
+    if rettype == INT or rettype == 'S':       # *S*ingle float
         try:
             result = cpu.bh_call_i(func, descr, args_i, args_r, args_f)
         except Exception, e:
@@ -64,7 +64,7 @@
             metainterp.execute_raised(e)
             result = NULL
         return BoxPtr(result)
-    if rettype == FLOAT or rettype == 'L':
+    if rettype == FLOAT or rettype == 'L':     # *L*ong long
         try:
             result = cpu.bh_call_f(func, descr, args_i, args_r, args_f)
         except Exception, e:
diff --git a/pypy/jit/metainterp/history.py b/pypy/jit/metainterp/history.py
--- a/pypy/jit/metainterp/history.py
+++ b/pypy/jit/metainterp/history.py
@@ -20,12 +20,16 @@
 
 FAILARGS_LIMIT = 1000
 
-def getkind(TYPE, supports_floats=True, supports_longlong=True):
+def getkind(TYPE, supports_floats=True,
+                  supports_longlong=True,
+                  supports_singlefloats=True):
     if TYPE is lltype.Void:
         return "void"
     elif isinstance(TYPE, lltype.Primitive):
         if TYPE is lltype.Float and supports_floats:
             return 'float'
+        if TYPE is lltype.SingleFloat and supports_singlefloats:
+            return 'int'     # singlefloats are stored in an int
         if TYPE in (lltype.Float, lltype.SingleFloat):
             raise NotImplementedError("type %s not supported" % TYPE)
         # XXX fix this for oo...
@@ -145,6 +149,7 @@
         """ Implement in call descr.
         Must return INT, REF, FLOAT, or 'v' for void.
         On 32-bit (hack) it can also be 'L' for longlongs.
+        Additionally it can be 'S' for singlefloats.
         """
         raise NotImplementedError
 
diff --git a/pypy/jit/metainterp/optimizeopt/__init__.py b/pypy/jit/metainterp/optimizeopt/__init__.py
--- a/pypy/jit/metainterp/optimizeopt/__init__.py
+++ b/pypy/jit/metainterp/optimizeopt/__init__.py
@@ -51,7 +51,7 @@
 
 
 def optimize_loop_1(metainterp_sd, loop, enable_opts,
-                    inline_short_preamble=True, retraced=False):
+                    inline_short_preamble=True, retraced=False, bridge=False):
     """Optimize loop.operations to remove internal overheadish operations.
     """
 
@@ -60,7 +60,7 @@
     if unroll:
         optimize_unroll(metainterp_sd, loop, optimizations)
     else:
-        optimizer = Optimizer(metainterp_sd, loop, optimizations)
+        optimizer = Optimizer(metainterp_sd, loop, optimizations, bridge)
         optimizer.propagate_all_forward()
 
 def optimize_bridge_1(metainterp_sd, bridge, enable_opts,
@@ -72,7 +72,7 @@
     except KeyError:
         pass
     optimize_loop_1(metainterp_sd, bridge, enable_opts,
-                    inline_short_preamble, retraced)
+                    inline_short_preamble, retraced, bridge=True)
 
 if __name__ == '__main__':
     print ALL_OPTS_NAMES
diff --git a/pypy/jit/metainterp/optimizeopt/fficall.py b/pypy/jit/metainterp/optimizeopt/fficall.py
--- a/pypy/jit/metainterp/optimizeopt/fficall.py
+++ b/pypy/jit/metainterp/optimizeopt/fficall.py
@@ -1,12 +1,11 @@
 from pypy.rpython.annlowlevel import cast_base_ptr_to_instance
 from pypy.rlib.objectmodel import we_are_translated
 from pypy.rlib.libffi import Func
-from pypy.rlib.debug import debug_start, debug_stop, debug_print, have_debug_prints
+from pypy.rlib.debug import debug_print
 from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
 from pypy.jit.metainterp.optimizeopt.optimizer import Optimization
-from pypy.jit.backend.llsupport.ffisupport import UnsupportedKind
 
 
 class FuncInfo(object):
@@ -20,11 +19,8 @@
         self.funcval = funcval
         self.opargs = []
         argtypes, restype = self._get_signature(funcval)
-        try:
-            self.descr = cpu.calldescrof_dynamic(argtypes, restype)
-        except UnsupportedKind:
-            # e.g., I or U for long longs
-            self.descr = None
+        self.descr = cpu.calldescrof_dynamic(argtypes, restype)
+        # ^^^ may be None if unsupported
         self.prepare_op = prepare_op
         self.delayed_ops = []
 
@@ -48,7 +44,7 @@
           inst_argtypes is actually a low-level array, but we can use it
           directly since the only thing we do with it is to read its items
         """
-        
+
         llfunc = funcval.box.getref_base()
         if we_are_translated():
             func = cast_base_ptr_to_instance(Func, llfunc)
@@ -78,14 +74,6 @@
         else:
             self.logops = None
 
-    def propagate_begin_forward(self):
-        debug_start('jit-log-ffiopt')
-        Optimization.propagate_begin_forward(self)
-
-    def propagate_end_forward(self):
-        debug_stop('jit-log-ffiopt')
-        Optimization.propagate_end_forward(self)
-
     def new(self):
         return OptFfiCall()
     
@@ -188,7 +176,8 @@
     def do_call(self, op):
         funcval = self._get_funcval(op)
         funcinfo = self.funcinfo
-        if not funcinfo or funcinfo.funcval is not funcval:
+        if (not funcinfo or funcinfo.funcval is not funcval or
+            funcinfo.descr is None):
             return [op] # cannot optimize
         funcsymval = self.getvalue(op.getarg(2))
         arglist = [funcsymval.force_box()]
diff --git a/pypy/jit/metainterp/optimizeopt/heap.py b/pypy/jit/metainterp/optimizeopt/heap.py
--- a/pypy/jit/metainterp/optimizeopt/heap.py
+++ b/pypy/jit/metainterp/optimizeopt/heap.py
@@ -1,10 +1,11 @@
 import os
-from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
-from pypy.jit.metainterp.resoperation import rop, ResOperation
-from pypy.rlib.objectmodel import we_are_translated
+
 from pypy.jit.metainterp.jitexc import JitException
 from pypy.jit.metainterp.optimizeopt.optimizer import Optimization, MODE_ARRAY
 from pypy.jit.metainterp.history import ConstInt, Const
+from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
+from pypy.jit.metainterp.resoperation import rop
+from pypy.rlib.objectmodel import we_are_translated
 
 
 class CachedField(object):
@@ -284,13 +285,14 @@
             return
         cf.force_lazy_setfield(self, can_cache)
 
-    def force_lazy_setarrayitem(self, arraydescr, can_cache=True):
+    def force_lazy_setarrayitem(self, arraydescr, indexvalue=None, can_cache=True):
         try:
             submap = self.cached_arrayitems[arraydescr]
         except KeyError:
             return
-        for cf in submap.values():
-            cf.force_lazy_setfield(self, can_cache)
+        for idx, cf in submap.iteritems():
+            if indexvalue is None or indexvalue.intbound.contains(idx):
+                cf.force_lazy_setfield(self, can_cache)
 
     def fixup_guard_situation(self):
         # hackish: reverse the order of the last two operations if it makes
@@ -403,7 +405,7 @@
                 return
         else:
             # variable index, so make sure the lazy setarrayitems are done
-            self.force_lazy_setarrayitem(op.getdescr())
+            self.force_lazy_setarrayitem(op.getdescr(), indexvalue=indexvalue)
         # default case: produce the operation
         arrayvalue.ensure_nonnull()
         self.emit_operation(op)
@@ -429,7 +431,7 @@
             cf.do_setfield(self, op)
         else:
             # variable index, so make sure the lazy setarrayitems are done
-            self.force_lazy_setarrayitem(op.getdescr(), can_cache=False)
+            self.force_lazy_setarrayitem(op.getdescr(), indexvalue=indexvalue, can_cache=False)
             # and then emit the operation
             self.emit_operation(op)
 
diff --git a/pypy/jit/metainterp/optimizeopt/intbounds.py b/pypy/jit/metainterp/optimizeopt/intbounds.py
--- a/pypy/jit/metainterp/optimizeopt/intbounds.py
+++ b/pypy/jit/metainterp/optimizeopt/intbounds.py
@@ -1,10 +1,11 @@
 from pypy.jit.metainterp.optimizeopt.optimizer import Optimization, CONST_1, CONST_0, \
                                                   MODE_ARRAY, MODE_STR, MODE_UNICODE
+from pypy.jit.metainterp.history import ConstInt
+from pypy.jit.metainterp.optimizeopt.intutils import (IntBound, IntLowerBound,
+    IntUpperBound)
 from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
-from pypy.jit.metainterp.optimizeopt.intutils import (IntBound, IntUnbounded,
-    IntLowerBound, IntUpperBound)
-from pypy.jit.metainterp.history import Const, ConstInt
-from pypy.jit.metainterp.resoperation import rop, ResOperation
+from pypy.jit.metainterp.resoperation import rop
+
 
 class OptIntBounds(Optimization):
     """Keeps track of the bounds placed on integers by guards and remove
@@ -129,6 +130,17 @@
         r = self.getvalue(op.result)
         r.intbound.intersect(v1.intbound.div_bound(v2.intbound))
 
+    def optimize_INT_MOD(self, op):
+        self.emit_operation(op)
+        v2 = self.getvalue(op.getarg(1))
+        if v2.is_constant():
+            val = v2.box.getint()
+            r = self.getvalue(op.result)
+            if val < 0:
+                val = -val
+            r.intbound.make_gt(IntBound(-val, -val))
+            r.intbound.make_lt(IntBound(val, val))
+
     def optimize_INT_LSHIFT(self, op):
         v1 = self.getvalue(op.getarg(0))
         v2 = self.getvalue(op.getarg(1))
diff --git a/pypy/jit/metainterp/optimizeopt/optimizer.py b/pypy/jit/metainterp/optimizeopt/optimizer.py
--- a/pypy/jit/metainterp/optimizeopt/optimizer.py
+++ b/pypy/jit/metainterp/optimizeopt/optimizer.py
@@ -1,19 +1,13 @@
-from pypy.jit.metainterp.history import Box, BoxInt, LoopToken, BoxFloat,\
-     ConstFloat
-from pypy.jit.metainterp.history import Const, ConstInt, ConstPtr, ConstObj, REF
-from pypy.jit.metainterp.resoperation import rop, ResOperation
-from pypy.jit.metainterp import jitprof
+from pypy.jit.metainterp import jitprof, resume, compile
 from pypy.jit.metainterp.executor import execute_nonspec
-from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method, sort_descrs
-from pypy.jit.metainterp.optimizeopt.util import descrlist_dict, args_dict
-from pypy.jit.metainterp.optimize import InvalidLoop
-from pypy.jit.metainterp import resume, compile
-from pypy.jit.metainterp.typesystem import llhelper, oohelper
-from pypy.rpython.lltypesystem import lltype
-from pypy.jit.metainterp.history import AbstractDescr, make_hashable_int
+from pypy.jit.metainterp.history import BoxInt, BoxFloat, Const, ConstInt, REF
 from pypy.jit.metainterp.optimizeopt.intutils import IntBound, IntUnbounded, \
                                                      ImmutableIntUnbounded, \
                                                      IntLowerBound, MININT, MAXINT
+from pypy.jit.metainterp.optimizeopt.util import (make_dispatcher_method,
+    args_dict)
+from pypy.jit.metainterp.resoperation import rop, ResOperation
+from pypy.jit.metainterp.typesystem import llhelper, oohelper
 from pypy.tool.pairtype import extendabletype
 from pypy.rlib.debug import debug_start, debug_stop, debug_print
 
@@ -337,10 +331,11 @@
 
 class Optimizer(Optimization):
 
-    def __init__(self, metainterp_sd, loop, optimizations=None):
+    def __init__(self, metainterp_sd, loop, optimizations=None, bridge=False):
         self.metainterp_sd = metainterp_sd
         self.cpu = metainterp_sd.cpu
         self.loop = loop
+        self.bridge = bridge
         self.values = {}
         self.interned_refs = self.cpu.ts.new_ref_dict()
         self.resumedata_memo = resume.ResumeDataLoopMemo(metainterp_sd)
@@ -352,6 +347,7 @@
         self.posponedop = None
         self.exception_might_have_happened = False
         self.quasi_immutable_deps = None
+        self.opaque_pointers = {}
         self.newoperations = []
         self.emitting_dissabled = False
         if loop is not None:
@@ -543,9 +539,7 @@
             return CVAL_ZERO
 
     def propagate_all_forward(self):
-        self.exception_might_have_happened = True
-        # ^^^ at least at the start of bridges.  For loops, we could set
-        # it to False, but we probably don't care
+        self.exception_might_have_happened = self.bridge
         self.newoperations = []
         self.first_optimization.propagate_begin_forward()
         self.i = 0
@@ -692,6 +686,11 @@
     def optimize_DEBUG_MERGE_POINT(self, op):
         self.emit_operation(op)
 
+    def optimize_CAST_OPAQUE_PTR(self, op):
+        value = self.getvalue(op.getarg(0))
+        self.opaque_pointers[value] = True
+        self.make_equal_to(op.result, value)
+
     def optimize_GETARRAYITEM_GC_PURE(self, op):
         indexvalue = self.getvalue(op.getarg(1))
         if indexvalue.is_constant():
diff --git a/pypy/jit/metainterp/optimizeopt/rewrite.py b/pypy/jit/metainterp/optimizeopt/rewrite.py
--- a/pypy/jit/metainterp/optimizeopt/rewrite.py
+++ b/pypy/jit/metainterp/optimizeopt/rewrite.py
@@ -1,10 +1,11 @@
+from pypy.jit.codewriter.effectinfo import EffectInfo
+from pypy.jit.metainterp.history import ConstInt, make_hashable_int
+from pypy.jit.metainterp.optimize import InvalidLoop
+from pypy.jit.metainterp.optimizeopt.intutils import IntBound
 from pypy.jit.metainterp.optimizeopt.optimizer import *
-from pypy.jit.metainterp.resoperation import opboolinvers, opboolreflex
-from pypy.jit.metainterp.history import ConstInt
 from pypy.jit.metainterp.optimizeopt.util import _findall, make_dispatcher_method
-from pypy.jit.metainterp.resoperation import rop, ResOperation
-from pypy.jit.codewriter.effectinfo import EffectInfo
-from pypy.jit.metainterp.optimizeopt.intutils import IntBound
+from pypy.jit.metainterp.resoperation import (opboolinvers, opboolreflex, rop,
+    ResOperation)
 from pypy.rlib.rarithmetic import highest_bit
 
 
diff --git a/pypy/jit/metainterp/optimizeopt/simplify.py b/pypy/jit/metainterp/optimizeopt/simplify.py
--- a/pypy/jit/metainterp/optimizeopt/simplify.py
+++ b/pypy/jit/metainterp/optimizeopt/simplify.py
@@ -1,7 +1,7 @@
-
-from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.optimizeopt.optimizer import Optimization
 from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
+from pypy.jit.metainterp.resoperation import ResOperation, rop
+
 
 class OptSimplify(Optimization):
     def optimize_CALL_PURE(self, op):
@@ -25,6 +25,8 @@
         #     but it's a bit hard to implement robustly if heap.py is also run
         pass
 
+    optimize_CAST_OPAQUE_PTR = optimize_VIRTUAL_REF
+
 
 dispatch_opt = make_dispatcher_method(OptSimplify, 'optimize_',
         default=OptSimplify.emit_operation)
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizebasic.py
@@ -693,7 +693,6 @@
         """
         expected = """
         [i]
-        guard_no_exception() []
         i1 = int_add(i, 3)
         i2 = call(i1, descr=nonwritedescr)
         guard_no_exception() [i1, i2]
@@ -4532,7 +4531,7 @@
         escape(i1)
         jump(p0, i0)
         """
-        self.optimize_loop(ops, expected)
+        self.optimize_strunicode_loop(ops, expected)
 
     def test_int_is_true_bounds(self):
         ops = """
@@ -4551,7 +4550,7 @@
         guard_true(i1) []
         jump(p0)
         """
-        self.optimize_loop(ops, expected)
+        self.optimize_strunicode_loop(ops, expected)
 
     def test_strslice_subtraction_folds(self):
         ops = """
@@ -4586,6 +4585,132 @@
         """
         self.optimize_loop(ops, expected)
 
+    def test_null_char_str(self):
+        ops = """
+        [p0]
+        p1 = newstr(4)
+        setfield_gc(p0, p1, descr=valuedescr)
+        jump(p0)
+        """
+        # It used to be the case that this would have a series of
+        # strsetitem(p1, idx, 0), which was silly because memory is 0 filled
+        # when allocated.
+        expected = """
+        [p0]
+        p1 = newstr(4)
+        setfield_gc(p0, p1, descr=valuedescr)
+        jump(p0)
+        """
+        self.optimize_strunicode_loop(ops, expected)
+
+    def test_newstr_strlen(self):
+        ops = """
+        [i0]
+        p0 = newstr(i0)
+        escape(p0)
+        i1 = strlen(p0)
+        i2 = int_add(i1, 1)
+        jump(i2)
+        """
+        expected = """
+        [i0]
+        p0 = newstr(i0)
+        escape(p0)
+        i1 = int_add(i0, 1)
+        jump(i1)
+        """
+        self.optimize_strunicode_loop(ops, expected)
+
+    def test_intmod_bounds(self):
+        ops = """
+        [i0, i1]
+        i2 = int_mod(i0, 12)
+        i3 = int_gt(i2, 12)
+        guard_false(i3) []
+        i4 = int_lt(i2, -12)
+        guard_false(i4) []
+        i5 = int_mod(i1, -12)
+        i6 = int_lt(i5, -12)
+        guard_false(i6) []
+        i7 = int_gt(i5, 12)
+        guard_false(i7) []
+        jump(i2, i5)
+        """
+        expected = """
+        [i0, i1]
+        i2 = int_mod(i0, 12)
+        i5 = int_mod(i1, -12)
+        jump(i2, i5)
+        """
+        self.optimize_loop(ops, expected)
+
+        # This is the sequence of resoperations that is generated for a Python
+        # app-level int % int.  When the modulus is constant and when i0
+        # is known non-negative it should be optimized to a single int_mod.
+        ops = """
+        [i0]
+        i5 = int_ge(i0, 0)
+        guard_true(i5) []
+        i1 = int_mod(i0, 42)
+        i2 = int_rshift(i1, 63)
+        i3 = int_and(42, i2)
+        i4 = int_add(i1, i3)
+        finish(i4)
+        """
+        expected = """
+        [i0]
+        i5 = int_ge(i0, 0)
+        guard_true(i5) []
+        i1 = int_mod(i0, 42)
+        finish(i1)
+        """
+        py.test.skip("in-progress")
+        self.optimize_loop(ops, expected)
+
+        # Also, 'n % power-of-two' can be turned into int_and(),
+        # but that's a bit harder to detect here because it turns into
+        # several operations, and of course it is wrong to just turn
+        # int_mod(i0, 16) into int_and(i0, 15).
+        ops = """
+        [i0]
+        i1 = int_mod(i0, 16)
+        i2 = int_rshift(i1, 63)
+        i3 = int_and(16, i2)
+        i4 = int_add(i1, i3)
+        finish(i4)
+        """
+        expected = """
+        [i0]
+        i4 = int_and(i0, 15)
+        finish(i4)
+        """
+        py.test.skip("harder")
+        self.optimize_loop(ops, expected)
+
+    def test_bounded_lazy_setfield(self):
+        ops = """
+        [p0, i0]
+        i1 = int_gt(i0, 2)
+        guard_true(i1) []
+        setarrayitem_gc(p0, 0, 3)
+        setarrayitem_gc(p0, 2, 4)
+        setarrayitem_gc(p0, i0, 15)
+        i2 = getarrayitem_gc(p0, 2)
+        jump(p0, i2)
+        """
+        # Remove the getarrayitem_gc, because we know that p0[i0] does not alias
+        # p0[2]
+        expected = """
+        [p0, i0]
+        i1 = int_gt(i0, 2)
+        guard_true(i1) []
+        setarrayitem_gc(p0, i0, 15)
+        setarrayitem_gc(p0, 0, 3)
+        setarrayitem_gc(p0, 2, 4)
+        jump(p0, 4)
+        """
+        self.optimize_loop(ops, expected)
+
 
 class TestLLtype(BaseTestOptimizeBasic, LLtypeMixin):
     pass
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizeopt.py
@@ -917,12 +917,10 @@
         i3 = call(i2, descr=nonwritedescr)
         jump(i1)       # the exception is considered lost when we loop back
         """
-        # note that 'guard_no_exception' at the very start must be kept
-        # around: bridges may start with one.  (In case of loops we could
-        # remove it, but we probably don't care.)
+        # note that 'guard_no_exception' at the very start is kept around
+        # for bridges, but not for loops
         preamble = """
         [i]
-        guard_no_exception() []
         i1 = int_add(i, 3)
         i2 = call(i1, descr=nonwritedescr)
         guard_no_exception() [i1, i2]
@@ -3247,6 +3245,38 @@
         '''
         self.optimize_loop(ops, expected, preamble, call_pure_results)
 
+    def test_call_pure_constant_folding_exc(self):
+        # CALL_PURE may be followed by GUARD_NO_EXCEPTION
+        arg_consts = [ConstInt(i) for i in (123456, 4, 5, 6)]
+        call_pure_results = {tuple(arg_consts): ConstInt(42)}
+        ops = '''
+        [i0, i1, i2]
+        escape(i1)
+        escape(i2)
+        i3 = call_pure(123456, 4, 5, 6, descr=plaincalldescr)
+        guard_no_exception() []
+        i4 = call_pure(123456, 4, i0, 6, descr=plaincalldescr)
+        guard_no_exception() []
+        jump(i0, i3, i4)
+        '''
+        preamble = '''
+        [i0, i1, i2]
+        escape(i1)
+        escape(i2)
+        i4 = call(123456, 4, i0, 6, descr=plaincalldescr)
+        guard_no_exception() []
+        jump(i0, i4)
+        '''
+        expected = '''
+        [i0, i2]
+        escape(42)
+        escape(i2)
+        i4 = call(123456, 4, i0, 6, descr=plaincalldescr)
+        guard_no_exception() []
+        jump(i0, i4)
+        '''
+        self.optimize_loop(ops, expected, preamble, call_pure_results)
+
     # ----------
 
     def test_vref_nonvirtual_nonescape(self):
diff --git a/pypy/jit/metainterp/optimizeopt/unroll.py b/pypy/jit/metainterp/optimizeopt/unroll.py
--- a/pypy/jit/metainterp/optimizeopt/unroll.py
+++ b/pypy/jit/metainterp/optimizeopt/unroll.py
@@ -1,15 +1,14 @@
+from pypy.jit.codewriter.effectinfo import EffectInfo
+from pypy.jit.metainterp.optimizeopt.virtualstate import VirtualStateAdder, ShortBoxes
+from pypy.jit.metainterp.compile import ResumeGuardDescr
+from pypy.jit.metainterp.history import TreeLoop, LoopToken
+from pypy.jit.metainterp.jitexc import JitException
+from pypy.jit.metainterp.optimize import InvalidLoop, RetraceLoop
 from pypy.jit.metainterp.optimizeopt.optimizer import *
-from pypy.jit.metainterp.optimizeopt.virtualstate import VirtualStateAdder, ShortBoxes
+from pypy.jit.metainterp.optimizeopt.generalize import KillHugeIntBounds
 from pypy.jit.metainterp.resoperation import rop, ResOperation
-from pypy.jit.metainterp.compile import ResumeGuardDescr
 from pypy.jit.metainterp.resume import Snapshot
-from pypy.jit.metainterp.history import TreeLoop, LoopToken
-from pypy.rlib.debug import debug_start, debug_stop, debug_print
-from pypy.jit.metainterp.optimize import InvalidLoop, RetraceLoop
-from pypy.jit.metainterp.jitexc import JitException
-from pypy.jit.metainterp.history import make_hashable_int
-from pypy.jit.codewriter.effectinfo import EffectInfo
-from pypy.jit.metainterp.optimizeopt.generalize import KillHugeIntBounds
+from pypy.rlib.debug import debug_print
 
 # Assumptions
 # ===========
@@ -23,7 +22,7 @@
 # are also recreated to allow virtuals not supported to be forced.
 #
 # First of all, the optimizations are not allowed to introduce new
-# boxes. It is the unoptimized version of the trace that is inlined to 
+# boxes. It is the unoptimized version of the trace that is inlined to
 # form the second iteration of the loop. Otherwise the
 # state of the virtuals would not be updated correctly. Whenever some
 # box from the first iteration is reused in the second iteration, it
@@ -58,7 +57,7 @@
 # be absorbed into the virtual p2 and never seen by the heap
 # optimizer. At the end of the loop both p2 and p3 are virtuals, but
 # the loop needs p2 to be a pointer to be able to call itself. So it
-# is forced producing the operations 
+# is forced producing the operations
 #
 #         p2 = new_with_vtable(ConstClass(node_vtable))
 #         setfield_gc(p2, i2, descr=nextdescr)
@@ -69,7 +68,7 @@
 # the trace were optimized under the wrong assumption that the
 # setfield_gc was store sinked which could lead to errors. In this
 # case what would happen is that it would be inserted once more in
-# front of the guard. 
+# front of the guard.
 
 
 
@@ -116,7 +115,7 @@
     def inline_descr_inplace(self, descr):
         if isinstance(descr, ResumeGuardDescr):
             descr.rd_snapshot = self.inline_snapshot(descr.rd_snapshot)
-            
+
     def inline_arg(self, arg):
         if arg is None:
             return None
@@ -132,11 +131,12 @@
         self.snapshot_map[snapshot] = new_snapshot
         return new_snapshot
 
+
 class UnrollOptimizer(Optimization):
     """Unroll the loop into two iterations. The first one will
     become the preamble or entry bridge (don't think there is a
     distinction anymore)"""
-    
+
     def __init__(self, metainterp_sd, loop, optimizations):
         self.optimizer = Optimizer(metainterp_sd, loop, optimizations)
         self.cloned_operations = []
@@ -329,7 +329,7 @@
                 for op in short_loop.operations:
                     if op.result:
                         op.result.forget_value()
-                
+
     def inline(self, inputargs, loop_operations, loop_args, short_inputargs, virtual_state):
         inliner = self.inliner
 
@@ -546,7 +546,7 @@
                             "at preamble position: ", preamble_i,
                             "loop position: ", loop_i)
                 return None
-                
+
             if self.sameop(newop, loop_ops[loop_i]) \
                and loop_i < len(loop_ops):
                 try:
@@ -567,7 +567,7 @@
                                 "loop position: ", loop_i)
                     return None
                 short_preamble.append(op)
-                
+
             state.update(op)
             preamble_i += 1
 
@@ -577,7 +577,7 @@
                         "at position", loop_i)
             return None
 
-        
+
         jumpargs = []
         for i in range(len(loop.inputargs)):
             try:
@@ -605,7 +605,7 @@
                     return None
             if op.result:
                 seen[op.result] = True
-        
+
         return short_preamble
 
 class ExeState(object):
@@ -615,13 +615,16 @@
         self.unsafe_getitem = {}
         self.unsafe_getarrayitem = {}
         self.unsafe_getarrayitem_indexes = {}
-        
+
     # Make sure it is safe to move the instrucions in short_preamble
     # to the top making short_preamble followed by loop equvivalent
     # to preamble
     def safe_to_move(self, op):
         opnum = op.getopnum()
         descr = op.getdescr()
+        for box in op.getarglist():
+            if self.optimizer.getvalue(box) in self.optimizer.opaque_pointers:
+                return False
         if op.is_always_pure() or op.is_foldable_guard():
             return True
         elif opnum == rop.JUMP:
@@ -652,15 +655,17 @@
         elif opnum == rop.CALL:
             effectinfo = descr.get_extra_info()
             if effectinfo is not None:
-                if effectinfo.extraeffect == EffectInfo.EF_LOOPINVARIANT or \
-                   effectinfo.extraeffect == EffectInfo.EF_ELIDABLE:
+                ef = effectinfo.extraeffect
+                if ef == EffectInfo.EF_LOOPINVARIANT or \
+                   ef == EffectInfo.EF_ELIDABLE_CANNOT_RAISE or \
+                   ef == EffectInfo.EF_ELIDABLE_CAN_RAISE:
                     return True
         return False
-    
+
     def update(self, op):
         if (op.has_no_side_effect() or
             op.is_ovf() or
-            op.is_guard()): 
+            op.is_guard()):
             return
         opnum = op.getopnum()
         descr = op.getdescr()
@@ -673,7 +678,7 @@
         if (opnum == rop.SETARRAYITEM_GC or
             opnum == rop.SETARRAYITEM_RAW):
             index = op.getarg(1)
-            if isinstance(index, Const):                
+            if isinstance(index, Const):
                 d = self.unsafe_getarrayitem_indexes.get(descr, None)
                 if d is None:
                     d = self.unsafe_getarrayitem_indexes[descr] = {}
@@ -699,7 +704,7 @@
     def __init__(self):
         self.map = {}
 
-    
+
     def link_ops(self, preambleop, loopop):
         pargs = preambleop.getarglist()
         largs = loopop.getarglist()
@@ -713,7 +718,7 @@
             if not loopop.result:
                 raise ImpossibleLink
             self.link_boxes(preambleop.result, loopop.result)
-        
+
 
     def link_boxes(self, pbox, lbox):
         if lbox in self.map:
@@ -736,11 +741,11 @@
 
     def new(self):
         return OptInlineShortPreamble(self.retraced)
-        
+
     def reconstruct_for_next_iteration(self,  short_boxes, surviving_boxes,
                                        optimizer, valuemap):
         return OptInlineShortPreamble(self.retraced)
-    
+
     def propagate_forward(self, op):
         if op.getopnum() == rop.JUMP:
             loop_token = op.getdescr()
@@ -773,7 +778,7 @@
                             sh.virtual_state.generate_guards(virtual_state,
                                                              args, cpu,
                                                              extra_guards)
-                            
+
                             ok = True
                             debugmsg = 'Guarded to match '
                         except InvalidLoop:
@@ -821,6 +826,7 @@
                     else:
                         debug_print("Retracing (%d of %d)" % (retraced_count,
                                                               limit))
+
                         raise RetraceLoop
                 else:
                     if not loop_token.failed_states:
@@ -828,3 +834,7 @@
                     else:
                         loop_token.failed_states.append(virtual_state)
         self.emit_operation(op)
+
+
+
+
diff --git a/pypy/jit/metainterp/optimizeopt/virtualize.py b/pypy/jit/metainterp/optimizeopt/virtualize.py
--- a/pypy/jit/metainterp/optimizeopt/virtualize.py
+++ b/pypy/jit/metainterp/optimizeopt/virtualize.py
@@ -1,12 +1,12 @@
+from pypy.jit.codewriter.heaptracker import vtable2descr
+from pypy.jit.metainterp.executor import execute
 from pypy.jit.metainterp.history import Const, ConstInt, BoxInt
+from pypy.jit.metainterp.optimizeopt import optimizer
+from pypy.jit.metainterp.optimizeopt.util import (make_dispatcher_method,
+    descrlist_dict, sort_descrs)
 from pypy.jit.metainterp.resoperation import rop, ResOperation
-from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
-from pypy.jit.metainterp.optimizeopt.util import descrlist_dict, sort_descrs
 from pypy.rlib.objectmodel import we_are_translated
-from pypy.jit.metainterp.optimizeopt import optimizer
 from pypy.jit.metainterp.optimizeopt.optimizer import OptValue
-from pypy.jit.metainterp.executor import execute
-from pypy.jit.codewriter.heaptracker import vtable2descr
 
 
 class AbstractVirtualValue(optimizer.OptValue):
diff --git a/pypy/jit/metainterp/optimizeopt/vstring.py b/pypy/jit/metainterp/optimizeopt/vstring.py
--- a/pypy/jit/metainterp/optimizeopt/vstring.py
+++ b/pypy/jit/metainterp/optimizeopt/vstring.py
@@ -1,18 +1,14 @@
-from pypy.rpython.lltypesystem import lltype, rstr, llmemory
+from pypy.jit.codewriter.effectinfo import EffectInfo
+from pypy.jit.metainterp.history import (BoxInt, Const, ConstInt, ConstPtr,
+    get_const_ptr_for_string, get_const_ptr_for_unicode)
+from pypy.jit.metainterp.optimizeopt import optimizer, virtualize
+from pypy.jit.metainterp.optimizeopt.optimizer import CONST_0, CONST_1, llhelper
+from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
+from pypy.jit.metainterp.resoperation import rop, ResOperation
+from pypy.rlib.objectmodel import specialize, we_are_translated
+from pypy.rlib.unroll import unrolling_iterable
 from pypy.rpython import annlowlevel
-from pypy.jit.metainterp.history import Box, BoxInt, BoxPtr
-from pypy.jit.metainterp.history import Const, ConstInt, ConstPtr
-from pypy.jit.metainterp.history import get_const_ptr_for_string
-from pypy.jit.metainterp.history import get_const_ptr_for_unicode
-from pypy.jit.metainterp.resoperation import rop, ResOperation
-from pypy.jit.metainterp.optimizeopt import optimizer, virtualize
-from pypy.jit.metainterp.optimizeopt.optimizer import CONST_0, CONST_1
-from pypy.jit.metainterp.optimizeopt.optimizer import llhelper
-from pypy.jit.metainterp.optimizeopt.util import make_dispatcher_method
-from pypy.jit.codewriter.effectinfo import EffectInfo
-from pypy.jit.codewriter import heaptracker
-from pypy.rlib.unroll import unrolling_iterable
-from pypy.rlib.objectmodel import specialize, we_are_translated
+from pypy.rpython.lltypesystem import lltype, rstr
 
 
 class StrOrUnicode(object):
@@ -147,10 +143,11 @@
     def string_copy_parts(self, optimizer, targetbox, offsetbox, mode):
         for i in range(len(self._chars)):
             charbox = self._chars[i].force_box()
-            optimizer.emit_operation(ResOperation(mode.STRSETITEM, [targetbox,
-                                                                offsetbox,
-                                                                charbox],
-                                              None))
+            if not (isinstance(charbox, Const) and charbox.same_constant(CONST_0)):
+                optimizer.emit_operation(ResOperation(mode.STRSETITEM, [targetbox,
+                                                                    offsetbox,
+                                                                    charbox],
+                                                  None))
             offsetbox = _int_add(optimizer, offsetbox, CONST_1)
         return offsetbox
 
@@ -405,6 +402,7 @@
         else:
             self.getvalue(op.result).ensure_nonnull()
             self.emit_operation(op)
+            self.pure(mode.STRLEN, [op.result], op.getarg(0))
 
     def optimize_STRSETITEM(self, op):
         value = self.getvalue(op.getarg(0))
diff --git a/pypy/jit/metainterp/pyjitpl.py b/pypy/jit/metainterp/pyjitpl.py
--- a/pypy/jit/metainterp/pyjitpl.py
+++ b/pypy/jit/metainterp/pyjitpl.py
@@ -215,6 +215,7 @@
 
     for _opimpl in ['int_is_true', 'int_is_zero', 'int_neg', 'int_invert',
                     'cast_float_to_int', 'cast_int_to_float',
+                    'cast_float_to_singlefloat', 'cast_singlefloat_to_float',
                     'float_neg', 'float_abs',
                     ]:
         exec py.code.Source('''
@@ -232,6 +233,10 @@
         return self.execute(rop.PTR_EQ, box, history.CONST_NULL)
 
     @arguments("box")
+    def opimpl_cast_opaque_ptr(self, box):
+        return self.execute(rop.CAST_OPAQUE_PTR, box)
+
+    @arguments("box")
     def _opimpl_any_return(self, box):
         self.metainterp.finishframe(box)
 
@@ -1199,7 +1204,7 @@
         return self.metainterp.execute_and_record(opnum, descr, *argboxes)
 
     @specialize.arg(1)
-    def execute_varargs(self, opnum, argboxes, descr, exc):
+    def execute_varargs(self, opnum, argboxes, descr, exc, pure):
         self.metainterp.clear_exception()
         resbox = self.metainterp.execute_and_record_varargs(opnum, argboxes,
                                                             descr=descr)
@@ -1207,6 +1212,9 @@
             self.make_result_of_lastop(resbox)
             # ^^^ this is done before handle_possible_exception() because we
             # need the box to show up in get_list_of_active_boxes()
+        if pure and self.metainterp.last_exc_value_box is None:
+            resbox = self.metainterp.record_result_of_call_pure(resbox)
+            exc = exc and not isinstance(resbox, Const)
         if exc:
             self.metainterp.handle_possible_exception()
         else:
@@ -1224,7 +1232,7 @@
         src_i = src_r = src_f = 0
         i = 1
         for kind in descr.get_arg_types():
-            if kind == history.INT:
+            if kind == history.INT or kind == 'S':        # single float
                 while True:
                     box = argboxes[src_i]
                     src_i += 1
@@ -1269,16 +1277,14 @@
             return resbox
         else:
             effect = effectinfo.extraeffect
-            if effect == effectinfo.EF_CANNOT_RAISE:
-                return self.execute_varargs(rop.CALL, allboxes, descr, False)
-            elif effect == effectinfo.EF_ELIDABLE:
-                return self.metainterp.record_result_of_call_pure(
-                    self.execute_varargs(rop.CALL, allboxes, descr, False))
-            elif effect == effectinfo.EF_LOOPINVARIANT:
+            if effect == effectinfo.EF_LOOPINVARIANT:
                 return self.execute_varargs(rop.CALL_LOOPINVARIANT, allboxes,
-                                            descr, False)
-            else:
-                return self.execute_varargs(rop.CALL, allboxes, descr, True)
+                                            descr, False, False)
+            exc = (effect != effectinfo.EF_CANNOT_RAISE and
+                   effect != effectinfo.EF_ELIDABLE_CANNOT_RAISE)
+            pure = (effect == effectinfo.EF_ELIDABLE_CAN_RAISE or
+                    effect == effectinfo.EF_ELIDABLE_CANNOT_RAISE)
+            return self.execute_varargs(rop.CALL, allboxes, descr, exc, pure)
 
     def do_residual_or_indirect_call(self, funcbox, calldescr, argboxes):
         """The 'residual_call' operation is emitted in two cases:
@@ -1377,9 +1383,9 @@
             num = self.cpu.get_fail_descr_number(tokens[0].finishdescr)
             setattr(self.cpu, 'done_with_this_frame_%s_v' % name, num)
         #
-        tokens = self.loop_tokens_exit_frame_with_exception_ref
-        num = self.cpu.get_fail_descr_number(tokens[0].finishdescr)
-        self.cpu.exit_frame_with_exception_v = num
+        exc_descr = compile.PropagateExceptionDescr()
+        num = self.cpu.get_fail_descr_number(exc_descr)
+        self.cpu.propagate_exception_v = num
         #
         self.globaldata = MetaInterpGlobalData(self)
 
@@ -1686,8 +1692,12 @@
             return
         if opnum == rop.CALL:
             effectinfo = descr.get_extra_info()
-            if effectinfo.extraeffect == effectinfo.EF_ELIDABLE:
-                return
+            if effectinfo is not None:
+                ef = effectinfo.extraeffect
+                if ef == effectinfo.EF_LOOPINVARIANT or \
+                   ef == effectinfo.EF_ELIDABLE_CANNOT_RAISE or \
+                   ef == effectinfo.EF_ELIDABLE_CAN_RAISE:
+                    return
         if self.heap_cache:
             self.heap_cache.clear()
         if self.heap_array_cache:
@@ -2375,6 +2385,7 @@
                 tobox = newbox
             if change:
                 self.heap_cache[descr] = frombox, tobox
+        # XXX what about self.heap_array_cache?
 
     def find_biggest_function(self):
         start_stack = []
diff --git a/pypy/jit/metainterp/resoperation.py b/pypy/jit/metainterp/resoperation.py
--- a/pypy/jit/metainterp/resoperation.py
+++ b/pypy/jit/metainterp/resoperation.py
@@ -408,6 +408,8 @@
     'FLOAT_ABS/1',
     'CAST_FLOAT_TO_INT/1',
     'CAST_INT_TO_FLOAT/1',
+    'CAST_FLOAT_TO_SINGLEFLOAT/1',
+    'CAST_SINGLEFLOAT_TO_FLOAT/1',
     #
     'INT_LT/2b',
     'INT_LE/2b',
@@ -435,6 +437,7 @@
     #
     'PTR_EQ/2b',
     'PTR_NE/2b',
+    'CAST_OPAQUE_PTR/1b',
     #
     'ARRAYLEN_GC/1d',
     'STRLEN/1',
diff --git a/pypy/jit/metainterp/test/support.py b/pypy/jit/metainterp/test/support.py
--- a/pypy/jit/metainterp/test/support.py
+++ b/pypy/jit/metainterp/test/support.py
@@ -277,3 +277,15 @@
         NODE._add_fields({'value': ootype.Signed,
                           'next': NODE})
         return NODE
+
+# ____________________________________________________________
+
+class _Foo:
+    pass
+
+def noConst(x):
+    """Helper function for tests, returning 'x' as a BoxInt/BoxPtr
+    even if it is a ConstInt/ConstPtr."""
+    f1 = _Foo(); f2 = _Foo()
+    f1.x = x; f2.x = 0
+    return f1.x
diff --git a/pypy/jit/metainterp/test/test_ajit.py b/pypy/jit/metainterp/test/test_ajit.py
--- a/pypy/jit/metainterp/test/test_ajit.py
+++ b/pypy/jit/metainterp/test/test_ajit.py
@@ -14,7 +14,7 @@
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
 from pypy.rpython.ootypesystem import ootype
 from pypy.jit.metainterp.optimizeopt import ALL_OPTS_DICT
-from pypy.jit.metainterp.test.support import LLJitMixin, OOJitMixin
+from pypy.jit.metainterp.test.support import LLJitMixin, OOJitMixin, noConst
 
 class BasicTests:
 
@@ -441,6 +441,58 @@
         # the CALL_PURE is constant-folded away by optimizeopt.py
         self.check_loops(int_sub=1, call=0, call_pure=0, getfield_gc=0)
 
+    def test_elidable_raising(self):
+        myjitdriver = JitDriver(greens = ['m'], reds = ['n'])
+        @elidable
+        def externfn(x):
+            if x <= 0:
+                raise ValueError
+            return x - 1
+        def f(n, m):
+            while n > 0:
+                myjitdriver.can_enter_jit(n=n, m=m)
+                myjitdriver.jit_merge_point(n=n, m=m)
+                try:
+                    n -= externfn(m)
+                except ValueError:
+                    n -= 1
+            return n
+        res = self.meta_interp(f, [22, 6])
+        assert res == -3
+        # the CALL_PURE is constant-folded away during tracing
+        self.check_loops(int_sub=1, call=0, call_pure=0)
+        #
+        res = self.meta_interp(f, [22, -5])
+        assert res == 0
+        # raises: becomes CALL and is not constant-folded away
+        self.check_loops(int_sub=1, call=1, call_pure=0)
+
+    def test_elidable_raising_2(self):
+        myjitdriver = JitDriver(greens = ['m'], reds = ['n'])
+        @elidable
+        def externfn(x):
+            if x <= 0:
+                raise ValueError
+            return x - 1
+        def f(n, m):
+            while n > 0:
+                myjitdriver.can_enter_jit(n=n, m=m)
+                myjitdriver.jit_merge_point(n=n, m=m)
+                try:
+                    n -= externfn(noConst(m))
+                except ValueError:
+                    n -= 1
+            return n
+        res = self.meta_interp(f, [22, 6])
+        assert res == -3
+        # the CALL_PURE is constant-folded away by optimizeopt.py
+        self.check_loops(int_sub=1, call=0, call_pure=0)
+        #
+        res = self.meta_interp(f, [22, -5])
+        assert res == 0
+        # raises: becomes CALL and is not constant-folded away
+        self.check_loops(int_sub=1, call=1, call_pure=0)
+
     def test_constant_across_mp(self):
         myjitdriver = JitDriver(greens = [], reds = ['n'])
         class X(object):
diff --git a/pypy/jit/metainterp/test/test_dict.py b/pypy/jit/metainterp/test/test_dict.py
--- a/pypy/jit/metainterp/test/test_dict.py
+++ b/pypy/jit/metainterp/test/test_dict.py
@@ -157,7 +157,7 @@
         # the same arguments are not folded, because we have conflicting
         # definitions of pure, once strhash can be appropriately folded
         # this should be decreased to seven.
-        self.check_loops({"call": 8, "guard_false": 1, "guard_no_exception": 5,
+        self.check_loops({"call": 8, "guard_false": 1, "guard_no_exception": 6,
                           "guard_true": 1, "int_and": 1, "int_gt": 1,
                           "int_is_true": 1, "int_sub": 1, "jump": 1,
                           "new_with_vtable": 1, "setfield_gc": 1})
diff --git a/pypy/jit/metainterp/test/test_fficall.py b/pypy/jit/metainterp/test/test_fficall.py
--- a/pypy/jit/metainterp/test/test_fficall.py
+++ b/pypy/jit/metainterp/test/test_fficall.py
@@ -3,7 +3,7 @@
 from pypy.rlib.rarithmetic import r_singlefloat, r_longlong, r_ulonglong
 from pypy.rlib.jit import JitDriver, promote, dont_look_inside
 from pypy.rlib.unroll import unrolling_iterable
-from pypy.rlib.libffi import ArgChain, longlong2float, float2longlong
+from pypy.rlib.libffi import ArgChain
 from pypy.rlib.libffi import IS_32_BIT
 from pypy.rlib.test.test_libffi import TestLibffiCall as _TestLibffiCall
 from pypy.rpython.lltypesystem import lltype, rffi
@@ -12,10 +12,11 @@
 from pypy.jit.metainterp.test.support import LLJitMixin
 
 class TestFfiCall(LLJitMixin, _TestLibffiCall):
+    supports_all = False     # supports_{floats,longlong,singlefloats}
 
     # ===> ../../../rlib/test/test_libffi.py
 
-    def call(self, funcspec, args, RESULT, init_result=0, is_struct=False):
+    def call(self, funcspec, args, RESULT, is_struct=False, jitif=[]):
         """
         Call the function specified by funcspec in a loop, and let the jit to
         see and optimize it.
@@ -24,14 +25,7 @@
         lib, name, argtypes, restype = funcspec
         method_and_args = []
         for argval in args:
-            if type(argval) is r_singlefloat:
-                method_name = 'arg_singlefloat'
-                argval = float(argval)
-            elif IS_32_BIT and type(argval) in [r_longlong, r_ulonglong]:
-                method_name = 'arg_longlong'
-                argval = rffi.cast(rffi.LONGLONG, argval)
-                argval = longlong2float(argval)
-            elif isinstance(argval, tuple):
+            if isinstance(argval, tuple):
                 method_name, argval = argval
             else:
                 method_name = 'arg'
@@ -39,10 +33,20 @@
         method_and_args = unrolling_iterable(method_and_args)
         #
         reds = ['n', 'res', 'func']
-        if (RESULT in [rffi.FLOAT, rffi.DOUBLE] or
+        if (RESULT is rffi.DOUBLE or
             IS_32_BIT and RESULT in [rffi.LONGLONG, rffi.ULONGLONG]):
-            reds = ['n', 'func', 'res'] # floats must be *after* refs
+            reds = ['n', 'func', 'res'] # 'double' floats must be *after* refs
         driver = JitDriver(reds=reds, greens=[])
+        init_result = rffi.cast(RESULT, 0)
+        #
+        def g(func):
+            # a different function, which is marked as "dont_look_inside"
+            # in case it uses an unsupported argument
+            argchain = ArgChain()
+            # this loop is unrolled
+            for method_name, argval in method_and_args:
+                getattr(argchain, method_name)(argval)
+            return func.call(argchain, RESULT, is_struct=is_struct)
         #
         def f(n):
             func = lib.getpointer(name, argtypes, restype)
@@ -50,18 +54,44 @@
             while n < 10:
                 driver.jit_merge_point(n=n, res=res, func=func)
                 promote(func)
-                argchain = ArgChain()
-                # this loop is unrolled
-                for method_name, argval in method_and_args:
-                    getattr(argchain, method_name)(argval)
-                res = func.call(argchain, RESULT, is_struct=is_struct)
+                res = g(func)
                 n += 1
             return res
         #
-        res = self.meta_interp(f, [0], backendopt=True)
+        res = self.meta_interp(f, [0], backendopt=True,
+                               supports_floats       = self.supports_all,
+                               supports_longlong     = self.supports_all,
+                               supports_singlefloats = self.supports_all)
+        d = {'floats': self.supports_all,
+             'longlong': self.supports_all or not IS_32_BIT,
+             'singlefloats': self.supports_all,
+             'byval': False}
+        supported = all(d[check] for check in jitif)
+        if supported:
+            self.check_loops(
+                call_release_gil=1,   # a CALL_RELEASE_GIL, and no other CALLs
+                call=0,
+                call_may_force=0,
+                guard_no_exception=1,
+                guard_not_forced=1,
+                int_add=1,
+                int_lt=1,
+                guard_true=1,
+                jump=1)
+        else:
+            self.check_loops(
+                call_release_gil=0,   # no CALL_RELEASE_GIL
+                int_add=1,
+                int_lt=1,
+                guard_true=1,
+                jump=1)
         return res
 
     def test_byval_result(self):
         _TestLibffiCall.test_byval_result(self)
     test_byval_result.__doc__ = _TestLibffiCall.test_byval_result.__doc__
     test_byval_result.dont_track_allocations = True
+
+
+class TestFfiCallSupportAll(TestFfiCall):
+    supports_all = True     # supports_{floats,longlong,singlefloats}
diff --git a/pypy/jit/metainterp/test/test_float.py b/pypy/jit/metainterp/test/test_float.py
--- a/pypy/jit/metainterp/test/test_float.py
+++ b/pypy/jit/metainterp/test/test_float.py
@@ -36,6 +36,15 @@
         res = self.interp_operations(f, [x])
         assert res == -x
 
+    def test_singlefloat(self):
+        from pypy.rlib.rarithmetic import r_singlefloat
+        def f(a):
+            a = float(r_singlefloat(a))
+            a *= 4.25
+            return float(r_singlefloat(a))
+        res = self.interp_operations(f, [-2.0])
+        assert res == -8.5
+
 
 class TestOOtype(FloatTests, OOJitMixin):
     pass
diff --git a/pypy/jit/metainterp/test/test_loop.py b/pypy/jit/metainterp/test/test_loop.py
--- a/pypy/jit/metainterp/test/test_loop.py
+++ b/pypy/jit/metainterp/test/test_loop.py
@@ -800,6 +800,37 @@
 
         res = self.meta_interp(f, [200])        
 
+
+    def test_unerased_pointers_in_short_preamble(self):
+        from pypy.rlib.rerased import new_erasing_pair
+        from pypy.rpython.lltypesystem import lltype
+        class A(object):
+            def __init__(self, val):
+                self.val = val
+        erase_A, unerase_A = new_erasing_pair('A')
+        erase_TP, unerase_TP = new_erasing_pair('TP')
+        TP = lltype.GcArray(lltype.Signed)
+        myjitdriver = JitDriver(greens = [], reds = ['n', 'm', 'i', 'j', 'sa', 'p'])
+        def f(n, m, j):
+            i = sa = 0
+            p = erase_A(A(7))
+            while i < n:
+                myjitdriver.jit_merge_point(n=n, m=m, i=i, j=j, sa=sa, p=p)
+                if i < m:
+                    sa += unerase_A(p).val
+                elif i == m:
+                    a = lltype.malloc(TP, 5)
+                    a[0] = 42
+                    p = erase_TP(a)
+                else:
+                    sa += unerase_TP(p)[0]
+                sa += A(i).val
+                assert n>0 and m>0
+                i += j
+            return sa
+        res = self.meta_interp(f, [20, 10, 1])
+        assert res == f(20, 10, 1)
+
 class TestOOtype(LoopTest, OOJitMixin):
     pass
 
diff --git a/pypy/jit/metainterp/test/test_string.py b/pypy/jit/metainterp/test/test_string.py
--- a/pypy/jit/metainterp/test/test_string.py
+++ b/pypy/jit/metainterp/test/test_string.py
@@ -515,3 +515,22 @@
         self.check_loops(call=3,    # str(), _str(), escape()
                          newunicode=1, unicodegetitem=0,
                          unicodesetitem=1, copyunicodecontent=1)
+
+    def test_str2unicode_fold(self):
+        _str = self._str
+        jitdriver = JitDriver(greens = ['g'], reds = ['m'])
+        @dont_look_inside
+        def escape(x):
+            print str(x)
+        def f(g, m):
+            g = str(g)
+            while m >= 0:
+                jitdriver.can_enter_jit(g=g, m=m)
+                jitdriver.jit_merge_point(g=g, m=m)
+                escape(_str(g))
+                m -= 1
+            return 42
+        self.meta_interp(f, [6, 7])
+        self.check_loops(call_pure=0, call=1,
+                         newunicode=0, unicodegetitem=0,
+                         unicodesetitem=0, copyunicodecontent=0)
diff --git a/pypy/jit/metainterp/test/test_warmspot.py b/pypy/jit/metainterp/test/test_warmspot.py
--- a/pypy/jit/metainterp/test/test_warmspot.py
+++ b/pypy/jit/metainterp/test/test_warmspot.py
@@ -303,6 +303,7 @@
         class FakeCPU(object):
             supports_floats = False
             supports_longlong = False
+            supports_singlefloats = False
             ts = llhelper
             translate_support_code = False
             stats = "stats"
diff --git a/pypy/jit/metainterp/test/test_warmstate.py b/pypy/jit/metainterp/test/test_warmstate.py
--- a/pypy/jit/metainterp/test/test_warmstate.py
+++ b/pypy/jit/metainterp/test/test_warmstate.py
@@ -1,13 +1,14 @@
 from pypy.rpython.test.test_llinterp import interpret
-from pypy.rpython.lltypesystem import lltype, llmemory, rstr
+from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi
 from pypy.rpython.ootypesystem import ootype
 from pypy.rpython.annlowlevel import llhelper
-from pypy.jit.metainterp.warmstate import wrap, unwrap
+from pypy.jit.metainterp.warmstate import wrap, unwrap, specialize_value
 from pypy.jit.metainterp.warmstate import equal_whatever, hash_whatever
 from pypy.jit.metainterp.warmstate import WarmEnterState, JitCell
 from pypy.jit.metainterp.history import BoxInt, BoxFloat, BoxPtr
 from pypy.jit.metainterp.history import ConstInt, ConstFloat, ConstPtr
 from pypy.jit.codewriter import longlong
+from pypy.rlib.rarithmetic import r_singlefloat
 
 def boxfloat(x):
     return BoxFloat(longlong.getfloatstorage(x))
@@ -40,6 +41,28 @@
     assert _is(wrap(None, 42, in_const_box=True), ConstInt(42))
     assert _is(wrap(None, 42.5, in_const_box=True), constfloat(42.5))
     assert _is(wrap(None, p, in_const_box=True), ConstPtr(po))
+    if longlong.supports_longlong:
+        import sys
+        from pypy.rlib.rarithmetic import r_longlong, r_ulonglong
+        value = r_longlong(-sys.maxint*17)
+        assert _is(wrap(None, value), BoxFloat(value))
+        assert _is(wrap(None, value, in_const_box=True), ConstFloat(value))
+        value_unsigned = r_ulonglong(-sys.maxint*17)
+        assert _is(wrap(None, value_unsigned), BoxFloat(value))
+    sfval = r_singlefloat(42.5)
+    ival = longlong.singlefloat2int(sfval)
+    assert _is(wrap(None, sfval), BoxInt(ival))
+    assert _is(wrap(None, sfval, in_const_box=True), ConstInt(ival))
+
+def test_specialize_value():
+    assert specialize_value(lltype.Char, 0x41) == '\x41'
+    if longlong.supports_longlong:
+        import sys
+        value = longlong.r_float_storage(sys.maxint*17)
+        assert specialize_value(lltype.SignedLongLong, value) == sys.maxint*17
+    sfval = r_singlefloat(42.5)
+    ival = longlong.singlefloat2int(sfval)
+    assert specialize_value(rffi.FLOAT, ival) == sfval
 
 def test_hash_equal_whatever_lltype():
     s1 = rstr.mallocstr(2)
@@ -186,6 +209,7 @@
         _get_printable_location_ptr = None
         _confirm_enter_jit_ptr = None
         _can_never_inline_ptr = None
+        _should_unroll_one_iteration_ptr = None
     class FakeCell:
         dont_trace_here = False
     state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
@@ -214,6 +238,7 @@
         _confirm_enter_jit_ptr = None
         _can_never_inline_ptr = None
         _get_jitcell_at_ptr = None
+        _should_unroll_one_iteration_ptr = None
     state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
     state.make_jitdriver_callbacks()
     res = state.get_location_str([ConstInt(5), constfloat(42.5)])
@@ -238,6 +263,7 @@
         _confirm_enter_jit_ptr = llhelper(ENTER_JIT, confirm_enter_jit)
         _can_never_inline_ptr = None
         _get_jitcell_at_ptr = None
+        _should_unroll_one_iteration_ptr = None
 
     state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
     state.make_jitdriver_callbacks()
@@ -262,6 +288,7 @@
         _confirm_enter_jit_ptr = None
         _can_never_inline_ptr = llhelper(CAN_NEVER_INLINE, can_never_inline)
         _get_jitcell_at_ptr = None
+        _should_unroll_one_iteration_ptr = None
 
     state = WarmEnterState(FakeWarmRunnerDesc(), FakeJitDriverSD())
     state.make_jitdriver_callbacks()
diff --git a/pypy/jit/metainterp/warmspot.py b/pypy/jit/metainterp/warmspot.py
--- a/pypy/jit/metainterp/warmspot.py
+++ b/pypy/jit/metainterp/warmspot.py
@@ -173,6 +173,7 @@
             policy = JitPolicy()
         policy.set_supports_floats(self.cpu.supports_floats)
         policy.set_supports_longlong(self.cpu.supports_longlong)
+        policy.set_supports_singlefloats(self.cpu.supports_singlefloats)
         graphs = self.codewriter.find_all_graphs(policy)
         policy.dump_unsafe_loops()
         self.check_access_directly_sanity(graphs)
@@ -283,7 +284,9 @@
         auto_inline_graphs(self.translator, graphs, 0.01)
 
     def build_cpu(self, CPUClass, translate_support_code=False,
-                  no_stats=False, **kwds):
+                  no_stats=False, supports_floats=True,
+                  supports_longlong=True, supports_singlefloats=True,
+                  **kwds):
         assert CPUClass is not None
         self.opt = history.Options(**kwds)
         if no_stats:
@@ -295,6 +298,9 @@
             self.annhelper = MixLevelHelperAnnotator(self.translator.rtyper)
         cpu = CPUClass(self.translator.rtyper, self.stats, self.opt,
                        translate_support_code, gcdescr=self.gcdescr)
+        if not supports_floats:       cpu.supports_floats       = False
+        if not supports_longlong:     cpu.supports_longlong     = False
+        if not supports_singlefloats: cpu.supports_singlefloats = False
         self.cpu = cpu
 
     def build_meta_interp(self, ProfilerClass):
@@ -409,6 +415,7 @@
         jd.warmstate = state
 
         def crash_in_jit(e):
+            tb = not we_are_translated() and sys.exc_info()[2]
             try:
                 raise e
             except JitException:
@@ -422,8 +429,8 @@
                     print "~~~ Crash in JIT!"
                     print '~~~ %s: %s' % (e.__class__, e)
                     if sys.stdout == sys.__stdout__:
-                        import pdb; pdb.post_mortem(sys.exc_info()[2])
-                    raise
+                        import pdb; pdb.post_mortem(tb)
+                    raise e.__class__, e, tb
                 fatalerror('~~~ Crash in JIT! %s' % (e,), traceback=True)
         crash_in_jit._dont_inline_ = True
 
diff --git a/pypy/jit/metainterp/warmstate.py b/pypy/jit/metainterp/warmstate.py
--- a/pypy/jit/metainterp/warmstate.py
+++ b/pypy/jit/metainterp/warmstate.py
@@ -25,9 +25,13 @@
         if isinstance(TYPE, lltype.Ptr) and TYPE.TO._gckind == 'raw':
             # non-gc pointer
             return rffi.cast(TYPE, x)
+        elif TYPE is lltype.SingleFloat:
+            return longlong.int2singlefloat(x)
         else:
             return lltype.cast_primitive(TYPE, x)
     elif INPUT is longlong.FLOATSTORAGE:
+        if longlong.is_longlong(TYPE):
+            return rffi.cast(TYPE, x)
         assert TYPE is lltype.Float
         return longlong.getrealfloat(x)
     else:
@@ -84,8 +88,12 @@
             return history.ConstObj(value)
         else:
             return history.BoxObj(value)
-    elif isinstance(value, float):
-        value = longlong.getfloatstorage(value)
+    elif (isinstance(value, float) or
+          longlong.is_longlong(lltype.typeOf(value))):
+        if isinstance(value, float):
+            value = longlong.getfloatstorage(value)
+        else:
+            value = rffi.cast(lltype.SignedLongLong, value)
         if in_const_box:
             return history.ConstFloat(value)
         else:
@@ -93,6 +101,8 @@
     elif isinstance(value, str) or isinstance(value, unicode):
         assert len(value) == 1     # must be a character
         value = ord(value)
+    elif lltype.typeOf(value) is lltype.SingleFloat:
+        value = longlong.singlefloat2int(value)
     else:
         value = intmask(value)
     if in_const_box:
diff --git a/pypy/jit/tool/oparser.py b/pypy/jit/tool/oparser.py
--- a/pypy/jit/tool/oparser.py
+++ b/pypy/jit/tool/oparser.py
@@ -53,7 +53,7 @@
 class OpParser(object):
 
     use_mock_model = False
-    
+
     def __init__(self, input, cpu, namespace, type_system, boxkinds,
                  invent_fail_descr=default_fail_descr,
                  nonstrict=False):
@@ -187,7 +187,7 @@
             poss_descr = allargs[-1].strip()
             if poss_descr.startswith('descr='):
                 descr = self.get_descr(poss_descr[len('descr='):])
-                allargs = allargs[:-1]        
+                allargs = allargs[:-1]
             for arg in allargs:
                 arg = arg.strip()
                 try:
@@ -240,7 +240,7 @@
             fail_args = None
             if opnum == rop.FINISH:
                 if descr is None and self.invent_fail_descr:
-                    descr = self.invent_fail_descr(self.model)
+                    descr = self.invent_fail_descr(self.model, fail_args)
             elif opnum == rop.JUMP:
                 if descr is None and self.invent_fail_descr:
                     descr = self.looptoken
diff --git a/pypy/module/_ffi/interp_ffi.py b/pypy/module/_ffi/interp_ffi.py
--- a/pypy/module/_ffi/interp_ffi.py
+++ b/pypy/module/_ffi/interp_ffi.py
@@ -11,6 +11,7 @@
 from pypy.rlib import libffi
 from pypy.rlib.rdynload import DLOpenError
 from pypy.rlib.rarithmetic import intmask, r_uint
+from pypy.rlib.objectmodel import we_are_translated
 
 class W_FFIType(Wrappable):
 
@@ -74,6 +75,13 @@
     def is_struct(self):
         return libffi.types.is_struct(self.ffitype)
 
+    def is_char_p(self):
+        return self is app_types.char_p
+
+    def is_unichar_p(self):
+        return self is app_types.unichar_p
+
+
 W_FFIType.typedef = TypeDef(
     'FFIType',
     __repr__ = interp2app(W_FFIType.repr),
@@ -115,7 +123,12 @@
         ## 'Z' : ffi_type_pointer,
 
         ]
-    return dict([(t.name, t) for t in types])
+    d = dict([(t.name, t) for t in types])
+    w_char = d['char']
+    w_unichar = d['unichar']
+    d['char_p'] = W_FFIType('char_p', libffi.types.pointer, w_pointer_to = w_char)
+    d['unichar_p'] = W_FFIType('unichar_p', libffi.types.pointer, w_pointer_to = w_unichar)
+    return d
 
 class app_types:
     pass
@@ -125,9 +138,14 @@
     try:
         return descr_new_pointer.cache[w_pointer_to]
     except KeyError:
-        w_pointer_to = space.interp_w(W_FFIType, w_pointer_to)
-        name = '(pointer to %s)' % w_pointer_to.name
-        w_result = W_FFIType(name, libffi.types.pointer, w_pointer_to = w_pointer_to)
+        if w_pointer_to is app_types.char:
+            w_result = app_types.char_p
+        elif w_pointer_to is app_types.unichar:
+            w_result = app_types.unichar_p
+        else:
+            w_pointer_to = space.interp_w(W_FFIType, w_pointer_to)
+            name = '(pointer to %s)' % w_pointer_to.name
+            w_result = W_FFIType(name, libffi.types.pointer, w_pointer_to = w_pointer_to)
         descr_new_pointer.cache[w_pointer_to] = w_result
         return w_result
 descr_new_pointer.cache = {}
@@ -164,6 +182,7 @@
         self.func = func
         self.argtypes_w = argtypes_w
         self.w_restype = w_restype
+        self.to_free = []
 
     @jit.unroll_safe
     def build_argchain(self, space, args_w):
@@ -188,6 +207,9 @@
                 self.arg_longlong(space, argchain, w_arg)
             elif w_argtype.is_signed():
                 argchain.arg(unwrap_truncate_int(rffi.LONG, space, w_arg))
+            elif self.add_char_p_maybe(space, argchain, w_arg, w_argtype):
+                # the argument is added to the argchain directly by the method above
+                pass
             elif w_argtype.is_pointer():
                 w_arg = self.convert_pointer_arg_maybe(space, w_arg, w_argtype)
                 argchain.arg(intmask(space.uint_w(w_arg)))
@@ -200,9 +222,9 @@
                 w_arg = space.ord(w_arg)
                 argchain.arg(space.int_w(w_arg))
             elif w_argtype.is_double():
-                argchain.arg(space.float_w(w_arg))
+                self.arg_float(space, argchain, w_arg)
             elif w_argtype.is_singlefloat():
-                argchain.arg_singlefloat(space.float_w(w_arg))
+                self.arg_singlefloat(space, argchain, w_arg)
             elif w_argtype.is_struct():
                 # arg_raw directly takes value to put inside ll_args
                 w_arg = space.interp_w(W_StructureInstance, w_arg)
@@ -212,6 +234,29 @@
                 assert False, "Argument shape '%s' not supported" % w_argtype
         return argchain
 
+    def add_char_p_maybe(self, space, argchain, w_arg, w_argtype):
+        """
+        Automatic conversion from string to char_p. The allocated buffer will
+        be automatically freed after the call.
+        """
+        w_type = jit.promote(space.type(w_arg))
+        if w_argtype.is_char_p() and w_type is space.w_str:
+            strval = space.str_w(w_arg)
+            buf = rffi.str2charp(strval)
+            self.to_free.append(rffi.cast(rffi.VOIDP, buf))
+            addr = rffi.cast(rffi.ULONG, buf)
+            argchain.arg(addr)
+            return True
+        elif w_argtype.is_unichar_p() and (w_type is space.w_str or
+                                           w_type is space.w_unicode):
+            unicodeval = space.unicode_w(w_arg)
+            buf = rffi.unicode2wcharp(unicodeval)
+            self.to_free.append(rffi.cast(rffi.VOIDP, buf))
+            addr = rffi.cast(rffi.ULONG, buf)
+            argchain.arg(addr)
+            return True
+        return False
+
     def convert_pointer_arg_maybe(self, space, w_arg, w_argtype):
         """
         Try to convert the argument by calling _as_ffi_pointer_()
@@ -222,26 +267,47 @@
         else:
             return w_arg
 
-    @jit.dont_look_inside
+    def arg_float(self, space, argchain, w_arg):
+        # a separate function, which can be seen by the jit or not,
+        # depending on whether floats are supported
+        argchain.arg(space.float_w(w_arg))
+
     def arg_longlong(self, space, argchain, w_arg):
+        # a separate function, which can be seen by the jit or not,
+        # depending on whether longlongs are supported
         bigarg = space.bigint_w(w_arg)
         ullval = bigarg.ulonglongmask()
         llval = rffi.cast(rffi.LONGLONG, ullval)
-        # this is a hack: we store the 64 bits of the long long into the
-        # 64 bits of a float (i.e., a C double)
-        floatval = libffi.longlong2float(llval)
-        argchain.arg_longlong(floatval)
+        argchain.arg(llval)
+
+    def arg_singlefloat(self, space, argchain, w_arg):
+        # a separate function, which can be seen by the jit or not,
+        # depending on whether singlefloats are supported
+        from pypy.rlib.rarithmetic import r_singlefloat
+        fval = space.float_w(w_arg)
+        sfval = r_singlefloat(fval)
+        argchain.arg(sfval)
 
     def call(self, space, args_w):
         self = jit.promote(self)
         argchain = self.build_argchain(space, args_w)
+        return self._do_call(space, argchain)
+
+    def free_temp_buffers(self, space):
+        for buf in self.to_free:
+            if not we_are_translated():
+                buf[0] = '\00' # invalidate the buffer, so that
+                               # test_keepalive_temp_buffer can fail
+            lltype.free(buf, flavor='raw')
+        self.to_free = []
+
+    def _do_call(self, space, argchain):
         w_restype = self.w_restype
         if w_restype.is_longlong():
             # note that we must check for longlong first, because either
             # is_signed or is_unsigned returns true anyway
             assert libffi.IS_32_BIT
-            reskind = libffi.types.getkind(self.func.restype) # XXX: remove the kind
-            return self._call_longlong(space, argchain, reskind)
+            return self._call_longlong(space, argchain)
         elif w_restype.is_signed():
             return self._call_int(space, argchain)
         elif w_restype.is_unsigned() or w_restype.is_pointer():
@@ -253,12 +319,9 @@
             intres = self.func.call(argchain, rffi.WCHAR_T)
             return space.wrap(unichr(intres))
         elif w_restype.is_double():
-            floatres = self.func.call(argchain, rffi.DOUBLE)
-            return space.wrap(floatres)
+            return self._call_float(space, argchain)
         elif w_restype.is_singlefloat():
-            # the result is a float, but widened to be inside a double
-            floatres = self.func.call(argchain, rffi.FLOAT)
-            return space.wrap(floatres)
+            return self._call_singlefloat(space, argchain)
         elif w_restype.is_struct():
             w_datashape = w_restype.w_datashape
             assert isinstance(w_datashape, W_Structure)
@@ -327,19 +390,32 @@
                                  space.wrap('Unsupported restype'))
         return space.wrap(intres)
 
-    @jit.dont_look_inside
-    def _call_longlong(self, space, argchain, reskind):
-        # this is a hack: we store the 64 bits of the long long into the 64
-        # bits of a float (i.e., a C double)
-        floatres = self.func.call(argchain, rffi.LONGLONG)
-        llres = libffi.float2longlong(floatres)
-        if reskind == 'I':
+    def _call_float(self, space, argchain):
+        # a separate function, which can be seen by the jit or not,
+        # depending on whether floats are supported
+        floatres = self.func.call(argchain, rffi.DOUBLE)
+        return space.wrap(floatres)
+
+    def _call_longlong(self, space, argchain):
+        # a separate function, which can be seen by the jit or not,
+        # depending on whether longlongs are supported
+        restype = self.func.restype
+        call = self.func.call
+        if restype is libffi.types.slonglong:
+            llres = call(argchain, rffi.LONGLONG)
             return space.wrap(llres)
-        elif reskind == 'U':
-            ullres = rffi.cast(rffi.ULONGLONG, llres)
+        elif restype is libffi.types.ulonglong:
+            ullres = call(argchain, rffi.ULONGLONG)
             return space.wrap(ullres)
         else:
-            assert False
+            raise OperationError(space.w_ValueError,
+                                 space.wrap('Unsupported longlong restype'))
+
+    def _call_singlefloat(self, space, argchain):
+        # a separate function, which can be seen by the jit or not,
+        # depending on whether singlefloats are supported
+        sfres = self.func.call(argchain, rffi.FLOAT)
+        return space.wrap(float(sfres))
 
     def getaddr(self, space):
         """
@@ -372,6 +448,7 @@
     '_ffi.FuncPtr',
     __call__ = interp2app(W_FuncPtr.call),
     getaddr = interp2app(W_FuncPtr.getaddr),
+    free_temp_buffers = interp2app(W_FuncPtr.free_temp_buffers),
     fromaddr = interp2app(descr_fromaddr, as_classmethod=True)
     )
 
diff --git a/pypy/module/_ffi/test/test__ffi.py b/pypy/module/_ffi/test/test__ffi.py
--- a/pypy/module/_ffi/test/test__ffi.py
+++ b/pypy/module/_ffi/test/test__ffi.py
@@ -188,6 +188,75 @@
         assert get_dummy() == 123
         set_val_to_ptr(ptr2, 0)
 
+    def test_convert_strings_to_char_p(self):
+        """
+            long mystrlen(char* s)
+            {
+                long len = 0;
+                while(*s++)
+                    len++;
+                return len;
+            }
+        """
+        from _ffi import CDLL, types
+        import _rawffi
+        libfoo = CDLL(self.libfoo_name)
+        mystrlen = libfoo.getfunc('mystrlen', [types.char_p], types.slong)
+        #
+        # first, try automatic conversion from a string
+        assert mystrlen('foobar') == 6
+        # then, try to pass an explicit pointer
+        CharArray = _rawffi.Array('c')
+        mystr = CharArray(7, 'foobar')
+        assert mystrlen(mystr.buffer) == 6
+        mystr.free()
+        mystrlen.free_temp_buffers()
+
+    def test_convert_unicode_to_unichar_p(self):
+        """
+            #include <wchar.h>
+            long mystrlen_u(wchar_t* s)
+            {
+                long len = 0;
+                while(*s++)
+                    len++;
+                return len;
+            }
+        """
+        from _ffi import CDLL, types
+        import _rawffi
+        libfoo = CDLL(self.libfoo_name)
+        mystrlen = libfoo.getfunc('mystrlen_u', [types.unichar_p], types.slong)
+        #
+        # first, try automatic conversion from strings and unicode
+        assert mystrlen('foobar') == 6
+        assert mystrlen(u'foobar') == 6
+        assert mystrlen(u'ab\u2070') == 3
+        # then, try to pass an explicit pointer
+        UniCharArray = _rawffi.Array('u')
+        mystr = UniCharArray(7, u'foobar')
+        assert mystrlen(mystr.buffer) == 6
+        mystr.free()
+        mystrlen.free_temp_buffers()
+
+    def test_keepalive_temp_buffer(self):
+        """
+            char* do_nothing(char* s)
+            {
+                return s;
+            }
+        """
+        from _ffi import CDLL, types
+        import _rawffi
+        libfoo = CDLL(self.libfoo_name)
+        do_nothing = libfoo.getfunc('do_nothing', [types.char_p], types.char_p)
+        CharArray = _rawffi.Array('c')
+        #
+        ptr = do_nothing('foobar')
+        array = CharArray.fromaddress(ptr, 7)
+        assert list(array) == list('foobar\00')
+        do_nothing.free_temp_buffers()
+
     def test_typed_pointer(self):
         from _ffi import types
         intptr = types.Pointer(types.sint) # create a typed pointer to sint
@@ -204,6 +273,13 @@
         assert x is y
         assert x is not z
 
+    def test_char_p_cached(self):
+        from _ffi import types
+        x = types.Pointer(types.char)
+        assert x is types.char_p
+        x = types.Pointer(types.unichar)
+        assert x is types.unichar_p
+
     def test_typed_pointer_args(self):
         """
             extern int dummy; // defined in test_void_result 
diff --git a/pypy/module/_multibytecodec/__init__.py b/pypy/module/_multibytecodec/__init__.py
--- a/pypy/module/_multibytecodec/__init__.py
+++ b/pypy/module/_multibytecodec/__init__.py
@@ -7,13 +7,14 @@
         # for compatibility this name is obscured, and should be called
         # via the _codecs_*.py modules written in lib_pypy.
         '__getcodec': 'interp_multibytecodec.getcodec',
+
+        'MultibyteIncrementalDecoder':
+            'interp_incremental.MultibyteIncrementalDecoder',
+        'MultibyteIncrementalEncoder':
+            'interp_incremental.MultibyteIncrementalEncoder',
     }
 
     appleveldefs = {
-        'MultibyteIncrementalEncoder':
-            'app_multibytecodec.MultibyteIncrementalEncoder',
-        'MultibyteIncrementalDecoder':
-            'app_multibytecodec.MultibyteIncrementalDecoder',
         'MultibyteStreamReader':
             'app_multibytecodec.MultibyteStreamReader',
         'MultibyteStreamWriter':
diff --git a/pypy/module/_multibytecodec/app_multibytecodec.py b/pypy/module/_multibytecodec/app_multibytecodec.py
--- a/pypy/module/_multibytecodec/app_multibytecodec.py
+++ b/pypy/module/_multibytecodec/app_multibytecodec.py
@@ -1,34 +1,47 @@
 # NOT_RPYTHON
 #
-# These classes are not supported so far.
-#
-# My theory is that they are not widely used on CPython either, because
-# I found two bugs just by looking at their .c source: they always call
-# encreset() after a piece of data, even though I think it's wrong ---
-# it should be called only once at the end; and mbiencoder_reset() calls
-# decreset() instead of encreset().
-#
+# The interface here may be a little bit on the lightweight side.
 
-class MultibyteIncrementalEncoder(object):
-    def __init__(self, *args, **kwds):
-        raise LookupError(
-            "MultibyteIncrementalEncoder not implemented; "
-            "see pypy/module/_multibytecodec/app_multibytecodec.py")
+from _multibytecodec import MultibyteIncrementalDecoder
+from _multibytecodec import MultibyteIncrementalEncoder
 
-class MultibyteIncrementalDecoder(object):
-    def __init__(self, *args, **kwds):
-        raise LookupError(
-            "MultibyteIncrementalDecoder not implemented; "
-            "see pypy/module/_multibytecodec/app_multibytecodec.py")
 
-class MultibyteStreamReader(object):
-    def __init__(self, *args, **kwds):
-        raise LookupError(
-            "MultibyteStreamReader not implemented; "
-            "see pypy/module/_multibytecodec/app_multibytecodec.py")
+class MultibyteStreamReader(MultibyteIncrementalDecoder):
+    def __new__(cls, stream, errors=None):
+        self = MultibyteIncrementalDecoder.__new__(cls, errors)
+        self.stream = stream
+        return self
 
-class MultibyteStreamWriter(object):
-    def __init__(self, *args, **kwds):
-        raise LookupError(
-            "MultibyteStreamWriter not implemented; "
-            "see pypy/module/_multibytecodec/app_multibytecodec.py")
+    def __read(self, read, size):
+        if size is None or size < 0:
+            return MultibyteIncrementalDecoder.decode(self, read(), True)
+        while True:
+            data = read(size)
+            final = not data
+            output = MultibyteIncrementalDecoder.decode(self, data, final)
+            if output or final:
+                return output
+            size = 1   # read 1 more byte and retry
+
+    def read(self, size=None):
+        return self.__read(self.stream.read, size)
+
+    def readline(self, size=None):
+        return self.__read(self.stream.readline, size)
+
+    def readlines(self, sizehint=None):
+        return self.__read(self.stream.read, sizehint).splitlines(True)
+
+
+class MultibyteStreamWriter(MultibyteIncrementalEncoder):
+    def __new__(cls, stream, errors=None):
+        self = MultibyteIncrementalEncoder.__new__(cls, errors)
+        self.stream = stream
+        return self
+
+    def write(self, data):
+        self.stream.write(MultibyteIncrementalEncoder.encode(self, data))
+
+    def writelines(self, lines):
+        for data in lines:
+            self.write(data)
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -52,11 +52,13 @@
     includes = ['src/cjkcodecs/multibytecodec.h'],
     include_dirs = [str(srcdir)],
     export_symbols = [
+        "pypy_cjk_dec_new",
         "pypy_cjk_dec_init", "pypy_cjk_dec_free", "pypy_cjk_dec_chunk",
         "pypy_cjk_dec_outbuf", "pypy_cjk_dec_outlen",
         "pypy_cjk_dec_inbuf_remaining", "pypy_cjk_dec_inbuf_consumed",
         "pypy_cjk_dec_replace_on_error",
 
+        "pypy_cjk_enc_new",
         "pypy_cjk_enc_init", "pypy_cjk_enc_free", "pypy_cjk_enc_chunk",
         "pypy_cjk_enc_reset", "pypy_cjk_enc_outbuf", "pypy_cjk_enc_outlen",
         "pypy_cjk_enc_inbuf_remaining", "pypy_cjk_enc_inbuf_consumed",
@@ -92,9 +94,11 @@
 # Decoding
 
 DECODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_dec_s', compilation_info=eci)
+pypy_cjk_dec_new = llexternal('pypy_cjk_dec_new',
+                              [MULTIBYTECODEC_P], DECODEBUF_P)
 pypy_cjk_dec_init = llexternal('pypy_cjk_dec_init',
-                               [MULTIBYTECODEC_P, rffi.CCHARP, rffi.SSIZE_T],
-                               DECODEBUF_P)
+                               [DECODEBUF_P, rffi.CCHARP, rffi.SSIZE_T],
+                               rffi.SSIZE_T)
 pypy_cjk_dec_free = llexternal('pypy_cjk_dec_free', [DECODEBUF_P],
                                lltype.Void)
 pypy_cjk_dec_chunk = llexternal('pypy_cjk_dec_chunk', [DECODEBUF_P],
@@ -113,25 +117,30 @@
                                            rffi.SSIZE_T)
 
 def decode(codec, stringdata, errors="strict", errorcb=None, namecb=None):
+    decodebuf = pypy_cjk_dec_new(codec)
+    if not decodebuf:
+        raise MemoryError
+    try:
+        return decodeex(decodebuf, stringdata, errors, errorcb, namecb)
+    finally:
+        pypy_cjk_dec_free(decodebuf)
+
+def decodeex(decodebuf, stringdata, errors="strict", errorcb=None, namecb=None,
+             ignore_error=0):
     inleft = len(stringdata)
     inbuf = rffi.get_nonmovingbuffer(stringdata)
     try:
-        decodebuf = pypy_cjk_dec_init(codec, inbuf, inleft)
-        if not decodebuf:
+        if pypy_cjk_dec_init(decodebuf, inbuf, inleft) < 0:
             raise MemoryError
-        try:
-            while True:
-                r = pypy_cjk_dec_chunk(decodebuf)
-                if r == 0:
-                    break
-                multibytecodec_decerror(decodebuf, r, errors,
-                                        errorcb, namecb, stringdata)
-            src = pypy_cjk_dec_outbuf(decodebuf)
-            length = pypy_cjk_dec_outlen(decodebuf)
-            return rffi.wcharpsize2unicode(src, length)
-        #
-        finally:
-            pypy_cjk_dec_free(decodebuf)
+        while True:
+            r = pypy_cjk_dec_chunk(decodebuf)
+            if r == 0 or r == ignore_error:
+                break
+            multibytecodec_decerror(decodebuf, r, errors,
+                                    errorcb, namecb, stringdata)
+        src = pypy_cjk_dec_outbuf(decodebuf)
+        length = pypy_cjk_dec_outlen(decodebuf)
+        return rffi.wcharpsize2unicode(src, length)
     #
     finally:
         rffi.free_nonmovingbuffer(stringdata, inbuf)
@@ -174,13 +183,15 @@
 # ____________________________________________________________
 # Encoding
 ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci)
+pypy_cjk_enc_new = llexternal('pypy_cjk_enc_new',
+                               [MULTIBYTECODEC_P], ENCODEBUF_P)
 pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init',
-                               [MULTIBYTECODEC_P, rffi.CWCHARP, rffi.SSIZE_T],
-                               ENCODEBUF_P)
+                               [ENCODEBUF_P, rffi.CWCHARP, rffi.SSIZE_T],
+                               rffi.SSIZE_T)
 pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P],
                                lltype.Void)
-pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk', [ENCODEBUF_P],
-                                rffi.SSIZE_T)
+pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk',
+                                [ENCODEBUF_P, rffi.SSIZE_T], rffi.SSIZE_T)
 pypy_cjk_enc_reset = llexternal('pypy_cjk_enc_reset', [ENCODEBUF_P],
                                 rffi.SSIZE_T)
 pypy_cjk_enc_outbuf = llexternal('pypy_cjk_enc_outbuf', [ENCODEBUF_P],
@@ -195,39 +206,52 @@
                                            [ENCODEBUF_P, rffi.CCHARP,
                                             rffi.SSIZE_T, rffi.SSIZE_T],
                                            rffi.SSIZE_T)
+pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
+                                   [ENCODEBUF_P], MULTIBYTECODEC_P)
+MBENC_FLUSH = 1
+MBENC_RESET = 2
 
 def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+    encodebuf = pypy_cjk_enc_new(codec)
+    if not encodebuf:
+        raise MemoryError
+    try:
+        return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+    finally:
+        pypy_cjk_enc_free(encodebuf)
+
+def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+             namecb=None, ignore_error=0):
     inleft = len(unicodedata)
     inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
     try:
-        encodebuf = pypy_cjk_enc_init(codec, inbuf, inleft)
-        if not encodebuf:
+        if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
             raise MemoryError
-        try:
-            while True:
-                r = pypy_cjk_enc_chunk(encodebuf)
-                if r == 0:
-                    break
-                multibytecodec_encerror(encodebuf, r, errors,
-                                        codec, errorcb, namecb, unicodedata)
-            while True:
-                r = pypy_cjk_enc_reset(encodebuf)
-                if r == 0:
-                    break
-                multibytecodec_encerror(encodebuf, r, errors,
-                                        codec, errorcb, namecb, unicodedata)
-            src = pypy_cjk_enc_outbuf(encodebuf)
-            length = pypy_cjk_enc_outlen(encodebuf)
-            return rffi.charpsize2str(src, length)
-        #
-        finally:
-            pypy_cjk_enc_free(encodebuf)
+        if ignore_error == 0:
+            flags = MBENC_FLUSH | MBENC_RESET
+        else:
+            flags = MBENC_RESET
+        while True:
+            r = pypy_cjk_enc_chunk(encodebuf, flags)
+            if r == 0 or r == ignore_error:
+                break
+            multibytecodec_encerror(encodebuf, r, errors,
+                                    errorcb, namecb, unicodedata)
+        while True:
+            r = pypy_cjk_enc_reset(encodebuf)
+            if r == 0:
+                break
+            multibytecodec_encerror(encodebuf, r, errors,
+                                    errorcb, namecb, unicodedata)
+        src = pypy_cjk_enc_outbuf(encodebuf)
+        length = pypy_cjk_enc_outlen(encodebuf)
+        return rffi.charpsize2str(src, length)
     #
     finally:
         rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
 
 def multibytecodec_encerror(encodebuf, e, errors,
-                            codec, errorcb, namecb, unicodedata):
+                            errorcb, namecb, unicodedata):
     if e > 0:
         reason = "illegal multibyte sequence"
         esize = e
@@ -248,6 +272,7 @@
     elif errors == "ignore":
         replace = ""
     elif errors == "replace":
+        codec = pypy_cjk_enc_getcodec(encodebuf)
         try:
             replace = encode(codec, u"?")
         except EncodeDecodeError:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -0,0 +1,141 @@
+from pypy.rpython.lltypesystem import lltype
+from pypy.module._multibytecodec import c_codecs
+from pypy.module._multibytecodec.interp_multibytecodec import (
+    MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
+    wrap_unicodeencodeerror)
+from pypy.interpreter.baseobjspace import Wrappable
+from pypy.interpreter.gateway import interp2app, unwrap_spec
+from pypy.interpreter.typedef import TypeDef, GetSetProperty
+from pypy.module._codecs.interp_codecs import CodecState
+
+
+class MultibyteIncrementalBase(Wrappable):
+
+    def __init__(self, space, errors):
+        if errors is None:
+            errors = 'strict'
+        self.space = space
+        self.errors = errors
+        w_codec = space.getattr(space.wrap(self), space.wrap("codec"))
+        codec = space.interp_w(MultibyteCodec, w_codec)
+        self.codec = codec.codec
+        self.name = codec.name
+        self._initialize()
+
+    def __del__(self):
+        self._free()
+
+    def reset_w(self):
+        self._free()
+        self._initialize()
+
+    def fget_errors(self, space):
+        return space.wrap(self.errors)
+
+    def fset_errors(self, space, w_errors):
+        self.errors = space.str_w(w_errors)
+
+
+class MultibyteIncrementalDecoder(MultibyteIncrementalBase):
+
+    def _initialize(self):
+        self.decodebuf = c_codecs.pypy_cjk_dec_new(self.codec)
+        self.pending = ""
+
+    def _free(self):
+        self.pending = None
+        if self.decodebuf:
+            c_codecs.pypy_cjk_dec_free(self.decodebuf)
+            self.decodebuf = lltype.nullptr(c_codecs.DECODEBUF_P.TO)
+
+    @unwrap_spec(object=str, final=bool)
+    def decode_w(self, object, final=False):
+        space = self.space
+        state = space.fromcache(CodecState)
+        if len(self.pending) > 0:
+            object = self.pending + object
+        try:
+            output = c_codecs.decodeex(self.decodebuf, object, self.errors,
+                                       state.decode_error_handler, self.name,
+                                       get_ignore_error(final))
+        except c_codecs.EncodeDecodeError, e:
+            raise wrap_unicodedecodeerror(space, e, object, self.name)
+        except RuntimeError:
+            raise wrap_runtimeerror(space)
+        pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
+        assert 0 <= pos <= len(object)
+        self.pending = object[pos:]
+        return space.wrap(output)
+
+
+@unwrap_spec(errors="str_or_None")
+def mbidecoder_new(space, w_subtype, errors=None):
+    r = space.allocate_instance(MultibyteIncrementalDecoder, w_subtype)
+    r.__init__(space, errors)
+    return space.wrap(r)
+
+MultibyteIncrementalDecoder.typedef = TypeDef(
+    'MultibyteIncrementalDecoder',
+    __module__ = '_multibytecodec',
+    __new__ = interp2app(mbidecoder_new),
+    decode  = interp2app(MultibyteIncrementalDecoder.decode_w),
+    reset   = interp2app(MultibyteIncrementalDecoder.reset_w),
+    errors  = GetSetProperty(MultibyteIncrementalDecoder.fget_errors,
+                             MultibyteIncrementalDecoder.fset_errors),
+    )
+
+
+class MultibyteIncrementalEncoder(MultibyteIncrementalBase):
+
+    def _initialize(self):
+        self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
+        self.pending = u""
+
+    def _free(self):
+        self.pending = None
+        if self.encodebuf:
+            c_codecs.pypy_cjk_enc_free(self.encodebuf)
+            self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
+
+    @unwrap_spec(object=unicode, final=bool)
+    def encode_w(self, object, final=False):
+        space = self.space
+        state = space.fromcache(CodecState)
+        if len(self.pending) > 0:
+            object = self.pending + object
+        try:
+            output = c_codecs.encodeex(self.encodebuf, object, self.errors,
+                                       state.encode_error_handler, self.name,
+                                       get_ignore_error(final))
+        except c_codecs.EncodeDecodeError, e:
+            raise wrap_unicodeencodeerror(space, e, object, self.name)
+        except RuntimeError:
+            raise wrap_runtimeerror(space)
+        pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
+        assert 0 <= pos <= len(object)
+        self.pending = object[pos:]
+        return space.wrap(output)
+
+
+@unwrap_spec(errors="str_or_None")
+def mbiencoder_new(space, w_subtype, errors=None):
+    r = space.allocate_instance(MultibyteIncrementalEncoder, w_subtype)
+    r.__init__(space, errors)
+    return space.wrap(r)
+
+MultibyteIncrementalEncoder.typedef = TypeDef(
+    'MultibyteIncrementalEncoder',
+    __module__ = '_multibytecodec',
+    __new__ = interp2app(mbiencoder_new),
+    encode  = interp2app(MultibyteIncrementalEncoder.encode_w),
+    reset   = interp2app(MultibyteIncrementalEncoder.reset_w),
+    errors  = GetSetProperty(MultibyteIncrementalEncoder.fget_errors,
+                             MultibyteIncrementalEncoder.fset_errors),
+    )
+
+
+def get_ignore_error(final):
+    if final:
+        return 0    # don't ignore any error
+    else:
+        return c_codecs.MBERR_TOOFEW
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -22,17 +22,9 @@
             output = c_codecs.decode(self.codec, input, errors,
                                      state.decode_error_handler, self.name)
         except c_codecs.EncodeDecodeError, e:
-            raise OperationError(
-                space.w_UnicodeDecodeError,
-                space.newtuple([
-                    space.wrap(self.name),
-                    space.wrap(input),
-                    space.wrap(e.start),
-                    space.wrap(e.end),
-                    space.wrap(e.reason)]))
+            raise wrap_unicodedecodeerror(space, e, input, self.name)
         except RuntimeError:
-            raise OperationError(space.w_RuntimeError,
-                                 space.wrap("internal codec error"))
+            raise wrap_runtimeerror(space)
         return space.newtuple([space.wrap(output),
                                space.wrap(len(input))])
 
@@ -46,17 +38,9 @@
             output = c_codecs.encode(self.codec, input, errors,
                                      state.encode_error_handler, self.name)
         except c_codecs.EncodeDecodeError, e:
-            raise OperationError(
-                space.w_UnicodeEncodeError,
-                space.newtuple([
-                    space.wrap(self.name),
-                    space.wrap(input),
-                    space.wrap(e.start),
-                    space.wrap(e.end),
-                    space.wrap(e.reason)]))
+            raise wrap_unicodeencodeerror(space, e, input, self.name)
         except RuntimeError:
-            raise OperationError(space.w_RuntimeError,
-                                 space.wrap("internal codec error"))
+            raise wrap_runtimeerror(space)
         return space.newtuple([space.wrap(output),
                                space.wrap(len(input))])
 
@@ -78,3 +62,28 @@
         raise OperationError(space.w_LookupError,
                              space.wrap("no such codec is supported."))
     return space.wrap(MultibyteCodec(name, codec))
+
+
+def wrap_unicodedecodeerror(space, e, input, name):
+    return OperationError(
+        space.w_UnicodeDecodeError,
+        space.newtuple([
+            space.wrap(name),
+            space.wrap(input),
+            space.wrap(e.start),
+            space.wrap(e.end),
+            space.wrap(e.reason)]))
+
+def wrap_unicodeencodeerror(space, e, input, name):
+    raise OperationError(
+        space.w_UnicodeEncodeError,
+        space.newtuple([
+            space.wrap(name),
+            space.wrap(input),
+            space.wrap(e.start),
+            space.wrap(e.end),
+            space.wrap(e.reason)]))
+
+def wrap_runtimeerror(space):
+    raise OperationError(space.w_RuntimeError,
+                         space.wrap("internal codec error"))
diff --git a/pypy/module/_multibytecodec/test/test_app_incremental.py b/pypy/module/_multibytecodec/test/test_app_incremental.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_multibytecodec/test/test_app_incremental.py
@@ -0,0 +1,163 @@
+from pypy.conftest import gettestobjspace
+
+
+class AppTestClasses:
+    def setup_class(cls):
+        cls.space = gettestobjspace(usemodules=['_multibytecodec'])
+        cls.w_IncrementalHzDecoder = cls.space.appexec([], """():
+            import _codecs_cn
+            from _multibytecodec import MultibyteIncrementalDecoder
+
+            class IncrementalHzDecoder(MultibyteIncrementalDecoder):
+                codec = _codecs_cn.getcodec('hz')
+
+            return IncrementalHzDecoder
+        """)
+        cls.w_IncrementalHzEncoder = cls.space.appexec([], """():
+            import _codecs_cn
+            from _multibytecodec import MultibyteIncrementalEncoder
+
+            class IncrementalHzEncoder(MultibyteIncrementalEncoder):
+                codec = _codecs_cn.getcodec('hz')
+
+            return IncrementalHzEncoder
+        """)
+        cls.w_IncrementalBig5hkscsEncoder = cls.space.appexec([], """():
+            import _codecs_cn
+            from _multibytecodec import MultibyteIncrementalEncoder
+
+            class IncrementalBig5hkscsEncoder(MultibyteIncrementalEncoder):
+                codec = _codecs_cn.getcodec('big5hkscs')
+
+            return IncrementalBig5hkscsEncoder
+        """)
+
+    def test_decode_hz(self):
+        d = self.IncrementalHzDecoder()
+        r = d.decode("~{abcd~}")
+        assert r == u'\u5f95\u6c85'
+        r = d.decode("~{efgh~}")
+        assert r == u'\u5f50\u73b7'
+        for c, output in zip("!~{abcd~}xyz~{efgh",
+              [u'!',  # !
+               u'',   # ~
+               u'',   # {
+               u'',   # a
+               u'\u5f95',   # b
+               u'',   # c
+               u'\u6c85',   # d
+               u'',   # ~
+               u'',   # }
+               u'x',  # x
+               u'y',  # y
+               u'z',  # z
+               u'',   # ~
+               u'',   # {
+               u'',   # e
+               u'\u5f50',   # f
+               u'',   # g
+               u'\u73b7',   # h
+               ]):
+            r = d.decode(c)
+            assert r == output
+
+    def test_decode_hz_final(self):
+        d = self.IncrementalHzDecoder()
+        r = d.decode("~{", True)
+        assert r == u''
+        raises(UnicodeDecodeError, d.decode, "~", True)
+        raises(UnicodeDecodeError, d.decode, "~{a", True)
+
+    def test_decode_hz_reset(self):
+        d = self.IncrementalHzDecoder()
+        r = d.decode("ab")
+        assert r == u'ab'
+        r = d.decode("~{")
+        assert r == u''
+        r = d.decode("ab")
+        assert r == u'\u5f95'
+        r = d.decode("ab")
+        assert r == u'\u5f95'
+        d.reset()
+        r = d.decode("ab")
+        assert r == u'ab'
+
+    def test_decode_hz_error(self):
+        d = self.IncrementalHzDecoder()
+        raises(UnicodeDecodeError, d.decode, "~{abc", True)
+        d = self.IncrementalHzDecoder("ignore")
+        r = d.decode("~{abc", True)
+        assert r == u'\u5f95'
+        d = self.IncrementalHzDecoder()
+        d.errors = "replace"
+        r = d.decode("~{abc", True)
+        assert r == u'\u5f95\ufffd'
+
+    def test_decode_hz_buffer_grow(self):
+        d = self.IncrementalHzDecoder()
+        for i in range(13):
+            r = d.decode("a" * (2**i))
+            assert r == u"a" * (2**i)
+
+    def test_encode_hz(self):
+        e = self.IncrementalHzEncoder()
+        r = e.encode("abcd")
+        assert r == 'abcd'
+        r = e.encode(u"\u5f95\u6c85")
+        assert r == '~{abcd~}'
+        r = e.encode(u"\u5f50")
+        assert r == '~{ef~}'
+        r = e.encode(u"\u73b7")
+        assert r == '~{gh~}'
+
+    def test_encode_hz_final(self):
+        e = self.IncrementalHzEncoder()
+        r = e.encode(u"xyz\u5f95\u6c85", True)
+        assert r == 'xyz~{abcd~}'
+        # This is a bit hard to test, because the only way I can see that
+        # encoders can return MBERR_TOOFEW is with surrogates, which only
+        # occur with 2-byte unicode characters...  We will just have to
+        # trust that the logic works, because it is exactly the same one
+        # as in the decode case :-/
+
+    def test_encode_hz_reset(self):
+        # Same issue as with test_encode_hz_final
+        e = self.IncrementalHzEncoder()
+        r = e.encode(u"xyz\u5f95\u6c85", True)
+        assert r == 'xyz~{abcd~}'
+        e.reset()
+        r = e.encode(u"xyz\u5f95\u6c85")
+        assert r == 'xyz~{abcd~}'
+
+    def test_encode_hz_error(self):
+        e = self.IncrementalHzEncoder()
+        raises(UnicodeEncodeError, e.encode, u"\u4321", True)
+        e = self.IncrementalHzEncoder("ignore")
+        r = e.encode(u"xy\u4321z", True)
+        assert r == 'xyz'
+        e = self.IncrementalHzEncoder()
+        e.errors = "replace"
+        r = e.encode(u"xy\u4321z", True)
+        assert r == 'xy?z'
+
+    def test_encode_hz_buffer_grow(self):
+        e = self.IncrementalHzEncoder()
+        for i in range(13):
+            r = e.encode(u"a" * (2**i))
+            assert r == "a" * (2**i)
+
+    def test_encode_big5hkscs(self):
+        #e = self.IncrementalBig5hkscsEncoder()
+        #r = e.encode(u'\xca', True)
+        #assert r == '\x88f'
+        #r = e.encode(u'\xca', True)
+        #assert r == '\x88f'
+        #raises(UnicodeEncodeError, e.encode, u'\u0304', True)
+        #
+        e = self.IncrementalBig5hkscsEncoder()
+        r = e.encode(u'\xca')
+        assert r == ''
+        r = e.encode(u'\xca')
+        assert r == '\x88f'
+        r = e.encode(u'\u0304')
+        assert r == '\x88b'
diff --git a/pypy/module/_multibytecodec/test/test_app_stream.py b/pypy/module/_multibytecodec/test/test_app_stream.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_multibytecodec/test/test_app_stream.py
@@ -0,0 +1,93 @@
+from pypy.conftest import gettestobjspace
+
+
+class AppTestStreams:
+    def setup_class(cls):
+        cls.space = gettestobjspace(usemodules=['_multibytecodec'])
+        cls.w_HzStreamReader = cls.space.appexec([], """():
+            import _codecs_cn
+            from _multibytecodec import MultibyteStreamReader
+
+            class HzStreamReader(MultibyteStreamReader):
+                codec = _codecs_cn.getcodec('hz')
+
+            return HzStreamReader
+        """)
+        cls.w_HzStreamWriter = cls.space.appexec([], """():
+            import _codecs_cn
+            from _multibytecodec import MultibyteStreamWriter
+
+            class HzStreamWriter(MultibyteStreamWriter):
+                codec = _codecs_cn.getcodec('hz')
+
+            return HzStreamWriter
+        """)
+        cls.w_ShiftJisx0213StreamWriter = cls.space.appexec([], """():
+            import _codecs_jp
+            from _multibytecodec import MultibyteStreamWriter
+
+            class ShiftJisx0213StreamWriter(MultibyteStreamWriter):
+                codec = _codecs_jp.getcodec('shift_jisx0213')
+
+            return ShiftJisx0213StreamWriter
+        """)
+
+    def test_reader(self):
+        class FakeFile:
+            def __init__(self, data):
+                self.data = data
+                self.pos = 0
+            def read(self, size):
+                res = self.data[self.pos : self.pos + size]
+                self.pos += size
+                return res
+        #
+        r = self.HzStreamReader(FakeFile("!~{abcd~}xyz~{efgh"))
+        for expected in u'!\u5f95\u6c85xyz\u5f50\u73b7':
+            c = r.read(1)
+            assert c == expected
+        c = r.read(1)
+        assert c == ''
+
+    def test_reader_replace(self):
+        class FakeFile:
+            def __init__(self, data):
+                self.data = data
+            def read(self):
+                return self.data
+        #
+        r = self.HzStreamReader(FakeFile("!~{a"), "replace")
+        c = r.read()
+        assert c == u'!\ufffd'
+        #
+        r = self.HzStreamReader(FakeFile("!~{a"))
+        r.errors = "replace"
+        assert r.errors == "replace"
+        c = r.read()
+        assert c == u'!\ufffd'
+
+    def test_writer(self):
+        class FakeFile:
+            def __init__(self):
+                self.output = []
+            def write(self, data):
+                self.output.append(data)
+        #
+        w = self.HzStreamWriter(FakeFile())
+        for input in u'!\u5f95\u6c85xyz\u5f50\u73b7':
+            w.write(input)
+        assert w.stream.output == ['!', '~{ab~}', '~{cd~}', 'x', 'y', 'z',
+                                   '~{ef~}', '~{gh~}']
+
+    def test_no_flush(self):
+        class FakeFile:
+            def __init__(self):
+                self.output = []
+            def write(self, data):
+                self.output.append(data)
+        #
+        w = self.ShiftJisx0213StreamWriter(FakeFile())
+        w.write(u'\u30ce')
+        w.write(u'\u304b')
+        w.write(u'\u309a')
+        assert w.stream.output == ['\x83m', '', '\x82\xf5']
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -2,6 +2,7 @@
 from pypy.module._multibytecodec.c_codecs import getcodec, codecs
 from pypy.module._multibytecodec.c_codecs import decode, encode
 from pypy.module._multibytecodec.c_codecs import EncodeDecodeError
+from pypy.module._multibytecodec import c_codecs
 
 
 def test_codecs_existence():
@@ -22,6 +23,52 @@
     c = getcodec("hz")
     u = decode(c, "~{abc}")
     assert u == u'\u5f95\u6cef'
+    u = decode(c, "~{")
+    assert u == u''
+
+def test_decodeex_hz():
+    c = getcodec("hz")
+    decodebuf = c_codecs.pypy_cjk_dec_new(c)
+    u = c_codecs.decodeex(decodebuf, "~{abcd~}")
+    assert u == u'\u5f95\u6c85'
+    u = c_codecs.decodeex(decodebuf, "~{efgh~}")
+    assert u == u'\u5f50\u73b7'
+    u = c_codecs.decodeex(decodebuf, "!~{abcd~}xyz~{efgh")
+    assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'
+    c_codecs.pypy_cjk_dec_free(decodebuf)
+
+def test_decodeex_hz_incomplete():
+    c = getcodec("hz")
+    decodebuf = c_codecs.pypy_cjk_dec_new(c)
+    buf = ''
+    for c, output in zip("!~{abcd~}xyz~{efgh",
+          [u'!',  # !
+           u'',   # ~
+           u'',   # {
+           u'',   # a
+           u'\u5f95',   # b
+           u'',   # c
+           u'\u6c85',   # d
+           u'',   # ~
+           u'',   # }
+           u'x',  # x
+           u'y',  # y
+           u'z',  # z
+           u'',   # ~
+           u'',   # {
+           u'',   # e
+           u'\u5f50',   # f
+           u'',   # g
+           u'\u73b7',   # h
+           ]):
+        buf += c
+        u = c_codecs.decodeex(decodebuf, buf,
+                              ignore_error = c_codecs.MBERR_TOOFEW)
+        assert u == output
+        incompletepos = c_codecs.pypy_cjk_dec_inbuf_consumed(decodebuf)
+        buf = buf[incompletepos:]
+    assert buf == ''
+    c_codecs.pypy_cjk_dec_free(decodebuf)
 
 def test_decode_hz_error():
     # error
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -268,7 +268,7 @@
             self.ll_buffer = rffi.cast(rffi.VOIDP, address)
         else:
             self.ll_buffer = lltype.malloc(rffi.VOIDP.TO, size, flavor='raw',
-                                           zero=True)
+                                           zero=True, add_memory_pressure=True)
             if tracker.DO_TRACING:
                 ll_buf = rffi.cast(lltype.Signed, self.ll_buffer)
                 tracker.trace_allocation(ll_buf, self)
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -14,8 +14,6 @@
 from pypy.rpython.lltypesystem import lltype, rffi
 
 
-memcpy = rffi.llexternal("memcpy", [rffi.VOIDP, rffi.VOIDP, rffi.SIZE_T], lltype.Void)
-
 @unwrap_spec(typecode=str)
 def w_array(space, w_cls, typecode, __args__):
     if len(__args__.arguments_w) > 1:
@@ -228,7 +226,8 @@
                     some += size >> 3
                     self.allocated = size + some
                     new_buffer = lltype.malloc(mytype.arraytype,
-                                               self.allocated, flavor='raw')
+                                               self.allocated, flavor='raw',
+                                               add_memory_pressure=True)
                     for i in range(min(size, self.len)):
                         new_buffer[i] = self.buffer[i]
                 else:
@@ -617,7 +616,7 @@
     def array_copy__Array(space, self):
         w_a = mytype.w_class(self.space)
         w_a.setlen(self.len)
-        memcpy(
+        rffi.c_memcpy(
             rffi.cast(rffi.VOIDP, w_a.buffer),
             rffi.cast(rffi.VOIDP, self.buffer),
             self.len * mytype.bytes
diff --git a/pypy/module/cpyext/include/patchlevel.h b/pypy/module/cpyext/include/patchlevel.h
--- a/pypy/module/cpyext/include/patchlevel.h
+++ b/pypy/module/cpyext/include/patchlevel.h
@@ -31,8 +31,9 @@
 /* PyPy version as a string */
 #define PYPY_VERSION "1.6.0"
 
-/* Subversion Revision number of this file (not of the repository) */
-#define PY_PATCHLEVEL_REVISION  "$Revision: 77872 $"
+/* Subversion Revision number of this file (not of the repository).
+ * Empty since Mercurial migration. */
+#define PY_PATCHLEVEL_REVISION  ""
 
 /* Version as a single 4-byte hex number, e.g. 0x010502B2 == 1.5.2b2.
    Use this for numeric comparisons, e.g. #if PY_VERSION_HEX >= ... */
diff --git a/pypy/module/itertools/interp_itertools.py b/pypy/module/itertools/interp_itertools.py
--- a/pypy/module/itertools/interp_itertools.py
+++ b/pypy/module/itertools/interp_itertools.py
@@ -339,16 +339,21 @@
                 start = 0
             else:
                 start = space.int_w(w_startstop)
+                if start < 0:
+                    raise OperationError(space.w_ValueError, space.wrap(
+                       "Indicies for islice() must be non-negative integers."))
             w_stop = args_w[0]
         else:
             raise OperationError(space.w_TypeError, space.wrap("islice() takes at most 4 arguments (" + str(num_args) + " given)"))
 
         if space.is_w(w_stop, space.w_None):
             stop = -1
-            stoppable = False
         else:
             stop = space.int_w(w_stop)
-            stoppable = True
+            if stop < 0:
+                raise OperationError(space.w_ValueError, space.wrap(
+                    "Stop argument must be a non-negative integer or None."))
+            stop = max(start, stop)    # for obscure CPython compatibility
 
         if num_args == 2:
             w_step = args_w[1]
@@ -356,38 +361,37 @@
                 step = 1
             else:
                 step = space.int_w(w_step)
+                if step < 1:
+                    raise OperationError(space.w_ValueError, space.wrap(
+                        "Step must be one or lager for islice()."))
         else:
             step = 1
 
-        if start < 0:
-            raise OperationError(space.w_ValueError, space.wrap("Indicies for islice() must be non-negative integers."))
-        if stoppable and stop < 0:
-            raise OperationError(space.w_ValueError, space.wrap("Stop argument must be a non-negative integer or None."))
-        if step < 1:
-            raise OperationError(space.w_ValueError, space.wrap("Step must be one or lager for islice()."))
-
+        self.ignore = step - 1
         self.start = start
         self.stop = stop
-        self.step = step
 
     def iter_w(self):
         return self.space.wrap(self)
 
     def next_w(self):
         if self.start >= 0:               # first call only
-            consume = self.start + 1
+            ignore = self.start
             self.start = -1
         else:                             # all following calls
-            consume = self.step
-        if consume > 1:
-            self._ignore_items(consume-1)
-        if self.stop >= 0:
-            if self.stop < consume:
+            ignore = self.ignore
+        stop = self.stop
+        if stop >= 0:
+            if stop <= ignore:
                 self.stop = 0   # reset the state so that a following next_w()
-                self.step = 1   # has no effect any more
+                                # has no effect any more
+                if stop > 0:
+                    self._ignore_items(stop)
                 raise OperationError(self.space.w_StopIteration,
                                      self.space.w_None)
-            self.stop -= consume
+            self.stop = stop - (ignore + 1)
+        if ignore > 0:
+            self._ignore_items(ignore)
         return self.space.next(self.iterable)
 
     def _ignore_items(self, num):
diff --git a/pypy/module/itertools/test/test_itertools.py b/pypy/module/itertools/test/test_itertools.py
--- a/pypy/module/itertools/test/test_itertools.py
+++ b/pypy/module/itertools/test/test_itertools.py
@@ -266,6 +266,13 @@
         raises(StopIteration, islc.next)  # drops the 6th and raise
         assert it.next() == "j"
 
+        it = iter("abcdefghij")
+        islc = itertools.islice(it, 3, 4, 3)
+        assert islc.next() == "d"    # drops 0, 1, 2, returns item #3
+        assert it.next() == "e"
+        raises(StopIteration, islc.next)  # item #4 is 'stop', so just raise
+        assert it.next() == "f"
+
     def test_islice_overflow(self):
         import itertools
         import sys
diff --git a/pypy/module/micronumpy/__init__.py b/pypy/module/micronumpy/__init__.py
--- a/pypy/module/micronumpy/__init__.py
+++ b/pypy/module/micronumpy/__init__.py
@@ -31,6 +31,9 @@
         'sin': 'interp_ufuncs.sin',
         'cos': 'interp_ufuncs.cos',
         'tan': 'interp_ufuncs.tan',
+        'arcsin': 'interp_ufuncs.arcsin',
+        'arccos': 'interp_ufuncs.arccos',
+        'arctan': 'interp_ufuncs.arctan',
     }
 
     appleveldefs = {
diff --git a/pypy/module/micronumpy/interp_numarray.py b/pypy/module/micronumpy/interp_numarray.py
--- a/pypy/module/micronumpy/interp_numarray.py
+++ b/pypy/module/micronumpy/interp_numarray.py
@@ -187,17 +187,17 @@
     def _getnums(self, comma):
         if self.find_size() > 1000:
             nums = [
-                float2string(self.getitem(index))
+                float2string(self.eval(index))
                 for index in range(3)
             ]
             nums.append("..." + "," * comma)
             nums.extend([
-                float2string(self.getitem(index))
+                float2string(self.eval(index))
                 for index in range(self.find_size() - 3, self.find_size())
             ])
         else:
             nums = [
-                float2string(self.getitem(index))
+                float2string(self.eval(index))
                 for index in range(self.find_size())
             ]
         return nums
@@ -229,7 +229,7 @@
         start, stop, step, slice_length = space.decode_index4(w_idx, self.find_size())
         if step == 0:
             # Single index
-            return space.wrap(self.get_concrete().getitem(start))
+            return space.wrap(self.get_concrete().eval(start))
         else:
             # Slice
             res = SingleDimSlice(start, stop, step, slice_length, self, self.signature.transition(SingleDimSlice.static_signature))
@@ -416,14 +416,12 @@
         # in fact, ViewArray never gets "concrete" as it never stores data.
         # This implementation is needed for BaseArray getitem/setitem to work,
         # can be refactored.
+        self.parent.get_concrete()
         return self
 
     def eval(self, i):
         return self.parent.eval(self.calc_index(i))
 
-    def getitem(self, item):
-        return self.parent.getitem(self.calc_index(item))
-
     @unwrap_spec(item=int, value=float)
     def setitem(self, item, value):
         return self.parent.setitem(self.calc_index(item), value)
@@ -479,7 +477,8 @@
         BaseArray.__init__(self)
         self.size = size
         self.storage = lltype.malloc(TP, size, zero=True,
-                                     flavor='raw', track_allocation=False)
+                                     flavor='raw', track_allocation=False,
+                                     add_memory_pressure=True)
         # XXX find out why test_zjit explodes with trackign of allocations
 
     def get_concrete(self):
@@ -497,9 +496,6 @@
     def descr_len(self, space):
         return space.wrap(self.size)
 
-    def getitem(self, item):
-        return self.storage[item]
-
     def setitem(self, item, value):
         self.invalidated()
         self.storage[item] = value
@@ -511,7 +507,7 @@
             self._sliceloop2(start, stop, step, arr, self)
 
     def __del__(self):
-        lltype.free(self.storage, flavor='raw')
+        lltype.free(self.storage, flavor='raw', track_allocation=False)
 
 def new_numarray(space, w_size_or_iterable):
     l = space.listview(w_size_or_iterable)
diff --git a/pypy/module/micronumpy/interp_ufuncs.py b/pypy/module/micronumpy/interp_ufuncs.py
--- a/pypy/module/micronumpy/interp_ufuncs.py
+++ b/pypy/module/micronumpy/interp_ufuncs.py
@@ -120,3 +120,20 @@
 @ufunc2
 def mod(lvalue, rvalue):
     return math.fmod(lvalue, rvalue)
+
+
+@ufunc
+def arcsin(value):
+    if value < -1.0 or  value > 1.0:
+        return rfloat.NAN
+    return math.asin(value)
+
+@ufunc
+def arccos(value):
+    if value < -1.0 or  value > 1.0:
+        return rfloat.NAN
+    return math.acos(value)
+
+@ufunc
+def arctan(value):
+    return math.atan(value)
diff --git a/pypy/module/micronumpy/test/test_numarray.py b/pypy/module/micronumpy/test/test_numarray.py
--- a/pypy/module/micronumpy/test/test_numarray.py
+++ b/pypy/module/micronumpy/test/test_numarray.py
@@ -70,6 +70,7 @@
         from numpy import array, zeros
         a = array(range(5))
         assert str(a) == "[0.0 1.0 2.0 3.0 4.0]"
+        assert str((2*a)[:]) == "[0.0 2.0 4.0 6.0 8.0]"
         a = zeros(1001)
         assert str(a) == "[0.0 0.0 0.0 ..., 0.0 0.0 0.0]"
 
diff --git a/pypy/module/micronumpy/test/test_ufuncs.py b/pypy/module/micronumpy/test/test_ufuncs.py
--- a/pypy/module/micronumpy/test/test_ufuncs.py
+++ b/pypy/module/micronumpy/test/test_ufuncs.py
@@ -205,3 +205,47 @@
         b = tan(a)
         for i in range(len(a)):
             assert b[i] == math.tan(a[i])
+
+
+    def test_arcsin(self):
+        import math
+        from numpy import array, arcsin
+
+        a = array([-1, -0.5, -0.33, 0, 0.33, 0.5, 1])        
+        b = arcsin(a)
+        for i in range(len(a)):
+            assert b[i] == math.asin(a[i])
+
+        a = array([-10, -1.5, -1.01, 1.01, 1.5, 10, float('nan'), float('inf'), float('-inf')])
+        b = arcsin(a)
+        for f in b:
+            assert math.isnan(f)
+
+    def test_arccos(self):
+        import math
+        from numpy import array, arccos
+
+        a = array([-1, -0.5, -0.33, 0, 0.33, 0.5, 1])
+        b = arccos(a)
+        for i in range(len(a)):
+            assert b[i] == math.acos(a[i])
+
+        
+        a = array([-10, -1.5, -1.01, 1.01, 1.5, 10, float('nan'), float('inf'), float('-inf')])
+        b = arccos(a)
+        for f in b:
+            assert math.isnan(f)
+
+    def test_arctan(self):
+        import math
+        from numpy import array, arctan
+
+        a = array([-3, -2, -1, 0, 1, 2, 3, float('inf'), float('-inf')])
+        b = arctan(a)
+        for i in range(len(a)):
+            assert b[i] == math.atan(a[i])
+
+        a  = array([float('nan')])
+        b = arctan(a)
+        assert math.isnan(b[0])
+
diff --git a/pypy/module/pypyjit/test_pypy_c/test_array.py b/pypy/module/pypyjit/test_pypy_c/test_array.py
--- a/pypy/module/pypyjit/test_pypy_c/test_array.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_array.py
@@ -1,4 +1,4 @@
-import py
+import py, sys
 from pypy.module.pypyjit.test_pypy_c.test_00_model import BaseTestPyPyC
 
 class TestArray(BaseTestPyPyC):
@@ -88,6 +88,73 @@
             jump(p0, p1, p2, p3, p4, p5, p6, i28, i15, p9, i10, i11, descr=<Loop0>)
         """)
 
+    def test_array_of_doubles(self):
+        def main():
+            from array import array
+            img = array('d', [21.5]*1000)
+            i = 0
+            while i < 1000:
+                img[i] += 20.5
+                assert img[i] == 42.0
+                i += 1
+            return 123
+        #
+        log = self.run(main, [])
+        assert log.result == 123
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match("""
+            i10 = int_lt(i6, 1000)
+            guard_true(i10, descr=...)
+            i11 = int_lt(i6, i7)
+            guard_true(i11, descr=...)
+            f13 = getarrayitem_raw(i8, i6, descr=<FloatArrayNoLengthDescr>)
+            f15 = float_add(f13, 20.500000)
+            setarrayitem_raw(i8, i6, f15, descr=<FloatArrayNoLengthDescr>)
+            f16 = getarrayitem_raw(i8, i6, descr=<FloatArrayNoLengthDescr>)
+            i18 = float_eq(f16, 42.000000)
+            guard_true(i18, descr=...)
+            i20 = int_add(i6, 1)
+            --TICK--
+            jump(..., descr=<Loop0>)
+        """)
+
+    def test_array_of_floats(self):
+        def main():
+            from array import array
+            img = array('f', [21.5]*1000)
+            i = 0
+            while i < 1000:
+                img[i] += 20.5
+                assert img[i] == 42.0
+                i += 1
+            return 321
+        #
+        log = self.run(main, [])
+        assert log.result == 321
+        loop, = log.loops_by_filename(self.filepath)
+        if sys.maxint == 2147483647:
+            arraydescr = 'UnsignedArrayNoLengthDescr'
+        else:
+            arraydescr = 'UINTArrayNoLengthDescr'
+        assert loop.match("""
+            i10 = int_lt(i6, 1000)
+            guard_true(i10, descr=...)
+            i11 = int_lt(i6, i7)
+            guard_true(i11, descr=...)
+            i13 = getarrayitem_raw(i8, i6, descr=<%s>)
+            f14 = cast_singlefloat_to_float(i13)
+            f16 = float_add(f14, 20.500000)
+            i17 = cast_float_to_singlefloat(f16)
+            setarrayitem_raw(i8, i6,i17, descr=<%s>)
+            i18 = getarrayitem_raw(i8, i6, descr=<%s>)
+            f19 = cast_singlefloat_to_float(i18)
+            i21 = float_eq(f19, 42.000000)
+            guard_true(i21, descr=...)
+            i23 = int_add(i6, 1)
+            --TICK--
+            jump(..., descr=<Loop0>)
+        """ % (arraydescr, arraydescr, arraydescr))
+
 
     def test_zeropadded(self):
         def main():
diff --git a/pypy/module/pypyjit/test_pypy_c/test_misc.py b/pypy/module/pypyjit/test_pypy_c/test_misc.py
--- a/pypy/module/pypyjit/test_pypy_c/test_misc.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_misc.py
@@ -63,6 +63,7 @@
             i7 = int_gt(i4, 1)
             guard_true(i7, descr=...)
             p9 = call(ConstClass(fromint), i4, descr=...)
+            guard_no_exception(descr=...)
             p11 = call(ConstClass(rbigint.mul), p5, p9, descr=...)
             guard_no_exception(descr=...)
             i13 = int_sub(i4, 1)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_string.py b/pypy/module/pypyjit/test_pypy_c/test_string.py
--- a/pypy/module/pypyjit/test_pypy_c/test_string.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_string.py
@@ -90,12 +90,12 @@
             i46 = call(ConstClass(ll_startswith__rpy_stringPtr_rpy_stringPtr), p28, ConstPtr(ptr45), descr=<BoolCallDescr>)
             guard_false(i46, descr=...)
             p51 = new_with_vtable(21136408)
-            setfield_gc(p51, p28, descr=<GcPtrFieldDescr .*NumberStringParser.inst_literal .*>)
-            setfield_gc(p51, ConstPtr(ptr51), descr=<GcPtrFieldDescr pypy.objspace.std.strutil.NumberStringParser.inst_fname .*>)
-            setfield_gc(p51, 1, descr=<SignedFieldDescr .*NumberStringParser.inst_sign .*>)
-            setfield_gc(p51, 16, descr=<SignedFieldDescr .*NumberStringParser.inst_base .*>)
-            setfield_gc(p51, p28, descr=<GcPtrFieldDescr .*NumberStringParser.inst_s .*>)
-            setfield_gc(p51, i29, descr=<SignedFieldDescr .*NumberStringParser.inst_n .*>)
+            setfield_gc(p51, _, descr=...)    # 6 setfields, but the order is dict-order-dependent
+            setfield_gc(p51, _, descr=...)
+            setfield_gc(p51, _, descr=...)
+            setfield_gc(p51, _, descr=...)
+            setfield_gc(p51, _, descr=...)
+            setfield_gc(p51, _, descr=...)
             p55 = call(ConstClass(parse_digit_string), p51, descr=<GcPtrCallDescr>)
             guard_no_exception(descr=...)
             i57 = call(ConstClass(rbigint.toint), p55, descr=<SignedCallDescr>)
diff --git a/pypy/module/rctime/interp_time.py b/pypy/module/rctime/interp_time.py
--- a/pypy/module/rctime/interp_time.py
+++ b/pypy/module/rctime/interp_time.py
@@ -207,13 +207,13 @@
         t = (((c_time(lltype.nullptr(rffi.TIME_TP.TO))) / YEAR) * YEAR)
         # we cannot have reference to stack variable, put it on the heap
         t_ref = lltype.malloc(rffi.TIME_TP.TO, 1, flavor='raw')
-        t_ref[0] = t
+        t_ref[0] = rffi.cast(rffi.TIME_T, t)
         p = c_localtime(t_ref)
         janzone = -p.c_tm_gmtoff
         tm_zone = rffi.charp2str(p.c_tm_zone)
         janname = ["   ", tm_zone][bool(tm_zone)]
         tt = t + YEAR / 2
-        t_ref[0] = tt
+        t_ref[0] = rffi.cast(rffi.TIME_T, tt)
         p = c_localtime(t_ref)
         lltype.free(t_ref, flavor='raw')
         tm_zone = rffi.charp2str(p.c_tm_zone)
@@ -292,11 +292,14 @@
     else:
         seconds = space.float_w(w_seconds)
     try:
-        ovfcheck_float_to_int(seconds)
+        seconds = ovfcheck_float_to_int(seconds)
+        t = rffi.r_time_t(seconds)
+        if rffi.cast(lltype.Signed, t) != seconds:
+            raise OverflowError
     except OverflowError:
         raise OperationError(space.w_ValueError,
                              space.wrap("time argument too large"))
-    return rffi.r_time_t(seconds)
+    return t
 
 def _tm_to_tuple(space, t):
     time_tuple = [
@@ -317,7 +320,7 @@
 def _gettmarg(space, w_tup, allowNone=True):
     if allowNone and space.is_w(w_tup, space.w_None):
         # default to the current local time
-        tt = rffi.r_time_t(pytime.time())
+        tt = rffi.r_time_t(int(pytime.time()))
         t_ref = lltype.malloc(rffi.TIME_TP.TO, 1, flavor='raw')
         t_ref[0] = tt
         pbuf = c_localtime(t_ref)
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py b/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
--- a/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
+++ b/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
@@ -14,14 +14,27 @@
         return args[-1]
 
     def check_type(self, typ, arg):
+        unwrapped_types = {
+            c_float: (float,),
+            c_double: (float,),
+            c_char: (str,),
+            c_char_p: (str,),
+            c_uint: (int, long),
+            c_ulong: (int, long),
+            }
+        
         PROTO = self.functype.im_func(typ, typ)
-        result = PROTO(self.callback)(arg)
+        cfunc = PROTO(self.callback)
+        result = cfunc(arg)
         if typ == c_float:
             assert abs(result - arg) < 0.000001
         else:
             assert self.got_args == (arg,)
             assert result == arg
 
+        result2 = cfunc(typ(arg))
+        assert type(result2) in unwrapped_types.get(typ, (int, long))
+
         PROTO = self.functype.im_func(typ, c_byte, typ)
         result = PROTO(self.callback)(-3, arg)
         if typ == c_float:
@@ -222,3 +235,20 @@
         out, err = capsys.readouterr()
         assert (out, err) == ("", "")
 
+
+    def test_callback_pyobject(self):
+        def callback(obj):
+            return obj
+
+        FUNC = CFUNCTYPE(py_object, py_object)
+        cfunc = FUNC(callback)
+        param = c_int(42)
+        assert cfunc(param) is param
+
+    def test_raise_argumenterror(self):
+        def callback(x):
+            pass
+        FUNC = CFUNCTYPE(None, c_void_p)
+        cfunc = FUNC(callback)
+        param = c_uint(42)
+        py.test.raises(ArgumentError, "cfunc(param)")
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/test_cast.py b/pypy/module/test_lib_pypy/ctypes_tests/test_cast.py
--- a/pypy/module/test_lib_pypy/ctypes_tests/test_cast.py
+++ b/pypy/module/test_lib_pypy/ctypes_tests/test_cast.py
@@ -90,3 +90,8 @@
         assert sqrt._objects is my_sqrt._objects   # on CPython too
         my_sqrt._objects.clear()
         my_sqrt._objects.update(saved_objects)
+
+    def test_cast_argumenterror(self):
+        param = c_uint(42)
+        py.test.raises(ArgumentError, "cast(param, c_void_p)")
+        
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/test_fastpath.py b/pypy/module/test_lib_pypy/ctypes_tests/test_fastpath.py
--- a/pypy/module/test_lib_pypy/ctypes_tests/test_fastpath.py
+++ b/pypy/module/test_lib_pypy/ctypes_tests/test_fastpath.py
@@ -1,4 +1,4 @@
-from ctypes import CDLL, POINTER, pointer, c_byte, c_int, c_char_p
+from ctypes import CDLL, POINTER, pointer, c_byte, c_int, c_char_p, CFUNCTYPE, c_void_p, c_size_t
 import sys
 import py
 from support import BaseCTypesTestChecker
@@ -46,6 +46,24 @@
         tf_b.argtypes = (c_byte,)
         assert tf_b(-126) == -42
 
+    def test_from_cfunctype(self):
+        from _ctypes import _memmove_addr
+        functype = CFUNCTYPE(c_void_p, c_void_p, c_void_p, c_size_t)
+        my_memmove = functype(_memmove_addr)
+        assert my_memmove._is_fastpath
+
+    def test_undeclared_restype(self):
+        # make sure we get a fresh function
+        try:
+            del dll.tf_i
+        except AttributeError:
+            pass
+        tf_i = dll.tf_i
+        assert not tf_i._is_fastpath
+        tf_i.argtypes = (c_int,)
+        assert tf_i._is_fastpath
+        assert tf_i(12) == 4
+
     def test_pointer_args(self):
         f = dll._testfunc_p_p
         f.restype = POINTER(c_int)
@@ -63,13 +81,10 @@
         result = f(mystr, ord("b"))
         assert result == "bcd"
 
-    @py.test.mark.xfail
     def test_strings(self):
         f = dll.my_strchr
         f.argtypes = [c_char_p, c_int]
         f.restype = c_char_p
-        # python strings need to be converted to c_char_p, but this is
-        # supported only in the slow path so far
         result = f("abcd", ord("b"))
         assert result == "bcd"
 
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/test_functions.py b/pypy/module/test_lib_pypy/ctypes_tests/test_functions.py
--- a/pypy/module/test_lib_pypy/ctypes_tests/test_functions.py
+++ b/pypy/module/test_lib_pypy/ctypes_tests/test_functions.py
@@ -488,11 +488,9 @@
         warnings.simplefilter("always")
         with warnings.catch_warnings(record=True) as w:
             dll.get_an_integer()
-            assert len(w) == 2
+            assert len(w) == 1
             assert issubclass(w[0].category, RuntimeWarning)
-            assert issubclass(w[1].category, RuntimeWarning)
             assert "C function without declared arguments called" in str(w[0].message)
-            assert "C function without declared return type called" in str(w[1].message)
 
     def test_errcheck(self):
         py.test.skip('fixme')
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/test_structures.py b/pypy/module/test_lib_pypy/ctypes_tests/test_structures.py
--- a/pypy/module/test_lib_pypy/ctypes_tests/test_structures.py
+++ b/pypy/module/test_lib_pypy/ctypes_tests/test_structures.py
@@ -424,6 +424,15 @@
             sys.settrace(oldtrace)
             events = None
 
+    def test_large_fields(self):
+        # make sure that large fields are not "confused" with bitfields
+        # (because the bitfields use the higher bits of the "size" attribute)
+        Array = c_long * 8192
+        class X(Structure):
+            _fields_ = [('items', Array)]
+        obj = X()
+        assert isinstance(obj.items, Array)
+
 class TestPointerMember(BaseCTypesTestChecker):
 
     def test_1(self):
diff --git a/pypy/objspace/descroperation.py b/pypy/objspace/descroperation.py
--- a/pypy/objspace/descroperation.py
+++ b/pypy/objspace/descroperation.py
@@ -35,6 +35,13 @@
     return w_hash
 object_hash._annspecialcase_ = 'specialize:memo'
 
+def type_eq(space):
+    "Utility that returns the app-level descriptor type.__eq__."
+    w_src, w_eq = space.lookup_in_type_where(space.w_type,
+                                             '__eq__')
+    return w_eq
+type_eq._annspecialcase_ = 'specialize:memo'
+
 def raiseattrerror(space, w_obj, name, w_descr=None):
     w_type = space.type(w_obj)
     typename = w_type.getname(space)
diff --git a/pypy/objspace/flow/operation.py b/pypy/objspace/flow/operation.py
--- a/pypy/objspace/flow/operation.py
+++ b/pypy/objspace/flow/operation.py
@@ -359,10 +359,10 @@
                 # All arguments are constants: call the operator now
                 try:
                     result = op(*args)
-                except:
-                    etype, evalue, etb = sys.exc_info()
-                    msg = "generated by a constant operation:  %s%r" % (
-                        name, tuple(args))
+                except Exception, e:
+                    etype = e.__class__
+                    msg = "generated by a constant operation:  %s" % (
+                        name)
                     raise OperationThatShouldNotBePropagatedError(
                         self.wrap(etype), self.wrap(msg))
                 else:
diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py
--- a/pypy/objspace/std/bytearrayobject.py
+++ b/pypy/objspace/std/bytearrayobject.py
@@ -282,8 +282,8 @@
     return space.wrap(''.join(w_bytearray.data))
 
 def _convert_idx_params(space, w_self, w_start, w_stop):
-    start = slicetype._Eval_SliceIndex(space, w_start)
-    stop = slicetype._Eval_SliceIndex(space, w_stop)
+    start = slicetype.eval_slice_index(space, w_start)
+    stop = slicetype.eval_slice_index(space, w_stop)
     length = len(w_self.data)
     if start < 0:
         start += length
diff --git a/pypy/objspace/std/mapdict.py b/pypy/objspace/std/mapdict.py
--- a/pypy/objspace/std/mapdict.py
+++ b/pypy/objspace/std/mapdict.py
@@ -421,6 +421,14 @@
         key = ("slot", SLOTS_STARTING_FROM + index)
         self._get_mapdict_map().write(self, key, w_value)
 
+    def delslotvalue(self, index):     # returns True if the slot was deleted, False otherwise
+        key = ("slot", SLOTS_STARTING_FROM + index)    # same key as the slot write just above
+        new_obj = self._get_mapdict_map().delete(self, key)
+        if new_obj is None:     # delete() returned nothing to switch to
+            return False
+        self._become(new_obj)   # presumably takes over new_obj's state -- confirm in _become()
+        return True
+
     # used by _weakref implemenation
 
     def getweakref(self):
diff --git a/pypy/objspace/std/sliceobject.py b/pypy/objspace/std/sliceobject.py
--- a/pypy/objspace/std/sliceobject.py
+++ b/pypy/objspace/std/sliceobject.py
@@ -4,7 +4,7 @@
 from pypy.interpreter import gateway
 from pypy.objspace.std.model import registerimplementation, W_Object
 from pypy.objspace.std.register_all import register_all
-from pypy.objspace.std.slicetype import _Eval_SliceIndex
+from pypy.objspace.std.slicetype import eval_slice_index
 
 class W_SliceObject(W_Object):
     from pypy.objspace.std.slicetype import slice_typedef as typedef
@@ -25,7 +25,7 @@
         if space.is_w(w_slice.w_step, space.w_None):
             step = 1
         else:
-            step = _Eval_SliceIndex(space, w_slice.w_step)
+            step = eval_slice_index(space, w_slice.w_step)
             if step == 0:
                 raise OperationError(space.w_ValueError,
                                      space.wrap("slice step cannot be zero"))
@@ -35,7 +35,7 @@
             else:
                 start = 0
         else:
-            start = _Eval_SliceIndex(space, w_slice.w_start)
+            start = eval_slice_index(space, w_slice.w_start)
             if start < 0:
                 start += length
                 if start < 0:
@@ -54,7 +54,7 @@
             else:
                 stop = length
         else:
-            stop = _Eval_SliceIndex(space, w_slice.w_stop)
+            stop = eval_slice_index(space, w_slice.w_stop)
             if stop < 0:
                 stop += length
                 if stop < 0:
diff --git a/pypy/objspace/std/slicetype.py b/pypy/objspace/std/slicetype.py
--- a/pypy/objspace/std/slicetype.py
+++ b/pypy/objspace/std/slicetype.py
@@ -14,7 +14,7 @@
                         ' normal slices.')
 
 # utility functions
-def _Eval_SliceIndex(space, w_int):
+def eval_slice_index(space, w_int):
     try:
         return space.getindex_w(w_int, None) # clamp if long integer too large
     except OperationError, err:
@@ -25,7 +25,7 @@
                                         "None or have an __index__ method"))
 
 def adapt_lower_bound(space, size, w_index):
-    index = _Eval_SliceIndex(space, w_index)
+    index = eval_slice_index(space, w_index)
     if index < 0:
         index = index + size
         if index < 0:
@@ -34,7 +34,7 @@
     return index
 
 def adapt_bound(space, size, w_index):
-    index = _Eval_SliceIndex(space, w_index)
+    index = eval_slice_index(space, w_index)
     if index < 0:
         index = index + size
         if index < 0:
diff --git a/pypy/objspace/std/stringobject.py b/pypy/objspace/std/stringobject.py
--- a/pypy/objspace/std/stringobject.py
+++ b/pypy/objspace/std/stringobject.py
@@ -913,7 +913,7 @@
 def repr__String(space, w_str):
     s = w_str._value
 
-    buf = StringBuilder(50)
+    buf = StringBuilder(len(s) + 2)
 
     quote = "'"
     if quote in s and '"' not in s:
diff --git a/pypy/objspace/std/test/test_identitydict.py b/pypy/objspace/std/test/test_identitydict.py
--- a/pypy/objspace/std/test/test_identitydict.py
+++ b/pypy/objspace/std/test/test_identitydict.py
@@ -32,10 +32,20 @@
             def __hash__(self):
                 return 0
 
+        class TypeSubclass(type):
+            pass
+
+        class TypeSubclassCustomCmp(type):
+            def __cmp__(self, other):
+                return 0
+
         assert self.compares_by_identity(Plain)
         assert not self.compares_by_identity(CustomEq)
         assert not self.compares_by_identity(CustomCmp)
         assert not self.compares_by_identity(CustomHash)
+        assert self.compares_by_identity(type)
+        assert self.compares_by_identity(TypeSubclass)
+        assert not self.compares_by_identity(TypeSubclassCustomCmp)
 
     def test_modify_class(self):
         class X(object):
diff --git a/pypy/objspace/std/test/test_mapdict.py b/pypy/objspace/std/test/test_mapdict.py
--- a/pypy/objspace/std/test/test_mapdict.py
+++ b/pypy/objspace/std/test/test_mapdict.py
@@ -210,6 +210,12 @@
     assert obj2.storage == [501, 601, 701, 51, 61, 71]
     assert obj.map is obj2.map
 
+    assert obj2.getslotvalue(b) == 601
+    assert obj2.delslotvalue(b)
+    assert obj2.getslotvalue(b) is None
+    assert obj2.storage == [501, 701, 51, 61, 71]
+    assert not obj2.delslotvalue(b)
+
 
 def test_slots_no_dict():
     cls = Class(hasdict=False)
@@ -631,6 +637,14 @@
         a.__dict__ = {}
         a.__dict__ = {}
 
+    def test_delete_slot(self):    # 'del' on a __slots__ attribute must make later reads fail
+        class A(object):
+            __slots__ = ['x']
+        
+        a = A()
+        a.x = 42
+        del a.x
+        raises(AttributeError, "a.x")  # reading the deleted slot must raise AttributeError
 
 class AppTestWithMapDictAndCounters(object):
     def setup_class(cls):
diff --git a/pypy/objspace/std/tupleobject.py b/pypy/objspace/std/tupleobject.py
--- a/pypy/objspace/std/tupleobject.py
+++ b/pypy/objspace/std/tupleobject.py
@@ -154,7 +154,7 @@
     x = 0x345678
     z = len(wrappeditems)
     for w_item in wrappeditems:
-        y = space.int_w(space.hash(w_item))
+        y = space.hash_w(w_item)
         x = (x ^ y) * mult
         z -= 1
         mult += 82520 + z + z
@@ -172,8 +172,8 @@
     return space.wrap(count)
 
 def tuple_index__Tuple_ANY_ANY_ANY(space, w_tuple, w_obj, w_start, w_stop):
-    start = slicetype._Eval_SliceIndex(space, w_start)
-    stop = slicetype._Eval_SliceIndex(space, w_stop)
+    start = slicetype.eval_slice_index(space, w_start)
+    stop = slicetype.eval_slice_index(space, w_stop)
     length = len(w_tuple.wrappeditems)
     if start < 0:
         start += length
diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py
--- a/pypy/objspace/std/typeobject.py
+++ b/pypy/objspace/std/typeobject.py
@@ -173,8 +173,6 @@
             # ^^^ conservative default, fixed during real usage
 
         if space.config.objspace.std.withidentitydict:
-            did_compare_by_identity = (
-                w_self.compares_by_identity_status == COMPARES_BY_IDENTITY)
             if (key is None or key == '__eq__' or
                 key == '__cmp__' or key == '__hash__'):
                 w_self.compares_by_identity_status = UNKNOWN
@@ -229,7 +227,7 @@
         return w_self.getattribute_if_not_from_object() is None
 
     def compares_by_identity(w_self):
-        from pypy.objspace.descroperation import object_hash
+        from pypy.objspace.descroperation import object_hash, type_eq
         if not w_self.space.config.objspace.std.withidentitydict:
             return False # conservative
         #
@@ -238,7 +236,9 @@
             return w_self.compares_by_identity_status == COMPARES_BY_IDENTITY
         #
         default_hash = object_hash(w_self.space)
-        overrides_eq_cmp_or_hash = (w_self.lookup('__eq__') or
+        my_eq = w_self.lookup('__eq__')
+        overrides_eq = (my_eq and my_eq is not type_eq(w_self.space))
+        overrides_eq_cmp_or_hash = (overrides_eq or
                                     w_self.lookup('__cmp__') or
                                     w_self.lookup('__hash__') is not default_hash)
         if overrides_eq_cmp_or_hash:
diff --git a/pypy/rlib/jit.py b/pypy/rlib/jit.py
--- a/pypy/rlib/jit.py
+++ b/pypy/rlib/jit.py
@@ -18,7 +18,8 @@
 
     Most importantly it doesn't mean that an elidable function has no observable
     side effect, but those side effects are idempotent (ie caching).
-    For now, such a function should never raise an exception.
+    If a particular call to this function ends up raising an exception, then it
+    is handled like a normal function call (this decorator is ignored).
     """
     func._elidable_function_ = True
     return func
diff --git a/pypy/rlib/libffi.py b/pypy/rlib/libffi.py
--- a/pypy/rlib/libffi.py
+++ b/pypy/rlib/libffi.py
@@ -2,14 +2,13 @@
 
 from pypy.rpython.lltypesystem import rffi, lltype
 from pypy.rlib.objectmodel import specialize, enforceargs, we_are_translated
-from pypy.rlib.rarithmetic import intmask, r_uint, r_singlefloat
+from pypy.rlib.rarithmetic import intmask, r_uint, r_singlefloat, r_longlong
 from pypy.rlib import jit
 from pypy.rlib import clibffi
 from pypy.rlib.clibffi import get_libc_name, FUNCFLAG_CDECL, AbstractFuncPtr, \
     push_arg_as_ffiptr, c_ffi_call, FFI_TYPE_STRUCT
 from pypy.rlib.rdynload import dlopen, dlclose, dlsym, dlsym_byordinal
 from pypy.rlib.rdynload import DLLHANDLE
-from pypy.rlib.longlong2float import longlong2float, float2longlong
 
 class types(object):
     """
@@ -122,9 +121,10 @@
         elif TYPE is rffi.DOUBLE:
             cls = FloatArg
         elif TYPE is rffi.LONGLONG or TYPE is rffi.ULONGLONG:
-            raise TypeError, 'r_(u)longlong not supported by arg(), use arg_(u)longlong()'
+            cls = LongLongArg
+            val = rffi.cast(rffi.LONGLONG, val)
         elif TYPE is rffi.FLOAT:
-            raise TypeError, 'r_singlefloat not supported by arg(), use arg_singlefloat()'
+            cls = SingleFloatArg
         else:
             raise TypeError, 'Unsupported argument type: %s' % TYPE
         self._append(cls(val))
@@ -133,25 +133,6 @@
     def arg_raw(self, val):
         self._append(RawArg(val))
 
-    def arg_longlong(self, val):
-        """
-        Note: this is a hack. So far, the JIT does not support long longs, so
-        you must pass it as if it were a python Float (rffi.DOUBLE).  You can
-        use the convenience functions longlong2float and float2longlong to do
-        the conversions.  Note that if you use long longs, the call won't
-        be jitted at all.
-        """
-        assert IS_32_BIT      # use a normal integer on 64-bit platforms
-        self._append(LongLongArg(val))
-
-    def arg_singlefloat(self, val):
-        """
-        Note: you must pass a python Float (rffi.DOUBLE), not a r_singlefloat
-        (else the jit complains).  Note that if you use single floats, the
-        call won't be jitted at all.
-        """
-        self._append(SingleFloatArg(val))
-
     def _append(self, arg):
         if self.first is None:
             self.first = self.last = arg
@@ -196,25 +177,25 @@
         func._push_raw(self.ptrval, ll_args, i)
 
 class SingleFloatArg(AbstractArg):
-    """ An argument representing a C float (but holding a C double)
+    """ An argument representing a C float
     """
 
-    def __init__(self, floatval):
-        self.floatval = floatval
+    def __init__(self, singlefloatval):
+        self.singlefloatval = singlefloatval
 
     def push(self, func, ll_args, i):
-        func._push_single_float(self.floatval, ll_args, i)
+        func._push_singlefloat(self.singlefloatval, ll_args, i)
 
 
 class LongLongArg(AbstractArg):
-    """ An argument representing a C long long (but holding a C double)
+    """ An argument representing a C long long
     """
 
-    def __init__(self, floatval):
-        self.floatval = floatval
+    def __init__(self, longlongval):
+        self.longlongval = longlongval
 
     def push(self, func, ll_args, i):
-        func._push_longlong(self.floatval, ll_args, i)
+        func._push_longlong(self.longlongval, ll_args, i)
 
 
 # ======================================================================
@@ -274,15 +255,10 @@
         elif RESULT is rffi.DOUBLE:
             return self._do_call_float(self.funcsym, ll_args)
         elif RESULT is rffi.FLOAT:
-            # XXX: even if RESULT is FLOAT, we still return a DOUBLE, else the
-            # jit complains. Note that the jit is disabled in this case
-            return self._do_call_single_float(self.funcsym, ll_args)
+            return self._do_call_singlefloat(self.funcsym, ll_args)
         elif RESULT is rffi.LONGLONG or RESULT is rffi.ULONGLONG:
-            # XXX: even if RESULT is LONGLONG, we still return a DOUBLE, else the
-            # jit complains. Note that the jit is disabled in this case
-            # (it's not a typo, we really return a DOUBLE)
             assert IS_32_BIT
-            return self._do_call_longlong(self.funcsym, ll_args)
+            res = self._do_call_longlong(self.funcsym, ll_args)
         elif RESULT is lltype.Void:
             return self._do_call_void(self.funcsym, ll_args)
         else:
@@ -320,16 +296,15 @@
     def _push_float(self, value, ll_args, i):
         self._push_arg(value, ll_args, i)
 
-    @jit.dont_look_inside
-    def _push_single_float(self, value, ll_args, i):
-        self._push_arg(r_singlefloat(value), ll_args, i)
+    @jit.oopspec('libffi_push_singlefloat(self, value, ll_args, i)')
+    @enforceargs(None, r_singlefloat, None, int) # fix the annotation for tests
+    def _push_singlefloat(self, value, ll_args, i):
+        self._push_arg(value, ll_args, i)
 
-    @jit.dont_look_inside
-    def _push_longlong(self, floatval, ll_args, i):
-        """
-        Takes a longlong represented as a python Float. It's a hack for the
-        jit, else we could not see the whole libffi module at all"""  
-        self._push_arg(float2longlong(floatval), ll_args, i)
+    @jit.oopspec('libffi_push_longlong(self, value, ll_args, i)')
+    @enforceargs(None, r_longlong, None, int) # fix the annotation for tests
+    def _push_longlong(self, value, ll_args, i):
+        self._push_arg(value, ll_args, i)
 
     @jit.oopspec('libffi_call_int(self, funcsym, ll_args)')
     def _do_call_int(self, funcsym, ll_args):
@@ -339,20 +314,18 @@
     def _do_call_float(self, funcsym, ll_args):
         return self._do_call(funcsym, ll_args, rffi.DOUBLE)
 
-    @jit.dont_look_inside
-    def _do_call_single_float(self, funcsym, ll_args):
-        single_res = self._do_call(funcsym, ll_args, rffi.FLOAT)
-        return float(single_res)
+    @jit.oopspec('libffi_call_singlefloat(self, funcsym, ll_args)')
+    def _do_call_singlefloat(self, funcsym, ll_args):
+        return self._do_call(funcsym, ll_args, rffi.FLOAT)
 
     @jit.dont_look_inside
     def _do_call_raw(self, funcsym, ll_args):
         # same as _do_call_int, but marked as jit.dont_look_inside
         return self._do_call(funcsym, ll_args, rffi.LONG)
 
-    @jit.dont_look_inside
+    @jit.oopspec('libffi_call_longlong(self, funcsym, ll_args)')
     def _do_call_longlong(self, funcsym, ll_args):
-        llres = self._do_call(funcsym, ll_args, rffi.LONGLONG)
-        return longlong2float(llres)
+        return self._do_call(funcsym, ll_args, rffi.LONGLONG)
 
     @jit.oopspec('libffi_call_void(self, funcsym, ll_args)')
     def _do_call_void(self, funcsym, ll_args):
diff --git a/pypy/rlib/longlong2float.py b/pypy/rlib/longlong2float.py
--- a/pypy/rlib/longlong2float.py
+++ b/pypy/rlib/longlong2float.py
@@ -11,6 +11,8 @@
 # -------- implement longlong2float and float2longlong --------
 DOUBLE_ARRAY_PTR = lltype.Ptr(lltype.Array(rffi.DOUBLE))
 LONGLONG_ARRAY_PTR = lltype.Ptr(lltype.Array(rffi.LONGLONG))
+UINT_ARRAY_PTR = lltype.Ptr(lltype.Array(rffi.UINT))
+FLOAT_ARRAY_PTR = lltype.Ptr(lltype.Array(rffi.FLOAT))
 
 # these definitions are used only in tests, when not translated
 def longlong2float_emulator(llval):
@@ -29,6 +31,22 @@
     lltype.free(d_array, flavor='raw')
     return llval
 
+def uint2singlefloat_emulator(ival):   # untranslated stand-in: store a UINT, read it back as a FLOAT
+    f_array = lltype.malloc(FLOAT_ARRAY_PTR.TO, 1, flavor='raw')   # one-element raw FLOAT array
+    i_array = rffi.cast(UINT_ARRAY_PTR, f_array)   # same allocation, viewed as a UINT array
+    i_array[0] = ival
+    singlefloatval = f_array[0]    # read the slot back through the FLOAT view
+    lltype.free(f_array, flavor='raw')     # release the raw allocation before returning
+    return singlefloatval
+
+def singlefloat2uint_emulator(singlefloatval):     # untranslated stand-in: store a FLOAT, read it back as a UINT
+    f_array = lltype.malloc(FLOAT_ARRAY_PTR.TO, 1, flavor='raw')   # one-element raw FLOAT array
+    i_array = rffi.cast(UINT_ARRAY_PTR, f_array)   # same allocation, viewed as a UINT array
+    f_array[0] = singlefloatval
+    ival = i_array[0]      # read the slot back through the UINT view
+    lltype.free(f_array, flavor='raw')     # release the raw allocation before returning
+    return ival
+
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
 eci = ExternalCompilationInfo(includes=['string.h', 'assert.h'],
                               post_include_bits=["""
@@ -44,6 +62,18 @@
     memcpy(&ll, &x, 8);
     return ll;
 }
+static float pypy__uint2singlefloat(unsigned int x) {
+    float ff;
+    assert(sizeof(float) == 4 && sizeof(unsigned int) == 4);
+    memcpy(&ff, &x, 4);
+    return ff;
+}
+static unsigned int pypy__singlefloat2uint(float x) {
+    unsigned int ii;
+    assert(sizeof(float) == 4 && sizeof(unsigned int) == 4);
+    memcpy(&ii, &x, 4);
+    return ii;
+}
 """])
 
 longlong2float = rffi.llexternal(
@@ -55,3 +85,13 @@
     "pypy__float2longlong", [rffi.DOUBLE], rffi.LONGLONG,
     _callable=float2longlong_emulator, compilation_info=eci,
     _nowrapper=True, elidable_function=True)
+
+uint2singlefloat = rffi.llexternal(
+    "pypy__uint2singlefloat", [rffi.UINT], rffi.FLOAT,
+    _callable=uint2singlefloat_emulator, compilation_info=eci,
+    _nowrapper=True, elidable_function=True)
+
+singlefloat2uint = rffi.llexternal(
+    "pypy__singlefloat2uint", [rffi.FLOAT], rffi.UINT,
+    _callable=singlefloat2uint_emulator, compilation_info=eci,
+    _nowrapper=True, elidable_function=True)
diff --git a/pypy/rlib/rarithmetic.py b/pypy/rlib/rarithmetic.py
--- a/pypy/rlib/rarithmetic.py
+++ b/pypy/rlib/rarithmetic.py
@@ -71,9 +71,8 @@
     return int(n)
 
 def longlongmask(n):
-    if isinstance(n, int):
-        n = long(n)
-    assert isinstance(n, long)
+    assert isinstance(n, (int, long))
+    n = long(n)
     n &= LONGLONG_MASK
     if n >= LONGLONG_TEST:
         n -= 2*LONGLONG_TEST
diff --git a/pypy/rlib/rstring.py b/pypy/rlib/rstring.py
--- a/pypy/rlib/rstring.py
+++ b/pypy/rlib/rstring.py
@@ -1,8 +1,8 @@
 """ String builder interface and string functions
 """
 
-from pypy.annotation.model import SomeObject, SomeString, s_None,\
-     SomeChar, SomeInteger, SomeUnicodeCodePoint, SomeUnicodeString
+from pypy.annotation.model import (SomeObject, SomeString, s_None, SomeChar,
+    SomeInteger, SomeUnicodeCodePoint, SomeUnicodeString, SomePtr)
 from pypy.rpython.extregistry import ExtRegistryEntry
 
 
@@ -65,6 +65,12 @@
         assert isinstance(c, self.tp)
         self.l.append(c * times)
 
+    def append_charpsize(self, s, size):   # append s[0] .. s[size-1] to the builder as one piece
+        l = []
+        for i in xrange(size):     # index item by item; only s[i] is used, never len(s)
+            l.append(s[i])
+        self.l.append(self.tp("").join(l))     # join with an empty self.tp separator, as build() does
+
     def build(self):
         return self.tp("").join(self.l)
 
@@ -100,6 +106,11 @@
         assert isinstance(s_times, SomeInteger)
         return s_None
 
+    def method_append_charpsize(self, s_ptr, s_size):  # presumably the annotator hook for append_charpsize -- class header not visible here
+        assert isinstance(s_ptr, SomePtr)          # first argument must be annotated as a pointer
+        assert isinstance(s_size, SomeInteger)     # second argument must be annotated as an integer
+        return s_None      # same result annotation as the method above
+
     def method_getlength(self):
         return SomeInteger(nonneg=True)
 
@@ -127,6 +138,11 @@
         assert isinstance(s_times, SomeInteger)
         return s_None
 
+    def method_append_charpsize(self, s_ptr, s_size):  # presumably the annotator hook for append_charpsize -- class header not visible here
+        assert isinstance(s_ptr, SomePtr)          # first argument must be annotated as a pointer
+        assert isinstance(s_size, SomeInteger)     # second argument must be annotated as an integer
+        return s_None      # same result annotation as the method above
+
     def method_getlength(self):
         return SomeInteger(nonneg=True)
 
diff --git a/pypy/rlib/rzlib.py b/pypy/rlib/rzlib.py
--- a/pypy/rlib/rzlib.py
+++ b/pypy/rlib/rzlib.py
@@ -1,8 +1,11 @@
 import sys
+
+from pypy.rlib.rstring import StringBuilder
 from pypy.rpython.lltypesystem import rffi, lltype
 from pypy.rpython.tool import rffi_platform
+from pypy.translator.platform import platform as compiler, CompilationError
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
-from pypy.translator.platform import platform as compiler, CompilationError
+
 
 if compiler.name == "msvc":
     libname = 'zlib'
@@ -337,23 +340,18 @@
     """Common code for compress() and decompress().
     """
     # Prepare the input buffer for the stream
-    inbuf = lltype.malloc(rffi.CCHARP.TO, len(data), flavor='raw')
-    try:
+    with lltype.scoped_alloc(rffi.CCHARP.TO, len(data)) as inbuf:
         for i in xrange(len(data)):
             inbuf[i] = data[i]
         stream.c_next_in = rffi.cast(Bytefp, inbuf)
         rffi.setintfield(stream, 'c_avail_in', len(data))
 
         # Prepare the output buffer
-        outbuf = lltype.malloc(rffi.CCHARP.TO, OUTPUT_BUFFER_SIZE,
-                               flavor='raw')
-        try:
-            # Strategy: we call deflate() to get as much output data as
-            # fits in the buffer, then accumulate all output into a list
-            # of characters 'result'.  We don't need to gradually
-            # increase the output buffer size because there is no
-            # quadratic factor.
-            result = []
+        with lltype.scoped_alloc(rffi.CCHARP.TO, OUTPUT_BUFFER_SIZE) as outbuf:
+            # Strategy: we call deflate() to get as much output data as fits in
+            # the buffer, then accumulate all output into a StringBuilder
+            # 'result'.
+            result = StringBuilder()
 
             while True:
                 stream.c_next_out = rffi.cast(Bytefp, outbuf)
@@ -369,8 +367,7 @@
                 if err == Z_OK or err == Z_STREAM_END:
                     # accumulate data into 'result'
                     avail_out = rffi.cast(lltype.Signed, stream.c_avail_out)
-                    for i in xrange(bufsize - avail_out):
-                        result.append(outbuf[i])
+                    result.append_charpsize(outbuf, bufsize - avail_out)
                     # if the output buffer is full, there might be more data
                     # so we need to try again.  Otherwise, we're done.
                     if avail_out > 0:
@@ -393,14 +390,9 @@
                 # fallback case: report this error
                 raise RZlibError.fromstream(stream, err, while_doing)
 
-        finally:
-            lltype.free(outbuf, flavor='raw')
-    finally:
-        lltype.free(inbuf, flavor='raw')
-
     # When decompressing, if the compressed stream of data was truncated,
     # then the zlib simply returns Z_OK and waits for more.  If it is
     # complete it returns Z_STREAM_END.
-    return (''.join(result),
+    return (result.build(),
             err,
             rffi.cast(lltype.Signed, stream.c_avail_in))
diff --git a/pypy/rlib/streamio.py b/pypy/rlib/streamio.py
--- a/pypy/rlib/streamio.py
+++ b/pypy/rlib/streamio.py
@@ -894,13 +894,10 @@
             self.buf.append(data)
             self.buflen += datalen
         elif buflen:
-            i = self.bufsize - buflen
-            assert i >= 0
-            self.buf.append(data[:i])
+            self.buf.append(data)
             self.do_write(''.join(self.buf))
             self.buf = []
             self.buflen = 0
-            self.write(data[i:])
         else:
             self.do_write(data)
 
diff --git a/pypy/rlib/test/test_libffi.py b/pypy/rlib/test/test_libffi.py
--- a/pypy/rlib/test/test_libffi.py
+++ b/pypy/rlib/test/test_libffi.py
@@ -5,7 +5,7 @@
 from pypy.rlib.rarithmetic import r_singlefloat, r_longlong, r_ulonglong
 from pypy.rlib.test.test_clibffi import BaseFfiTest, get_libm_name, make_struct_ffitype_e
 from pypy.rlib.libffi import CDLL, Func, get_libc_name, ArgChain, types
-from pypy.rlib.libffi import longlong2float, float2longlong, IS_32_BIT
+from pypy.rlib.libffi import IS_32_BIT
 
 class TestLibffiMisc(BaseFfiTest):
 
@@ -52,19 +52,6 @@
         del lib
         assert not ALLOCATED
 
-    def test_longlong_as_float(self):
-        from pypy.translator.c.test.test_genc import compile
-        maxint64 = r_longlong(9223372036854775807)
-        def fn(x):
-            d = longlong2float(x)
-            ll = float2longlong(d)
-            return ll
-        assert fn(maxint64) == maxint64
-        #
-        fn2 = compile(fn, [r_longlong])
-        res = fn2(maxint64)
-        assert res == maxint64
-
 class TestLibffiCall(BaseFfiTest):
     """
     Test various kind of calls through libffi.
@@ -111,7 +98,7 @@
     def get_libfoo(self):
         return self.CDLL(self.libfoo_name)
 
-    def call(self, funcspec, args, RESULT, init_result=0, is_struct=False):
+    def call(self, funcspec, args, RESULT, is_struct=False, jitif=[]):
         """
         Call the specified function after constructing and ArgChain with the
         arguments in ``args``.
@@ -128,14 +115,7 @@
         func = lib.getpointer(name, argtypes, restype)
         chain = ArgChain()
         for arg in args:
-            if isinstance(arg, r_singlefloat):
-                chain.arg_singlefloat(float(arg))
-            elif IS_32_BIT and isinstance(arg, r_longlong):
-                chain.arg_longlong(longlong2float(arg))
-            elif IS_32_BIT and isinstance(arg, r_ulonglong):
-                arg = rffi.cast(rffi.LONGLONG, arg)
-                chain.arg_longlong(longlong2float(arg))
-            elif isinstance(arg, tuple):
+            if isinstance(arg, tuple):
                 methname, arg = arg
                 meth = getattr(chain, methname)
                 meth(arg)
@@ -143,13 +123,19 @@
                 chain.arg(arg)
         return func.call(chain, RESULT, is_struct=is_struct)
 
-    def check_loops(self, *args, **kwds):
+    # ------------------------------------------------------------------------
+
+    def test_very_simple(self):
         """
-        Ignored here, but does something in the JIT tests
+            int diff_xy(int x, long y)
+            {
+                return x - y;
+            }
         """
-        pass
-
-    # ------------------------------------------------------------------------
+        libfoo = self.get_libfoo() 
+        func = (libfoo, 'diff_xy', [types.sint, types.slong], types.sint)
+        res = self.call(func, [50, 8], lltype.Signed)
+        assert res == 42
 
     def test_simple(self):
         """
@@ -160,23 +146,14 @@
         """
         libfoo = self.get_libfoo() 
         func = (libfoo, 'sum_xy', [types.sint, types.double], types.sint)
-        res = self.call(func, [38, 4.2], rffi.LONG)
+        res = self.call(func, [38, 4.2], lltype.Signed, jitif=["floats"])
         assert res == 42
-        self.check_loops({
-                'call_release_gil': 1,
-                'guard_no_exception': 1,
-                'guard_not_forced': 1,
-                'int_add': 1,
-                'int_lt': 1,
-                'guard_true': 1,
-                'jump': 1})
 
     def test_float_result(self):
         libm = self.get_libm()
         func = (libm, 'pow', [types.double, types.double], types.double)
-        res = self.call(func, [2.0, 3.0], rffi.DOUBLE, init_result=0.0)
+        res = self.call(func, [2.0, 3.0], rffi.DOUBLE, jitif=["floats"])
         assert res == 8.0
-        self.check_loops(call_release_gil=1, guard_no_exception=1, guard_not_forced=1)
 
     def test_cast_result(self):
         """
@@ -189,7 +166,6 @@
         func = (libfoo, 'cast_to_uchar_and_ovf', [types.sint], types.uchar)
         res = self.call(func, [0], rffi.UCHAR)
         assert res == 200
-        self.check_loops(call_release_gil=1, guard_no_exception=1, guard_not_forced=1)
 
     def test_cast_argument(self):
         """
@@ -271,8 +247,7 @@
         libfoo = self.get_libfoo()
         func = (libfoo, 'get_pointer_to_b', [], types.pointer)
         LONGP = lltype.Ptr(rffi.CArray(rffi.LONG))
-        null = lltype.nullptr(LONGP.TO)
-        res = self.call(func, [], LONGP, init_result=null)
+        res = self.call(func, [], LONGP)
         assert res[0] == 20
 
     def test_void_result(self):
@@ -287,7 +262,7 @@
         #
         initval = self.call(get_dummy, [], rffi.LONG)
         #
-        res = self.call(set_dummy, [initval+1], lltype.Void, init_result=None)
+        res = self.call(set_dummy, [initval+1], lltype.Void)
         assert res is None
         #
         res = self.call(get_dummy, [], rffi.LONG)
@@ -305,9 +280,9 @@
         func = (libfoo, 'sum_xy_float', [types.float, types.float], types.float)
         x = r_singlefloat(12.34)
         y = r_singlefloat(56.78)
-        res = self.call(func, [x, y], rffi.FLOAT, init_result=0.0)
+        res = self.call(func, [x, y], rffi.FLOAT, jitif=["singlefloats"])
         expected = c_float(c_float(12.34).value + c_float(56.78).value).value
-        assert res == expected
+        assert float(res) == expected
 
     def test_slonglong_args(self):
         """
@@ -325,16 +300,10 @@
         if IS_32_BIT:
             x = r_longlong(maxint32+1)
             y = r_longlong(maxint32+2)
-            zero = longlong2float(r_longlong(0))
         else:
             x = maxint32+1
             y = maxint32+2
-            zero = 0
-        res = self.call(func, [x, y], rffi.LONGLONG, init_result=zero)
-        if IS_32_BIT:
-            # obscure, on 32bit it's really a long long, so it returns a
-            # DOUBLE because of the JIT hack
-            res = float2longlong(res)
+        res = self.call(func, [x, y], rffi.LONGLONG, jitif=["longlong"])
         expected = maxint32*2 + 3
         assert res == expected
 
@@ -354,12 +323,7 @@
                 types.ulonglong)
         x = r_ulonglong(maxint64+1)
         y = r_ulonglong(2)
-        res = self.call(func, [x, y], rffi.ULONGLONG, init_result=0)
-        if IS_32_BIT:
-            # obscure, on 32bit it's really a long long, so it returns a
-            # DOUBLE because of the JIT hack
-            res = float2longlong(res)
-            res = rffi.cast(rffi.ULONGLONG, res)
+        res = self.call(func, [x, y], rffi.ULONGLONG, jitif=["longlong"])
         expected = maxint64 + 3
         assert res == expected
 
@@ -406,7 +370,8 @@
         buf[0] = 30
         buf[1] = 12
         adr = rffi.cast(rffi.VOIDP, buf)
-        res = self.call(sum_point, [('arg_raw', adr)], rffi.LONG, init_result=0)
+        res = self.call(sum_point, [('arg_raw', adr)], rffi.LONG,
+                        jitif=["byval"])
         assert res == 42
         # check that we still have the ownership on the buffer
         assert buf[0] == 30
@@ -431,8 +396,8 @@
         make_point = (libfoo, 'make_point', [types.slong, types.slong], ffi_point)
         #
         PTR = lltype.Ptr(rffi.CArray(rffi.LONG))
-        p = self.call(make_point, [12, 34], PTR, init_result=lltype.nullptr(PTR.TO),
-                      is_struct=True)
+        p = self.call(make_point, [12, 34], PTR, is_struct=True,
+                      jitif=["byval"])
         assert p[0] == 12
         assert p[1] == 34
         lltype.free(p, flavor='raw')
diff --git a/pypy/rlib/test/test_longlong2float.py b/pypy/rlib/test/test_longlong2float.py
--- a/pypy/rlib/test/test_longlong2float.py
+++ b/pypy/rlib/test/test_longlong2float.py
@@ -1,5 +1,7 @@
 from pypy.translator.c.test.test_genc import compile
 from pypy.rlib.longlong2float import longlong2float, float2longlong
+from pypy.rlib.longlong2float import uint2singlefloat, singlefloat2uint
+from pypy.rlib.rarithmetic import r_singlefloat
 
 
 def fn(f1):
@@ -28,3 +30,23 @@
     for x in enum_floats():
         res = fn2(x)
         assert repr(res) == repr(x)
+
+# ____________________________________________________________
+
+def fnsingle(f1):      # round-trip: float -> r_singlefloat -> uint -> singlefloat -> float
+    sf1 = r_singlefloat(f1)
+    ii = singlefloat2uint(sf1)
+    sf2 = uint2singlefloat(ii)     # inverse conversion of the line above
+    f2 = float(sf2)    # back to a plain float so callers can compare it
+    return f2
+
+def test_int_as_singlefloat():     # runs fnsingle directly, without compiling it
+    for x in enum_floats():
+        res = fnsingle(x)
+        assert repr(res) == repr(float(r_singlefloat(x)))  # expected: x after one r_singlefloat conversion, compared via repr()
+
+def test_compiled_single():    # same check as test_int_as_singlefloat, on the compiled function
+    fn2 = compile(fnsingle, [float])   # compile() is imported from test_genc at the top of this file
+    for x in enum_floats():
+        res = fn2(x)
+        assert repr(res) == repr(float(r_singlefloat(x)))  # expected: x after one r_singlefloat conversion, compared via repr()
diff --git a/pypy/rpython/lltypesystem/ll2ctypes.py b/pypy/rpython/lltypesystem/ll2ctypes.py
--- a/pypy/rpython/lltypesystem/ll2ctypes.py
+++ b/pypy/rpython/lltypesystem/ll2ctypes.py
@@ -27,7 +27,11 @@
 from pypy.rpython import raddress
 from pypy.translator.platform import platform
 from array import array
-from thread import _local as tlsobject
+try:
+    from thread import _local as tlsobject
+except ImportError:
+    class tlsobject(object):
+        pass
 
 # ____________________________________________________________
 
@@ -688,6 +692,8 @@
                     res = ctypes.cast(res, ctypes.c_void_p).value
                     if res is None:
                         return 0
+                if T.TO.RESULT == lltype.SingleFloat:
+                    res = res.value     # baaaah, cannot return a c_float()
                 return res
 
             def callback(*cargs):
diff --git a/pypy/rpython/lltypesystem/lltype.py b/pypy/rpython/lltypesystem/lltype.py
--- a/pypy/rpython/lltypesystem/lltype.py
+++ b/pypy/rpython/lltypesystem/lltype.py
@@ -1,7 +1,7 @@
 import py
 from pypy.rlib.rarithmetic import (r_int, r_uint, intmask, r_singlefloat,
                                    r_ulonglong, r_longlong, r_longfloat,
-                                   base_int, normalizedinttype)
+                                   base_int, normalizedinttype, longlongmask)
 from pypy.rlib.objectmodel import Symbolic
 from pypy.tool.uid import Hashable
 from pypy.tool.identity_dict import identity_dict
@@ -654,6 +654,9 @@
 
 _numbertypes = {int: Number("Signed", int, intmask)}
 _numbertypes[r_int] = _numbertypes[int]
+if r_longlong is not r_int:
+    _numbertypes[r_longlong] = Number("SignedLongLong", r_longlong,
+                                      longlongmask)
 
 def build_number(name, type):
     try:
@@ -1146,7 +1149,7 @@
         try:
             return self._lookup_adtmeth(field_name)
         except AttributeError:
-            raise AttributeError("%r instance has no field %r" % (self._T,
+            raise AttributeError("%r instance has no field %r" % (self._T._name,
                                                                   field_name))
 
     def __setattr__(self, field_name, val):
@@ -1936,7 +1939,7 @@
 
 
 def malloc(T, n=None, flavor='gc', immortal=False, zero=False,
-           track_allocation=True):
+           track_allocation=True, add_memory_pressure=False):
     assert flavor in ('gc', 'raw')
     if zero or immortal:
         initialization = 'example'
diff --git a/pypy/rpython/lltypesystem/rbuilder.py b/pypy/rpython/lltypesystem/rbuilder.py
--- a/pypy/rpython/lltypesystem/rbuilder.py
+++ b/pypy/rpython/lltypesystem/rbuilder.py
@@ -1,13 +1,13 @@
-
+from pypy.rlib import rgc
+from pypy.rlib.objectmodel import enforceargs
+from pypy.rlib.rarithmetic import ovfcheck
+from pypy.rpython.annlowlevel import llstr
+from pypy.rpython.rptr import PtrRepr
+from pypy.rpython.lltypesystem import lltype, rstr
+from pypy.rpython.lltypesystem.lltype import staticAdtMethod
+from pypy.rpython.lltypesystem.rstr import (STR, UNICODE, char_repr,
+    string_repr, unichar_repr, unicode_repr)
 from pypy.rpython.rbuilder import AbstractStringBuilderRepr
-from pypy.rpython.lltypesystem import lltype, rstr
-from pypy.rpython.lltypesystem.rstr import STR, UNICODE, char_repr,\
-     string_repr, unichar_repr, unicode_repr
-from pypy.rpython.annlowlevel import llstr
-from pypy.rlib import rgc
-from pypy.rlib.rarithmetic import ovfcheck
-from pypy.rlib.objectmodel import enforceargs
-from pypy.rpython.lltypesystem.lltype import staticAdtMethod
 from pypy.tool.sourcetools import func_with_new_name
 
 # Think about heuristics below, maybe we can come up with something
@@ -73,7 +73,7 @@
             ll_builder.grow(ll_builder, lgt)
         ll_str.copy_contents(ll_str, ll_builder.buf, 0, used, lgt)
         ll_builder.used = needed
-    
+
     @staticmethod
     def ll_append_char(ll_builder, char):
         if ll_builder.used == ll_builder.allocated:
@@ -102,6 +102,16 @@
         ll_builder.used = used
 
     @staticmethod
+    def ll_append_charpsize(ll_builder, charp, size):
+        used = ll_builder.used
+        if used + size > ll_builder.allocated:
+            ll_builder.grow(ll_builder, size)
+        for i in xrange(size):
+            ll_builder.buf.chars[used] = charp[i]
+            used += 1
+        ll_builder.used = used
+
+    @staticmethod
     def ll_getlength(ll_builder):
         return ll_builder.used
 
@@ -119,6 +129,9 @@
     mallocfn = staticmethod(rstr.mallocstr)
     string_repr = string_repr
     char_repr = char_repr
+    raw_ptr_repr = PtrRepr(
+        lltype.Ptr(lltype.Array(lltype.Char, hints={'nolength': True}))
+    )
 
 class UnicodeBuilderRepr(BaseStringBuilderRepr):
     lowleveltype = lltype.Ptr(UNICODEBUILDER)
@@ -126,6 +139,9 @@
     mallocfn = staticmethod(rstr.mallocunicode)
     string_repr = unicode_repr
     char_repr = unichar_repr
+    raw_ptr_repr = PtrRepr(
+        lltype.Ptr(lltype.Array(lltype.UniChar, hints={'nolength': True}))
+    )
 
 unicodebuilder_repr = UnicodeBuilderRepr()
 stringbuilder_repr = StringBuilderRepr()
diff --git a/pypy/rpython/lltypesystem/rdict.py b/pypy/rpython/lltypesystem/rdict.py
--- a/pypy/rpython/lltypesystem/rdict.py
+++ b/pypy/rpython/lltypesystem/rdict.py
@@ -29,7 +29,7 @@
 #        DICTVALUE value;
 #        int f_hash;        # (optional) key hash, if hard to recompute
 #    }
-#    
+#
 #    struct dicttable {
 #        int num_items;
 #        int num_pristine_entries;  # never used entries
@@ -50,12 +50,12 @@
         self.custom_eq_hash = custom_eq_hash is not None
         if not isinstance(key_repr, rmodel.Repr):  # not computed yet, done by setup()
             assert callable(key_repr)
-            self._key_repr_computer = key_repr 
+            self._key_repr_computer = key_repr
         else:
             self.external_key_repr, self.key_repr = self.pickkeyrepr(key_repr)
         if not isinstance(value_repr, rmodel.Repr):  # not computed yet, done by setup()
             assert callable(value_repr)
-            self._value_repr_computer = value_repr 
+            self._value_repr_computer = value_repr
         else:
             self.external_value_repr, self.value_repr = self.pickrepr(value_repr)
         self.dictkey = dictkey
@@ -176,7 +176,7 @@
             self.DICTENTRYARRAY = lltype.GcArray(self.DICTENTRY,
                                                  adtmeths=entrymeths)
             fields =          [ ("num_items", lltype.Signed),
-                                ("num_pristine_entries", lltype.Signed), 
+                                ("num_pristine_entries", lltype.Signed),
                                 ("entries", lltype.Ptr(self.DICTENTRYARRAY)) ]
             if self.custom_eq_hash:
                 self.r_rdict_eqfn, self.r_rdict_hashfn = self._custom_eq_hash_repr()
@@ -211,7 +211,7 @@
     def convert_const(self, dictobj):
         from pypy.rpython.lltypesystem import llmemory
         # get object from bound dict methods
-        #dictobj = getattr(dictobj, '__self__', dictobj) 
+        #dictobj = getattr(dictobj, '__self__', dictobj)
         if dictobj is None:
             return lltype.nullptr(self.DICT)
         if not isinstance(dictobj, (dict, objectmodel.r_dict)):
@@ -222,7 +222,7 @@
         except KeyError:
             self.setup()
             l_dict = ll_newdict_size(self.DICT, len(dictobj))
-            self.dict_cache[key] = l_dict 
+            self.dict_cache[key] = l_dict
             r_key = self.key_repr
             if r_key.lowleveltype == llmemory.Address:
                 raise TypeError("No prebuilt dicts of address keys")
@@ -274,7 +274,7 @@
         hop.exception_cannot_occur()
         v_res = hop.gendirectcall(ll_setdefault, v_dict, v_key, v_default)
         return self.recast_value(hop.llops, v_res)
-    
+
     def rtype_method_copy(self, hop):
         v_dict, = hop.inputargs(self)
         hop.exception_cannot_occur()
@@ -325,7 +325,7 @@
         hop.exception_is_here()
         return hop.gendirectcall(ll_popitem, cTUPLE, v_dict)
 
-class __extend__(pairtype(DictRepr, rmodel.Repr)): 
+class __extend__(pairtype(DictRepr, rmodel.Repr)):
 
     def rtype_getitem((r_dict, r_key), hop):
         v_dict, v_key = hop.inputargs(r_dict, r_dict.key_repr)
@@ -338,7 +338,7 @@
     def rtype_delitem((r_dict, r_key), hop):
         v_dict, v_key = hop.inputargs(r_dict, r_dict.key_repr)
         if not r_dict.custom_eq_hash:
-            hop.has_implicit_exception(KeyError)   # record that we know about it        
+            hop.has_implicit_exception(KeyError)   # record that we know about it
         hop.exception_is_here()
         return hop.gendirectcall(ll_dict_delitem, v_dict, v_key)
 
@@ -354,11 +354,11 @@
         v_dict, v_key = hop.inputargs(r_dict, r_dict.key_repr)
         hop.exception_is_here()
         return hop.gendirectcall(ll_contains, v_dict, v_key)
-        
+
 class __extend__(pairtype(DictRepr, DictRepr)):
     def convert_from_to((r_dict1, r_dict2), v, llops):
         # check that we don't convert from Dicts with
-        # different key/value types 
+        # different key/value types
         if r_dict1.dictkey is None or r_dict2.dictkey is None:
             return NotImplemented
         if r_dict1.dictkey is not r_dict2.dictkey:
@@ -430,7 +430,7 @@
     return hlinvoke(DICT.r_rdict_eqfn, d.fnkeyeq, key1, key2)
 
 def ll_dict_len(d):
-    return d.num_items 
+    return d.num_items
 
 def ll_dict_is_true(d):
     # check if a dict is True, allowing for None
@@ -491,8 +491,8 @@
     if i & HIGHEST_BIT:
         raise KeyError
     _ll_dict_del(d, i)
-ll_dict_delitem.oopspec = 'dict.delitem(d, key)'
 
+@jit.dont_look_inside
 def _ll_dict_del(d, i):
     d.entries.mark_deleted(i)
     d.num_items -= 1
@@ -510,7 +510,7 @@
 
 def ll_dict_resize(d):
     old_entries = d.entries
-    old_size = len(old_entries) 
+    old_size = len(old_entries)
     # make a 'new_size' estimate and shrink it if there are many
     # deleted entry markers
     new_size = old_size * 2
@@ -538,7 +538,7 @@
     direct_compare = not hasattr(ENTRIES, 'no_direct_compare')
     mask = len(entries) - 1
     i = hash & mask
-    # do the first try before any looping 
+    # do the first try before any looping
     if entries.valid(i):
         checkingkey = entries[i].key
         if direct_compare and checkingkey == key:
@@ -562,8 +562,8 @@
 
     # In the loop, a deleted entry (everused and not valid) is by far
     # (factor of 100s) the least likely outcome, so test for that last.
-    perturb = r_uint(hash) 
-    while 1: 
+    perturb = r_uint(hash)
+    while 1:
         # compute the next index using unsigned arithmetic
         i = r_uint(i)
         i = (i << 2) + i + perturb + 1
@@ -591,7 +591,7 @@
                 if found:
                     return i   # found the entry
         elif freeslot == -1:
-            freeslot = i 
+            freeslot = i
         perturb >>= PERTURB_SHIFT
 
 def ll_dict_lookup_clean(d, hash):
@@ -601,7 +601,7 @@
     entries = d.entries
     mask = len(entries) - 1
     i = hash & mask
-    perturb = r_uint(hash) 
+    perturb = r_uint(hash)
     while entries.everused(i):
         i = r_uint(i)
         i = (i << 2) + i + perturb + 1
@@ -687,7 +687,6 @@
     iter.dict = d
     iter.index = 0
     return iter
-ll_dictiter.oopspec = 'newdictiter(d)'
 
 def _make_ll_dictnext(kind):
     # make three versions of the following function: keys, values, items
diff --git a/pypy/rpython/lltypesystem/rffi.py b/pypy/rpython/lltypesystem/rffi.py
--- a/pypy/rpython/lltypesystem/rffi.py
+++ b/pypy/rpython/lltypesystem/rffi.py
@@ -789,8 +789,7 @@
     # char* and size -> str (which can contain null bytes)
     def charpsize2str(cp, size):
         b = builder_class(size)
-        for i in xrange(size):
-            b.append(cp[i])
+        b.append_charpsize(cp, size)
         return b.build()
     charpsize2str._annenforceargs_ = [None, int]
 
@@ -1062,3 +1061,11 @@
         keep_unicodebuffer_alive_until_here(self.raw, self.gc_buf)
     def str(self, length):
         return unicode_from_buffer(self.raw, self.gc_buf, self.size, length)
+
+# You would have to have a *huge* amount of data for this to block long enough
+# to be worth it to release the GIL.
+c_memcpy = llexternal("memcpy",
+    [VOIDP, VOIDP, SIZE_T],
+    lltype.Void,
+    threadsafe=False
+)
\ No newline at end of file
diff --git a/pypy/rpython/lltypesystem/rlist.py b/pypy/rpython/lltypesystem/rlist.py
--- a/pypy/rpython/lltypesystem/rlist.py
+++ b/pypy/rpython/lltypesystem/rlist.py
@@ -14,7 +14,6 @@
 from pypy.rpython.lltypesystem import rstr
 from pypy.rpython import robject
 from pypy.rlib.debug import ll_assert
-from pypy.rlib.rarithmetic import ovfcheck
 from pypy.rpython.lltypesystem import rffi
 from pypy.rpython.lltypesystem.lloperation import llop
 from pypy.rlib import rgc
@@ -200,12 +199,11 @@
         else:
             some = 6
         some += newsize >> 3
-        try:
-            new_allocated = ovfcheck(newsize + some)
-        except OverflowError:
-            raise MemoryError
+        new_allocated = newsize + some
     # new_allocated is a bit more than newsize, enough to ensure an amortized
-    # linear complexity for e.g. repeated usage of l.append().
+    # linear complexity for e.g. repeated usage of l.append().  In case
+    # it overflows sys.maxint, it is guaranteed negative, and the following
+    # malloc() will fail.
     items = l.items
     newitems = malloc(typeOf(l).TO.items.TO, new_allocated)
     before_len = l.length
diff --git a/pypy/rpython/lltypesystem/rstr.py b/pypy/rpython/lltypesystem/rstr.py
--- a/pypy/rpython/lltypesystem/rstr.py
+++ b/pypy/rpython/lltypesystem/rstr.py
@@ -345,6 +345,8 @@
     def ll_strconcat(s1, s2):
         len1 = len(s1.chars)
         len2 = len(s2.chars)
+        # a single '+' like this is allowed to overflow: it gets
+        # a negative result, and the gc will complain
         newstr = s1.malloc(len1 + len2)
         s1.copy_contents(s1, newstr, 0, 0, len1)
         s1.copy_contents(s2, newstr, 0, len1, len2)
@@ -412,9 +414,18 @@
         itemslen = 0
         i = 0
         while i < num_items:
-            itemslen += len(items[i].chars)
+            try:
+                itemslen = ovfcheck(itemslen + len(items[i].chars))
+            except OverflowError:
+                raise MemoryError
             i += 1
-        result = s.malloc(itemslen + s_len * (num_items - 1))
+        try:
+            seplen = ovfcheck(s_len * (num_items - 1))
+        except OverflowError:
+            raise MemoryError
+        # a single '+' at the end is allowed to overflow: it gets
+        # a negative result, and the gc will complain
+        result = s.malloc(itemslen + seplen)
         res_index = len(items[0].chars)
         s.copy_contents(items[0], result, 0, 0, res_index)
         i = 1
@@ -688,7 +699,10 @@
         itemslen = 0
         i = 0
         while i < num_items:
-            itemslen += len(items[i].chars)
+            try:
+                itemslen = ovfcheck(itemslen + len(items[i].chars))
+            except OverflowError:
+                raise MemoryError
             i += 1
         if typeOf(items).TO.OF.TO == STR:
             malloc = mallocstr
diff --git a/pypy/rpython/memory/gc/minimark.py b/pypy/rpython/memory/gc/minimark.py
--- a/pypy/rpython/memory/gc/minimark.py
+++ b/pypy/rpython/memory/gc/minimark.py
@@ -49,6 +49,7 @@
 from pypy.rpython.lltypesystem.llmemory import raw_malloc_usage
 from pypy.rpython.memory.gc.base import GCBase, MovingGCBase
 from pypy.rpython.memory.gc import minimarkpage, env
+from pypy.rpython.memory.support import mangle_hash
 from pypy.rlib.rarithmetic import ovfcheck, LONG_BIT, intmask, r_uint
 from pypy.rlib.rarithmetic import LONG_BIT_SHIFT
 from pypy.rlib.debug import ll_assert, debug_print, debug_start, debug_stop
@@ -389,6 +390,11 @@
         # initialize the threshold
         self.min_heap_size = max(self.min_heap_size, self.nursery_size *
                                               self.major_collection_threshold)
+        # the following two values are usually equal, but during raw mallocs
+        # of arrays, next_major_collection_threshold is decremented to make
+        # the next major collection arrive earlier.
+        # See translator/c/test/test_newgc, test_nongc_attached_to_gc
+        self.next_major_collection_initial = self.min_heap_size
         self.next_major_collection_threshold = self.min_heap_size
         self.set_major_threshold_from(0.0)
         debug_stop("gc-set-nursery-size")
@@ -396,7 +402,7 @@
 
     def set_major_threshold_from(self, threshold, reserving_size=0):
         # Set the next_major_collection_threshold.
-        threshold_max = (self.next_major_collection_threshold *
+        threshold_max = (self.next_major_collection_initial *
                          self.growth_rate_max)
         if threshold > threshold_max:
             threshold = threshold_max
@@ -411,6 +417,7 @@
         else:
             bounded = False
         #
+        self.next_major_collection_initial = threshold
         self.next_major_collection_threshold = threshold
         return bounded
 
@@ -510,17 +517,19 @@
         # constant-folded because self.nonlarge_max, size and itemsize
         # are all constants (the arguments are constant due to
         # inlining).
-        if not raw_malloc_usage(itemsize):
-            too_many_items = raw_malloc_usage(nonvarsize) > self.nonlarge_max
+        maxsize = self.nonlarge_max - raw_malloc_usage(nonvarsize)
+        if maxsize < 0:
+            toobig = r_uint(0)    # the nonvarsize alone is too big
+        elif raw_malloc_usage(itemsize):
+            toobig = r_uint(maxsize // raw_malloc_usage(itemsize)) + 1
         else:
-            maxlength = self.nonlarge_max - raw_malloc_usage(nonvarsize)
-            maxlength = maxlength // raw_malloc_usage(itemsize)
-            too_many_items = length > maxlength
+            toobig = r_uint(sys.maxint) + 1
 
-        if too_many_items:
+        if r_uint(length) >= r_uint(toobig):
             #
             # If the total size of the object would be larger than
-            # 'nonlarge_max', then allocate it externally.
+            # 'nonlarge_max', then allocate it externally.  We also
+            # go there if 'length' is actually negative.
             obj = self.external_malloc(typeid, length)
             #
         else:
@@ -603,13 +612,18 @@
             # this includes the case of fixed-size objects, for which we
             # should not even ask for the varsize_item_sizes().
             totalsize = nonvarsize
-        else:
+        elif length > 0:
+            # var-sized allocation with at least one item
             itemsize = self.varsize_item_sizes(typeid)
             try:
                 varsize = ovfcheck(itemsize * length)
                 totalsize = ovfcheck(nonvarsize + varsize)
             except OverflowError:
                 raise MemoryError
+        else:
+            # negative length!  This likely comes from an overflow
+            # earlier.  We will just raise MemoryError here.
+            raise MemoryError
         #
         # If somebody calls this function a lot, we must eventually
         # force a full collection.
@@ -717,9 +731,18 @@
     def set_max_heap_size(self, size):
         self.max_heap_size = float(size)
         if self.max_heap_size > 0.0:
+            if self.max_heap_size < self.next_major_collection_initial:
+                self.next_major_collection_initial = self.max_heap_size
             if self.max_heap_size < self.next_major_collection_threshold:
                 self.next_major_collection_threshold = self.max_heap_size
 
+    def raw_malloc_memory_pressure(self, sizehint):
+        self.next_major_collection_threshold -= sizehint
+        if self.next_major_collection_threshold < 0:
+            # cannot trigger a full collection now, but we can ensure
+            # that one will occur very soon
+            self.nursery_free = self.nursery_top
+
     def can_malloc_nonmovable(self):
         return True
 
@@ -1599,7 +1622,7 @@
         # Max heap size: gives an upper bound on the threshold.  If we
         # already have at least this much allocated, raise MemoryError.
         if bounded and (float(self.get_total_memory_used()) + reserving_size >=
-                        self.next_major_collection_threshold):
+                        self.next_major_collection_initial):
             #
             # First raise MemoryError, giving the program a chance to
             # quit cleanly.  It might still allocate in the nursery,
@@ -1732,7 +1755,7 @@
     # ----------
     # id() and identityhash() support
 
-    def id_or_identityhash(self, gcobj, special_case_prebuilt):
+    def id_or_identityhash(self, gcobj, is_hash):
         """Implement the common logic of id() and identityhash()
         of an object, given as a GCREF.
         """
@@ -1775,7 +1798,7 @@
                 # The answer is the address of the shadow.
                 obj = shadow
                 #
-            elif special_case_prebuilt:
+            elif is_hash:
                 if self.header(obj).tid & GCFLAG_HAS_SHADOW:
                     #
                     # For identityhash(), we need a special case for some
@@ -1784,10 +1807,14 @@
                     # after the object.  But we cannot use it for id()
                     # because the stored value might clash with a real one.
                     size = self.get_size(obj)
-                    return (obj + size).signed[0]
+                    i = (obj + size).signed[0]
+                    # Important: the returned value is not mangle_hash()ed!
+                    return i
         #
-        return llmemory.cast_adr_to_int(obj)
-
+        i = llmemory.cast_adr_to_int(obj)
+        if is_hash:
+            i = mangle_hash(i)
+        return i
 
     def id(self, gcobj):
         return self.id_or_identityhash(gcobj, False)
diff --git a/pypy/rpython/memory/gc/test/test_minimark.py b/pypy/rpython/memory/gc/test/test_minimark.py
--- a/pypy/rpython/memory/gc/test/test_minimark.py
+++ b/pypy/rpython/memory/gc/test/test_minimark.py
@@ -34,6 +34,7 @@
                     growth_rate_max=1.5)
     gc.min_heap_size = 100.0
     gc.max_heap_size = 300.0
+    gc.next_major_collection_initial = 0.0
     gc.next_major_collection_threshold = 0.0
     # first, we don't grow past min_heap_size
     for i in range(5):
diff --git a/pypy/rpython/memory/gctransform/framework.py b/pypy/rpython/memory/gctransform/framework.py
--- a/pypy/rpython/memory/gctransform/framework.py
+++ b/pypy/rpython/memory/gctransform/framework.py
@@ -386,6 +386,18 @@
         else:
             self.malloc_varsize_nonmovable_ptr = None
 
+        if getattr(GCClass, 'raw_malloc_memory_pressure', False):
+            def raw_malloc_memory_pressure(length, itemsize):
+                totalmem = length * itemsize
+                if totalmem > 0:
+                    gcdata.gc.raw_malloc_memory_pressure(totalmem)
+                #else: probably an overflow -- the following rawmalloc
+                #      will fail then
+            self.raw_malloc_memory_pressure_ptr = getfn(
+                raw_malloc_memory_pressure,
+                [annmodel.SomeInteger(), annmodel.SomeInteger()],
+                annmodel.s_None, minimal_transform = False)
+
         self.identityhash_ptr = getfn(GCClass.identityhash.im_func,
                                       [s_gc, s_gcref],
                                       annmodel.SomeInteger(),
@@ -525,7 +537,8 @@
         self.c_vtinfo_skip_offset = rmodel.inputconst(lltype.typeOf(sko), sko)
 
     def build_root_walker(self):
-        return ShadowStackRootWalker(self)
+        from pypy.rpython.memory.gctransform import shadowstack
+        return shadowstack.ShadowStackRootWalker(self)
 
     def consider_constant(self, TYPE, value):
         self.layoutbuilder.consider_constant(TYPE, value, self.gcdata.gc)
@@ -932,10 +945,10 @@
     def gct_gc_identityhash(self, hop):
         livevars = self.push_roots(hop)
         [v_ptr] = hop.spaceop.args
-        v_adr = hop.genop("cast_ptr_to_adr", [v_ptr],
-                          resulttype=llmemory.Address)
+        v_ptr = hop.genop("cast_opaque_ptr", [v_ptr],
+                          resulttype=llmemory.GCREF)
         hop.genop("direct_call",
-                  [self.identityhash_ptr, self.c_const_gc, v_adr],
+                  [self.identityhash_ptr, self.c_const_gc, v_ptr],
                   resultvar=hop.spaceop.result)
         self.pop_roots(hop, livevars)
 
@@ -1323,217 +1336,3 @@
     def need_thread_support(self, gctransformer, getfn):
         raise Exception("%s does not support threads" % (
             self.__class__.__name__,))
-
-
-class ShadowStackRootWalker(BaseRootWalker):
-    need_root_stack = True
-    collect_stacks_from_other_threads = None
-
-    def __init__(self, gctransformer):
-        BaseRootWalker.__init__(self, gctransformer)
-        self.rootstacksize = sizeofaddr * gctransformer.root_stack_depth
-        # NB. 'self' is frozen, but we can use self.gcdata to store state
-        gcdata = self.gcdata
-
-        def incr_stack(n):
-            top = gcdata.root_stack_top
-            gcdata.root_stack_top = top + n*sizeofaddr
-            return top
-        self.incr_stack = incr_stack
-
-        def decr_stack(n):
-            top = gcdata.root_stack_top - n*sizeofaddr
-            gcdata.root_stack_top = top
-            return top
-        self.decr_stack = decr_stack
-
-        self.rootstackhook = gctransformer.root_stack_jit_hook
-        if self.rootstackhook is None:
-            def collect_stack_root(callback, gc, addr):
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                return sizeofaddr
-            self.rootstackhook = collect_stack_root
-
-    def push_stack(self, addr):
-        top = self.incr_stack(1)
-        top.address[0] = addr
-
-    def pop_stack(self):
-        top = self.decr_stack(1)
-        return top.address[0]
-
-    def allocate_stack(self):
-        return llmemory.raw_malloc(self.rootstacksize)
-
-    def setup_root_walker(self):
-        stackbase = self.allocate_stack()
-        ll_assert(bool(stackbase), "could not allocate root stack")
-        self.gcdata.root_stack_top  = stackbase
-        self.gcdata.root_stack_base = stackbase
-        BaseRootWalker.setup_root_walker(self)
-
-    def walk_stack_roots(self, collect_stack_root):
-        gcdata = self.gcdata
-        gc = self.gc
-        rootstackhook = self.rootstackhook
-        addr = gcdata.root_stack_base
-        end = gcdata.root_stack_top
-        while addr != end:
-            addr += rootstackhook(collect_stack_root, gc, addr)
-        if self.collect_stacks_from_other_threads is not None:
-            self.collect_stacks_from_other_threads(collect_stack_root)
-
-    def need_thread_support(self, gctransformer, getfn):
-        from pypy.module.thread import ll_thread    # xxx fish
-        from pypy.rpython.memory.support import AddressDict
-        from pypy.rpython.memory.support import copy_without_null_values
-        gcdata = self.gcdata
-        # the interfacing between the threads and the GC is done via
-        # three completely ad-hoc operations at the moment:
-        # gc_thread_prepare, gc_thread_run, gc_thread_die.
-        # See docstrings below.
-
-        def get_aid():
-            """Return the thread identifier, cast to an (opaque) address."""
-            return llmemory.cast_int_to_adr(ll_thread.get_ident())
-
-        def thread_setup():
-            """Called once when the program starts."""
-            aid = get_aid()
-            gcdata.main_thread = aid
-            gcdata.active_thread = aid
-            gcdata.thread_stacks = AddressDict()     # {aid: root_stack_top}
-            gcdata._fresh_rootstack = llmemory.NULL
-            gcdata.dead_threads_count = 0
-
-        def thread_prepare():
-            """Called just before thread.start_new_thread().  This
-            allocates a new shadow stack to be used by the future
-            thread.  If memory runs out, this raises a MemoryError
-            (which can be handled by the caller instead of just getting
-            ignored if it was raised in the newly starting thread).
-            """
-            if not gcdata._fresh_rootstack:
-                gcdata._fresh_rootstack = self.allocate_stack()
-                if not gcdata._fresh_rootstack:
-                    raise MemoryError
-
-        def thread_run():
-            """Called whenever the current thread (re-)acquired the GIL.
-            This should ensure that the shadow stack installed in
-            gcdata.root_stack_top/root_stack_base is the one corresponding
-            to the current thread.
-            """
-            aid = get_aid()
-            if gcdata.active_thread != aid:
-                switch_shadow_stacks(aid)
-
-        def thread_die():
-            """Called just before the final GIL release done by a dying
-            thread.  After a thread_die(), no more gc operation should
-            occur in this thread.
-            """
-            aid = get_aid()
-            if aid == gcdata.main_thread:
-                return   # ignore calls to thread_die() in the main thread
-                         # (which can occur after a fork()).
-            gcdata.thread_stacks.setitem(aid, llmemory.NULL)
-            old = gcdata.root_stack_base
-            if gcdata._fresh_rootstack == llmemory.NULL:
-                gcdata._fresh_rootstack = old
-            else:
-                llmemory.raw_free(old)
-            install_new_stack(gcdata.main_thread)
-            # from time to time, rehash the dictionary to remove
-            # old NULL entries
-            gcdata.dead_threads_count += 1
-            if (gcdata.dead_threads_count & 511) == 0:
-                copy = copy_without_null_values(gcdata.thread_stacks)
-                gcdata.thread_stacks.delete()
-                gcdata.thread_stacks = copy
-
-        def switch_shadow_stacks(new_aid):
-            save_away_current_stack()
-            install_new_stack(new_aid)
-        switch_shadow_stacks._dont_inline_ = True
-
-        def save_away_current_stack():
-            old_aid = gcdata.active_thread
-            # save root_stack_base on the top of the stack
-            self.push_stack(gcdata.root_stack_base)
-            # store root_stack_top into the dictionary
-            gcdata.thread_stacks.setitem(old_aid, gcdata.root_stack_top)
-
-        def install_new_stack(new_aid):
-            # look for the new stack top
-            top = gcdata.thread_stacks.get(new_aid, llmemory.NULL)
-            if top == llmemory.NULL:
-                # first time we see this thread.  It is an error if no
-                # fresh new stack is waiting.
-                base = gcdata._fresh_rootstack
-                gcdata._fresh_rootstack = llmemory.NULL
-                ll_assert(base != llmemory.NULL, "missing gc_thread_prepare")
-                gcdata.root_stack_top = base
-                gcdata.root_stack_base = base
-            else:
-                # restore the root_stack_base from the top of the stack
-                gcdata.root_stack_top = top
-                gcdata.root_stack_base = self.pop_stack()
-            # done
-            gcdata.active_thread = new_aid
-
-        def collect_stack(aid, stacktop, callback):
-            if stacktop != llmemory.NULL and aid != gcdata.active_thread:
-                # collect all valid stacks from the dict (the entry
-                # corresponding to the current thread is not valid)
-                gc = self.gc
-                rootstackhook = self.rootstackhook
-                end = stacktop - sizeofaddr
-                addr = end.address[0]
-                while addr != end:
-                    addr += rootstackhook(callback, gc, addr)
-
-        def collect_more_stacks(callback):
-            ll_assert(get_aid() == gcdata.active_thread,
-                      "collect_more_stacks(): invalid active_thread")
-            gcdata.thread_stacks.foreach(collect_stack, callback)
-
-        def _free_if_not_current(aid, stacktop, _):
-            if stacktop != llmemory.NULL and aid != gcdata.active_thread:
-                end = stacktop - sizeofaddr
-                base = end.address[0]
-                llmemory.raw_free(base)
-
-        def thread_after_fork(result_of_fork, opaqueaddr):
-            # we don't need a thread_before_fork in this case, so
-            # opaqueaddr == NULL.  This is called after fork().
-            if result_of_fork == 0:
-                # We are in the child process.  Assumes that only the
-                # current thread survived, so frees the shadow stacks
-                # of all the other ones.
-                gcdata.thread_stacks.foreach(_free_if_not_current, None)
-                # Clears the dict (including the current thread, which
-                # was an invalid entry anyway and will be recreated by
-                # the next call to save_away_current_stack()).
-                gcdata.thread_stacks.clear()
-                # Finally, reset the stored thread IDs, in case it
-                # changed because of fork().  Also change the main
-                # thread to the current one (because there is not any
-                # other left).
-                aid = get_aid()
-                gcdata.main_thread = aid
-                gcdata.active_thread = aid
-
-        self.thread_setup = thread_setup
-        self.thread_prepare_ptr = getfn(thread_prepare, [], annmodel.s_None)
-        self.thread_run_ptr = getfn(thread_run, [], annmodel.s_None,
-                                    inline=True)
-        # no thread_start_ptr here
-        self.thread_die_ptr = getfn(thread_die, [], annmodel.s_None)
-        # no thread_before_fork_ptr here
-        self.thread_after_fork_ptr = getfn(thread_after_fork,
-                                           [annmodel.SomeInteger(),
-                                            annmodel.SomeAddress()],
-                                           annmodel.s_None)
-        self.collect_stacks_from_other_threads = collect_more_stacks
diff --git a/pypy/rpython/memory/gctransform/shadowstack.py b/pypy/rpython/memory/gctransform/shadowstack.py
new file mode 100644
--- /dev/null
+++ b/pypy/rpython/memory/gctransform/shadowstack.py
@@ -0,0 +1,219 @@
+from pypy.rpython.memory.gctransform.framework import BaseRootWalker
+from pypy.rpython.memory.gctransform.framework import sizeofaddr
+from pypy.rlib.debug import ll_assert
+from pypy.rpython.lltypesystem import llmemory
+from pypy.annotation import model as annmodel
+
+
+class ShadowStackRootWalker(BaseRootWalker):
+    need_root_stack = True
+    collect_stacks_from_other_threads = None
+
+    def __init__(self, gctransformer):
+        BaseRootWalker.__init__(self, gctransformer)
+        self.rootstacksize = sizeofaddr * gctransformer.root_stack_depth
+        # NB. 'self' is frozen, but we can use self.gcdata to store state
+        gcdata = self.gcdata
+
+        def incr_stack(n):
+            top = gcdata.root_stack_top
+            gcdata.root_stack_top = top + n*sizeofaddr
+            return top
+        self.incr_stack = incr_stack
+
+        def decr_stack(n):
+            top = gcdata.root_stack_top - n*sizeofaddr
+            gcdata.root_stack_top = top
+            return top
+        self.decr_stack = decr_stack
+
+        self.rootstackhook = gctransformer.root_stack_jit_hook
+        if self.rootstackhook is None:
+            def collect_stack_root(callback, gc, addr):
+                if gc.points_to_valid_gc_object(addr):
+                    callback(gc, addr)
+                return sizeofaddr
+            self.rootstackhook = collect_stack_root
+
+    def push_stack(self, addr):
+        top = self.incr_stack(1)
+        top.address[0] = addr
+
+    def pop_stack(self):
+        top = self.decr_stack(1)
+        return top.address[0]
+
+    def allocate_stack(self):
+        return llmemory.raw_malloc(self.rootstacksize)
+
+    def setup_root_walker(self):
+        stackbase = self.allocate_stack()
+        ll_assert(bool(stackbase), "could not allocate root stack")
+        self.gcdata.root_stack_top  = stackbase
+        self.gcdata.root_stack_base = stackbase
+        BaseRootWalker.setup_root_walker(self)
+
+    def walk_stack_roots(self, collect_stack_root):
+        gcdata = self.gcdata
+        gc = self.gc
+        rootstackhook = self.rootstackhook
+        addr = gcdata.root_stack_base
+        end = gcdata.root_stack_top
+        while addr != end:
+            addr += rootstackhook(collect_stack_root, gc, addr)
+        if self.collect_stacks_from_other_threads is not None:
+            self.collect_stacks_from_other_threads(collect_stack_root)
+
+    def need_thread_support(self, gctransformer, getfn):
+        from pypy.module.thread import ll_thread    # xxx fish
+        from pypy.rpython.memory.support import AddressDict
+        from pypy.rpython.memory.support import copy_without_null_values
+        gcdata = self.gcdata
+        # the interfacing between the threads and the GC is done via
+        # three completely ad-hoc operations at the moment:
+        # gc_thread_prepare, gc_thread_run, gc_thread_die.
+        # See docstrings below.
+
+        def get_aid():
+            """Return the thread identifier, cast to an (opaque) address."""
+            return llmemory.cast_int_to_adr(ll_thread.get_ident())
+
+        def thread_setup():
+            """Called once when the program starts."""
+            aid = get_aid()
+            gcdata.main_thread = aid
+            gcdata.active_thread = aid
+            gcdata.thread_stacks = AddressDict()     # {aid: root_stack_top}
+            gcdata._fresh_rootstack = llmemory.NULL
+            gcdata.dead_threads_count = 0
+
+        def thread_prepare():
+            """Called just before thread.start_new_thread().  This
+            allocates a new shadow stack to be used by the future
+            thread.  If memory runs out, this raises a MemoryError
+            (which can be handled by the caller instead of just getting
+            ignored if it was raised in the newly starting thread).
+            """
+            if not gcdata._fresh_rootstack:
+                gcdata._fresh_rootstack = self.allocate_stack()
+                if not gcdata._fresh_rootstack:
+                    raise MemoryError
+
+        def thread_run():
+            """Called whenever the current thread (re-)acquired the GIL.
+            This should ensure that the shadow stack installed in
+            gcdata.root_stack_top/root_stack_base is the one corresponding
+            to the current thread.
+            """
+            aid = get_aid()
+            if gcdata.active_thread != aid:
+                switch_shadow_stacks(aid)
+
+        def thread_die():
+            """Called just before the final GIL release done by a dying
+            thread.  After a thread_die(), no more gc operation should
+            occur in this thread.
+            """
+            aid = get_aid()
+            if aid == gcdata.main_thread:
+                return   # ignore calls to thread_die() in the main thread
+                         # (which can occur after a fork()).
+            gcdata.thread_stacks.setitem(aid, llmemory.NULL)
+            old = gcdata.root_stack_base
+            if gcdata._fresh_rootstack == llmemory.NULL:
+                gcdata._fresh_rootstack = old
+            else:
+                llmemory.raw_free(old)
+            install_new_stack(gcdata.main_thread)
+            # from time to time, rehash the dictionary to remove
+            # old NULL entries
+            gcdata.dead_threads_count += 1
+            if (gcdata.dead_threads_count & 511) == 0:
+                copy = copy_without_null_values(gcdata.thread_stacks)
+                gcdata.thread_stacks.delete()
+                gcdata.thread_stacks = copy
+
+        def switch_shadow_stacks(new_aid):
+            save_away_current_stack()
+            install_new_stack(new_aid)
+        switch_shadow_stacks._dont_inline_ = True
+
+        def save_away_current_stack():
+            old_aid = gcdata.active_thread
+            # save root_stack_base on the top of the stack
+            self.push_stack(gcdata.root_stack_base)
+            # store root_stack_top into the dictionary
+            gcdata.thread_stacks.setitem(old_aid, gcdata.root_stack_top)
+
+        def install_new_stack(new_aid):
+            # look for the new stack top
+            top = gcdata.thread_stacks.get(new_aid, llmemory.NULL)
+            if top == llmemory.NULL:
+                # first time we see this thread.  It is an error if no
+                # fresh new stack is waiting.
+                base = gcdata._fresh_rootstack
+                gcdata._fresh_rootstack = llmemory.NULL
+                ll_assert(base != llmemory.NULL, "missing gc_thread_prepare")
+                gcdata.root_stack_top = base
+                gcdata.root_stack_base = base
+            else:
+                # restore the root_stack_base from the top of the stack
+                gcdata.root_stack_top = top
+                gcdata.root_stack_base = self.pop_stack()
+            # done
+            gcdata.active_thread = new_aid
+
+        def collect_stack(aid, stacktop, callback):
+            if stacktop != llmemory.NULL and aid != gcdata.active_thread:
+                # collect all valid stacks from the dict (the entry
+                # corresponding to the current thread is not valid)
+                gc = self.gc
+                rootstackhook = self.rootstackhook
+                end = stacktop - sizeofaddr
+                addr = end.address[0]
+                while addr != end:
+                    addr += rootstackhook(callback, gc, addr)
+
+        def collect_more_stacks(callback):
+            ll_assert(get_aid() == gcdata.active_thread,
+                      "collect_more_stacks(): invalid active_thread")
+            gcdata.thread_stacks.foreach(collect_stack, callback)
+
+        def _free_if_not_current(aid, stacktop, _):
+            if stacktop != llmemory.NULL and aid != gcdata.active_thread:
+                end = stacktop - sizeofaddr
+                base = end.address[0]
+                llmemory.raw_free(base)
+
+        def thread_after_fork(result_of_fork, opaqueaddr):
+            # we don't need a thread_before_fork in this case, so
+            # opaqueaddr == NULL.  This is called after fork().
+            if result_of_fork == 0:
+                # We are in the child process.  Assumes that only the
+                # current thread survived, so frees the shadow stacks
+                # of all the other ones.
+                gcdata.thread_stacks.foreach(_free_if_not_current, None)
+                # Clears the dict (including the current thread, which
+                # was an invalid entry anyway and will be recreated by
+                # the next call to save_away_current_stack()).
+                gcdata.thread_stacks.clear()
+                # Finally, reset the stored thread IDs, in case it
+                # changed because of fork().  Also change the main
+                # thread to the current one (because there is not any
+                # other left).
+                aid = get_aid()
+                gcdata.main_thread = aid
+                gcdata.active_thread = aid
+
+        self.thread_setup = thread_setup
+        self.thread_prepare_ptr = getfn(thread_prepare, [], annmodel.s_None)
+        self.thread_run_ptr = getfn(thread_run, [], annmodel.s_None,
+                                    inline=True)
+        # no thread_start_ptr here
+        self.thread_die_ptr = getfn(thread_die, [], annmodel.s_None)
+        # no thread_before_fork_ptr here
+        self.thread_after_fork_ptr = getfn(thread_after_fork,
+                                           [annmodel.SomeInteger(),
+                                            annmodel.SomeAddress()],
+                                           annmodel.s_None)
+        self.collect_stacks_from_other_threads = collect_more_stacks
diff --git a/pypy/rpython/memory/gctransform/transform.py b/pypy/rpython/memory/gctransform/transform.py
--- a/pypy/rpython/memory/gctransform/transform.py
+++ b/pypy/rpython/memory/gctransform/transform.py
@@ -590,6 +590,11 @@
 
     def gct_fv_raw_malloc_varsize(self, hop, flags, TYPE, v_length, c_const_size, c_item_size,
                                                                     c_offset_to_length):
+        if flags.get('add_memory_pressure', False):
+            if hasattr(self, 'raw_malloc_memory_pressure_ptr'):
+                hop.genop("direct_call",
+                          [self.raw_malloc_memory_pressure_ptr,
+                           v_length, c_item_size])
         if c_offset_to_length is None:
             if flags.get('zero'):
                 fnptr = self.raw_malloc_varsize_no_length_zero_ptr
diff --git a/pypy/rpython/memory/lldict.py b/pypy/rpython/memory/lldict.py
--- a/pypy/rpython/memory/lldict.py
+++ b/pypy/rpython/memory/lldict.py
@@ -1,6 +1,7 @@
 from pypy.rpython.lltypesystem import lltype, llmemory
 from pypy.rpython.lltypesystem import rdict
 from pypy.rlib.objectmodel import we_are_translated
+from pypy.rpython.memory.support import mangle_hash
 
 # This is a low-level AddressDict, reusing a lot of the logic from rdict.py.
 # xxx this is very dependent on the details of rdict.py
@@ -40,7 +41,8 @@
     lltype.free(entries, flavor="raw")
     if not we_are_translated(): count_alloc(-1)
 
-_hash = llmemory.cast_adr_to_int
+def _hash(adr):
+    return mangle_hash(llmemory.cast_adr_to_int(adr))
 
 def dict_keyhash(d, key):
     return _hash(key)
diff --git a/pypy/rpython/memory/support.py b/pypy/rpython/memory/support.py
--- a/pypy/rpython/memory/support.py
+++ b/pypy/rpython/memory/support.py
@@ -4,6 +4,15 @@
 from pypy.rlib.debug import ll_assert
 from pypy.tool.identity_dict import identity_dict
 
+
+def mangle_hash(i):
+    # To hash pointers in dictionaries.  Assumes that i shows some
+    # alignment (to 4, 8, maybe 16 bytes), so we use the following
+    # formula to avoid the trailing bits being always 0.
+    return i ^ (i >> 4)
+
+# ____________________________________________________________
+
 DEFAULT_CHUNK_SIZE = 1019
 
 
diff --git a/pypy/rpython/rbuilder.py b/pypy/rpython/rbuilder.py
--- a/pypy/rpython/rbuilder.py
+++ b/pypy/rpython/rbuilder.py
@@ -36,6 +36,11 @@
         hop.exception_cannot_occur()
         return hop.gendirectcall(self.ll_append_multiple_char, *vlist)
 
+    def rtype_method_append_charpsize(self, hop):
+        vlist = hop.inputargs(self, self.raw_ptr_repr, lltype.Signed)
+        hop.exception_cannot_occur()
+        return hop.gendirectcall(self.ll_append_charpsize, *vlist)
+
     def rtype_method_getlength(self, hop):
         vlist = hop.inputargs(self)
         hop.exception_cannot_occur()
diff --git a/pypy/rpython/rbuiltin.py b/pypy/rpython/rbuiltin.py
--- a/pypy/rpython/rbuiltin.py
+++ b/pypy/rpython/rbuiltin.py
@@ -345,14 +345,17 @@
 BUILTIN_TYPER[object.__init__] = rtype_object__init__
 # annotation of low-level types
 
-def rtype_malloc(hop, i_flavor=None, i_zero=None, i_track_allocation=None):
+def rtype_malloc(hop, i_flavor=None, i_zero=None, i_track_allocation=None,
+                 i_add_memory_pressure=None):
     assert hop.args_s[0].is_constant()
     vlist = [hop.inputarg(lltype.Void, arg=0)]
     opname = 'malloc'
-    v_flavor, v_zero, v_track_allocation = parse_kwds(hop,
+    v_flavor, v_zero, v_track_allocation, v_add_memory_pressure = parse_kwds(
+        hop,
         (i_flavor, lltype.Void),
         (i_zero, None),
-        (i_track_allocation, None))
+        (i_track_allocation, None),
+        (i_add_memory_pressure, None))
 
     flags = {'flavor': 'gc'}
     if v_flavor is not None:
@@ -361,8 +364,11 @@
         flags['zero'] = v_zero.value
     if i_track_allocation is not None:
         flags['track_allocation'] = v_track_allocation.value
+    if i_add_memory_pressure is not None:
+        flags['add_memory_pressure'] = v_add_memory_pressure.value
     vlist.append(hop.inputconst(lltype.Void, flags))
-        
+
+    assert 1 <= hop.nb_args <= 2
     if hop.nb_args == 2:
         vlist.append(hop.inputarg(lltype.Signed, arg=1))
         opname += '_varsize'
diff --git a/pypy/rpython/rptr.py b/pypy/rpython/rptr.py
--- a/pypy/rpython/rptr.py
+++ b/pypy/rpython/rptr.py
@@ -22,7 +22,7 @@
 class __extend__(annmodel.SomeInteriorPtr):
     def rtyper_makerepr(self, rtyper):
         return InteriorPtrRepr(self.ll_ptrtype)
- 
+
 
 class PtrRepr(Repr):
 
@@ -91,7 +91,7 @@
         vlist = hop.inputargs(*hop.args_r)
         nexpected = len(self.lowleveltype.TO.ARGS)
         nactual = len(vlist)-1
-        if nactual != nexpected: 
+        if nactual != nexpected:
             raise TyperError("argcount mismatch:  expected %d got %d" %
                             (nexpected, nactual))
         if isinstance(vlist[0], flowmodel.Constant):
@@ -111,7 +111,12 @@
         hop.swap_fst_snd_args()
         hop.r_s_popfirstarg()
         return self.rtype_simple_call(hop)
-        
+
+class __extend__(pairtype(PtrRepr, PtrRepr)):
+    def convert_from_to((r_ptr1, r_ptr2), v, llop):
+        assert r_ptr1.lowleveltype == r_ptr2.lowleveltype
+        return v
+
 
 class __extend__(pairtype(PtrRepr, IntegerRepr)):
 
@@ -205,7 +210,7 @@
         self.lowleveltype = adtmeth.ll_ptrtype
         self.ll_ptrtype = adtmeth.ll_ptrtype
         self.lowleveltype = rtyper.getrepr(annmodel.lltype_to_annotation(adtmeth.ll_ptrtype)).lowleveltype
- 
+
     def rtype_simple_call(self, hop):
         hop2 = hop.copy()
         func = self.func
@@ -242,7 +247,7 @@
         if numitemoffsets > 0:
             self.lowleveltype = lltype.Ptr(self.parentptrtype._interior_ptr_type_with_index(self.resulttype.TO))
         else:
-            self.lowleveltype = self.parentptrtype            
+            self.lowleveltype = self.parentptrtype
 
     def getinteriorfieldargs(self, hop, v_self):
         vlist = []
@@ -305,7 +310,7 @@
 
 
 class __extend__(pairtype(InteriorPtrRepr, IntegerRepr)):
-    def rtype_getitem((r_ptr, r_item), hop): 
+    def rtype_getitem((r_ptr, r_item), hop):
         ARRAY = r_ptr.resulttype.TO
         ITEM_TYPE = ARRAY.OF
         if isinstance(ITEM_TYPE, lltype.ContainerType):
@@ -325,7 +330,7 @@
             vlist = r_ptr.getinteriorfieldargs(hop, v_self) + [v_index]
             return hop.genop('getinteriorfield', vlist,
                              resulttype=ITEM_TYPE)
-        
+
     def rtype_setitem((r_ptr, r_index), hop):
         ARRAY = r_ptr.resulttype.TO
         ITEM_TYPE = ARRAY.OF
@@ -333,11 +338,11 @@
         v_self, v_index, v_value = hop.inputargs(r_ptr, lltype.Signed, hop.args_r[2])
         vlist = r_ptr.getinteriorfieldargs(hop, v_self) + [v_index, v_value]
         hop.genop('setinteriorfield', vlist)
-            
+
 class __extend__(pairtype(InteriorPtrRepr, LLADTMethRepr)):
 
     def convert_from_to((r_from, r_to), v, llops):
         if r_from.lowleveltype == r_to.lowleveltype:
             return v
         return NotImplemented
-   
+
diff --git a/pypy/rpython/test/test_rbuilder.py b/pypy/rpython/test/test_rbuilder.py
--- a/pypy/rpython/test/test_rbuilder.py
+++ b/pypy/rpython/test/test_rbuilder.py
@@ -1,8 +1,10 @@
 import py
+
+from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
+from pypy.rpython.annlowlevel import llstr, hlstr
+from pypy.rpython.lltypesystem import rffi
+from pypy.rpython.lltypesystem.rbuilder import *
 from pypy.rpython.test.tool import BaseRtypingTest, LLRtypeMixin, OORtypeMixin
-from pypy.rpython.lltypesystem.rbuilder import *
-from pypy.rpython.annlowlevel import llstr, hlstr
-from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
 
 
 class TestStringBuilderDirect(object):
@@ -73,6 +75,15 @@
         res = self.interpret(func, [])
         assert res == 4
 
+    def test_append_charpsize(self):
+        def func(l):
+            s = StringBuilder()
+            with rffi.scoped_str2charp("hello world") as x:
+                s.append_charpsize(x, l)
+            return s.build()
+        res = self.ll_to_string(self.interpret(func, [5]))
+        assert res == "hello"
+
 class TestLLtype(BaseTestStringBuilder, LLRtypeMixin):
     pass
 
@@ -81,3 +92,5 @@
         py.test.skip("getlength(): not implemented on ootype")
     def test_unicode_getlength(self):
         py.test.skip("getlength(): not implemented on ootype")
+    def test_append_charpsize(self):
+        py.test.skip("append_charpsize(): not implemented on ootype")
\ No newline at end of file
diff --git a/pypy/jit/codewriter/regalloc.py b/pypy/tool/algo/regalloc.py
copy from pypy/jit/codewriter/regalloc.py
copy to pypy/tool/algo/regalloc.py
--- a/pypy/jit/codewriter/regalloc.py
+++ b/pypy/tool/algo/regalloc.py
@@ -2,13 +2,11 @@
 from pypy.objspace.flow.model import Variable
 from pypy.tool.algo.color import DependencyGraph
 from pypy.tool.algo.unionfind import UnionFind
-from pypy.jit.metainterp.history import getkind
-from pypy.jit.codewriter.flatten import ListOfKind
 
-def perform_register_allocation(graph, kind):
+def perform_register_allocation(graph, consider_var, ListOfKind=()):
     """Perform register allocation for the Variables of the given 'kind'
     in the 'graph'."""
-    regalloc = RegAllocator(graph, kind)
+    regalloc = RegAllocator(graph, consider_var, ListOfKind)
     regalloc.make_dependencies()
     regalloc.coalesce_variables()
     regalloc.find_node_coloring()
@@ -18,9 +16,10 @@
 class RegAllocator(object):
     DEBUG_REGALLOC = False
 
-    def __init__(self, graph, kind):
+    def __init__(self, graph, consider_var, ListOfKind):
         self.graph = graph
-        self.kind = kind
+        self.consider_var = consider_var
+        self.ListOfKind = ListOfKind
 
     def make_dependencies(self):
         dg = DependencyGraph()
@@ -31,7 +30,7 @@
                 for v in op.args:
                     if isinstance(v, Variable):
                         die_at[v] = i
-                    elif isinstance(v, ListOfKind):
+                    elif isinstance(v, self.ListOfKind):
                         for v1 in v:
                             if isinstance(v1, Variable):
                                 die_at[v1] = i
@@ -51,7 +50,7 @@
             # Done.  XXX the code above this line runs 3 times
             # (for kind in KINDS) to produce the same result...
             livevars = [v for v in block.inputargs
-                          if getkind(v.concretetype) == self.kind]
+                          if self.consider_var(v)]
             # Add the variables of this block to the dependency graph
             for i, v in enumerate(livevars):
                 dg.add_node(v)
@@ -67,10 +66,10 @@
                         pass
                     die_index += 1
                 if (op.result is not None and
-                    getkind(op.result.concretetype) == self.kind):
+                        self.consider_var(op.result)):
                     dg.add_node(op.result)
                     for v in livevars:
-                        if getkind(v.concretetype) == self.kind:
+                        if self.consider_var(v):
                             dg.add_edge(v, op.result)
                     livevars.add(op.result)
         self._depgraph = dg
@@ -95,8 +94,8 @@
                     self._try_coalesce(v, link.target.inputargs[i])
 
     def _try_coalesce(self, v, w):
-        if isinstance(v, Variable) and getkind(v.concretetype) == self.kind:
-            assert getkind(w.concretetype) == self.kind
+        if isinstance(v, Variable) and self.consider_var(v):
+            assert self.consider_var(w)
             dg = self._depgraph
             uf = self._unionfind
             v0 = uf.find_rep(v)
diff --git a/pypy/tool/gdb_pypy.py b/pypy/tool/gdb_pypy.py
new file mode 100644
--- /dev/null
+++ b/pypy/tool/gdb_pypy.py
@@ -0,0 +1,125 @@
+"""
+Some convenience macros for gdb.  If you have pypy in your path, you can simply do:
+
+(gdb) python import pypy.tool.gdb_pypy
+
+Or, alternatively:
+
+(gdb) python execfile('/path/to/gdb_pypy.py')
+"""
+
+from __future__ import with_statement
+
+import sys
+import os.path
+
+try:
+    # when running inside gdb
+    from gdb import Command
+except ImportError:
+    # when running outside gdb: mock class for testing
+    class Command(object):
+        def __init__(self, name, command_class):
+            pass
+
+
+def find_field_with_suffix(val, suffix):
+    """
+    Return ``val[field]``, where ``field`` is the only one whose name ends
+    with ``suffix``.  If there is no such field, or more than one, raise KeyError.
+    """
+    names = []
+    for field in val.type.fields():
+        if field.name.endswith(suffix):
+            names.append(field.name)
+    #
+    if len(names) == 1:
+        return val[names[0]]
+    elif len(names) == 0:
+        raise KeyError, "cannot find field *%s" % suffix
+    else:
+        raise KeyError, "too many matching fields: %s" % ', '.join(names)
+
+def lookup(val, suffix):
+    """
+    Look up a field whose name ends with ``suffix``, following the rpython
+    struct inheritance hierarchy (i.e., looking both at ``val`` and
+    ``val['*_super']``, recursively).
+    """
+    try:
+        return find_field_with_suffix(val, suffix)
+    except KeyError:
+        baseobj = find_field_with_suffix(val, '_super')
+        return lookup(baseobj, suffix)
+
+
+class RPyType(Command):
+    """
+    Prints the RPython type of the expression (remember to dereference it!)
+    It expects to find ``typeids.txt`` in the same directory as the executable.
+    E.g.:
+
+    (gdb) rpy_type *l_v123
+    GcStruct pypy.foo.Bar { super, inst_xxx, inst_yyy }
+    """
+
+    prog2typeids = {}
+ 
+    def __init__(self, gdb=None):
+        # dependency injection, for tests
+        if gdb is None:
+            import gdb
+        self.gdb = gdb
+        Command.__init__(self, "rpy_type", self.gdb.COMMAND_NONE)
+
+    def invoke(self, arg, from_tty):
+        # some magic code to automatically reload the python file while developing
+        ## from pypy.tool import gdb_pypy
+        ## reload(gdb_pypy)
+        ## gdb_pypy.RPyType.prog2typeids = self.prog2typeids # persist the cache
+        ## self.__class__ = gdb_pypy.RPyType
+        print self.do_invoke(arg, from_tty)
+
+    def do_invoke(self, arg, from_tty):
+        obj = self.gdb.parse_and_eval(arg)
+        hdr = lookup(obj, '_gcheader')
+        tid = hdr['h_tid']
+        offset = tid & 0xFFFFFFFF # 64bit only
+        offset = int(offset) # convert from gdb.Value to python int
+        typeids = self.get_typeids()
+        if offset in typeids:
+            return typeids[offset]
+        else:
+            return 'Cannot find the type with offset %d' % offset
+
+    def get_typeids(self):
+        progspace = self.gdb.current_progspace()
+        try:
+            return self.prog2typeids[progspace]
+        except KeyError:
+            typeids = self.load_typeids(progspace)
+            self.prog2typeids[progspace] = typeids
+            return typeids
+
+    def load_typeids(self, progspace):
+        """
+        Returns a mapping offset --> description
+        """
+        exename = progspace.filename
+        root = os.path.dirname(exename)
+        typeids_txt = os.path.join(root, 'typeids.txt')
+        print 'loading', typeids_txt
+        typeids = {}
+        with open(typeids_txt) as f:
+            for line in f:
+                member, descr = map(str.strip, line.split(None, 1))
+                expr = "((char*)(&pypy_g_typeinfo.%s)) - (char*)&pypy_g_typeinfo" % member
+                offset = int(self.gdb.parse_and_eval(expr))
+                typeids[offset] = descr
+        return typeids
+
+try:
+    import gdb
+    RPyType() # side effects
+except ImportError:
+    pass
diff --git a/pypy/tool/jitlogparser/parser.py b/pypy/tool/jitlogparser/parser.py
--- a/pypy/tool/jitlogparser/parser.py
+++ b/pypy/tool/jitlogparser/parser.py
@@ -330,6 +330,8 @@
         if op.is_guard() and bridges.get('loop-' + str(op.guard_no), None):
             res.append(op)
             i = 0
+            if hasattr(op.bridge, 'force_asm'):
+                op.bridge.force_asm()
             ops = op.bridge.operations
         else:
             res.append(op)
diff --git a/pypy/tool/logparser.py b/pypy/tool/logparser.py
--- a/pypy/tool/logparser.py
+++ b/pypy/tool/logparser.py
@@ -4,7 +4,8 @@
     python logparser.py <action> <logfilename> <output> <options...>
 
 Actions:
-    draw-time   draw a timeline image of the log (format PNG by default)
+    draw-time      draw a timeline image of the log (format PNG by default)
+    print-summary  print a summary of the log
 """
 import autopath
 import sys, re
@@ -385,6 +386,23 @@
     else:
         image.save(output)
 
+def print_summary(log, out):
+    totaltimes = gettotaltimes(log)
+    if out == '-':
+        outfile = sys.stdout
+    else:
+        outfile = open(out, "w")
+    l = totaltimes.items()
+    l.sort(cmp=lambda a, b: cmp(b[1], a[1]))
+    total = sum([b for a, b in l])
+    for a, b in l:
+        if a is None:
+            a = 'interpret'
+        s = " " * (50 - len(a))
+        print >>outfile, a, s, str(b*100/total) + "%"
+    if out != '-':
+        outfile.close()
+
 # ____________________________________________________________
 
 
@@ -393,6 +411,7 @@
                                         'mainwidth=', 'mainheight=',
                                         'summarywidth=', 'summarybarheight=',
                                         ]),
+    'print-summary': (print_summary, []),
     }
 
 if __name__ == '__main__':
diff --git a/pypy/tool/release/force-builds.py b/pypy/tool/release/force-builds.py
--- a/pypy/tool/release/force-builds.py
+++ b/pypy/tool/release/force-builds.py
@@ -22,7 +22,7 @@
 #    'own-macosx-x86-32',
 #    'pypy-c-app-level-linux-x86-32',
 #    'pypy-c-app-level-linux-x86-64',
-    'pypy-c-stackless-app-level-linux-x86-32',
+#    'pypy-c-stackless-app-level-linux-x86-32',
     'pypy-c-app-level-win-x86-32',
     'pypy-c-jit-linux-x86-32',
     'pypy-c-jit-linux-x86-64',
diff --git a/pypy/tool/test/test_gdb_pypy.py b/pypy/tool/test/test_gdb_pypy.py
new file mode 100644
--- /dev/null
+++ b/pypy/tool/test/test_gdb_pypy.py
@@ -0,0 +1,105 @@
+import py
+from pypy.tool import gdb_pypy
+
+class FakeGdb(object):
+
+    COMMAND_NONE = -1
+
+    def __init__(self, exprs, progspace=None):
+        self.exprs = exprs
+        self.progspace = progspace
+
+    def parse_and_eval(self, expr):
+        return self.exprs[expr]
+
+    def current_progspace(self):
+        return self.progspace
+
+
+class Mock(object):
+    def __init__(self, **attrs):
+        self.__dict__.update(attrs)
+
+class Field(Mock):
+    pass
+
+class Struct(object):
+    def __init__(self, fieldnames):
+        self._fields = [Field(name=name) for name in fieldnames]
+
+    def fields(self):
+        return self._fields[:]
+
+class Value(dict):
+    def __init__(self, *args, **kwds):
+        dict.__init__(self, *args, **kwds)
+        self.type = Struct(self.keys())
+        for key, val in self.iteritems():
+            if isinstance(val, dict):
+                self[key] = Value(val)
+
+def test_mock_objects():
+    d = {'a': 1,
+         'b': 2,
+         'super': {
+            'c': 3,
+            }
+         }
+    val = Value(d)
+    assert val['a'] == 1
+    assert val['b'] == 2
+    assert isinstance(val['super'], Value)
+    assert val['super']['c'] == 3
+    fields = val.type.fields()
+    names = [f.name for f in fields]
+    assert sorted(names) == ['a', 'b', 'super']
+
+def test_find_field_with_suffix():
+    obj = Value(x_foo = 1,
+                y_bar = 2,
+                z_foobar = 3)
+    assert gdb_pypy.find_field_with_suffix(obj, 'foo') == 1
+    assert gdb_pypy.find_field_with_suffix(obj, 'foobar') == 3
+    py.test.raises(KeyError, "gdb_pypy.find_field_with_suffix(obj, 'bar')")
+    py.test.raises(KeyError, "gdb_pypy.find_field_with_suffix(obj, 'xxx')")
+
+def test_lookup():
+    d = {'r_super': {
+            '_gcheader': {
+                'h_tid': 123,
+                }
+            },
+         'r_foo': 42,
+         }
+    obj = Value(d)
+    assert gdb_pypy.lookup(obj, 'foo') == 42
+    hdr = gdb_pypy.lookup(obj, 'gcheader')
+    assert hdr['h_tid'] == 123
+
+def test_RPyType(tmpdir):
+    exe = tmpdir.join('pypy-c')
+    typeids = tmpdir.join('typeids.txt')
+    typeids.write("""
+member0    GcStruct xxx {}
+member1    GcStruct yyy {}
+member2    GcStruct zzz {}
+""".strip())
+    #
+    progspace = Mock(filename=str(exe))
+    d = {'r_super': {
+            '_gcheader': {
+                'h_tid': 123,
+                }
+            },
+         'r_foo': 42,
+         }
+    myvar = Value(d)
+    exprs = {
+        '*myvar': myvar,
+        '((char*)(&pypy_g_typeinfo.member0)) - (char*)&pypy_g_typeinfo': 0,
+        '((char*)(&pypy_g_typeinfo.member1)) - (char*)&pypy_g_typeinfo': 123,
+        '((char*)(&pypy_g_typeinfo.member2)) - (char*)&pypy_g_typeinfo': 456,
+        }
+    gdb = FakeGdb(exprs, progspace)
+    cmd = gdb_pypy.RPyType(gdb)
+    assert cmd.do_invoke('*myvar', True) == 'GcStruct yyy {}'
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -3,31 +3,38 @@
 #include "src/cjkcodecs/multibytecodec.h"
 
 
-struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec,
-                                         char *inbuf, Py_ssize_t inlen)
+struct pypy_cjk_dec_s *pypy_cjk_dec_new(const MultibyteCodec *codec)  /* returns NULL on failure */
 {
   struct pypy_cjk_dec_s *d = malloc(sizeof(struct pypy_cjk_dec_s));
   if (!d)
     return NULL;
   if (codec->decinit != NULL && codec->decinit(&d->state, codec->config) != 0)
-    goto errorexit;
+    {
+      free(d);  /* decinit reported an error: release the decoder struct */
+      return NULL;
+    }
+  d->codec = codec;
+  d->outbuf_start = NULL;  /* no output buffer yet; pypy_cjk_dec_init() allocates it */
+  return d;
+}
 
-  d->codec = codec;
+Py_ssize_t pypy_cjk_dec_init(struct pypy_cjk_dec_s *d,
+                             char *inbuf, Py_ssize_t inlen)  /* returns 0 on success, -1 on failure */
+{
   d->inbuf_start = inbuf;
   d->inbuf = inbuf;
   d->inbuf_end = inbuf + inlen;
-  d->outbuf_start = (inlen <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) ?
-                     malloc(inlen * sizeof(Py_UNICODE)) :
-                     NULL);
-  if (!d->outbuf_start)
-    goto errorexit;
+  if (d->outbuf_start == NULL)  /* skip the allocation when a buffer already exists */
+    {
+      d->outbuf_start = (inlen <= (PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) ?
+                         malloc(inlen * sizeof(Py_UNICODE)) :
+                         NULL);  /* NULL when the byte count would exceed PY_SSIZE_T_MAX */
+      if (d->outbuf_start == NULL)
+        return -1;  /* size too large or malloc() failed; d itself is not freed here */
+      d->outbuf_end = d->outbuf_start + inlen;
+    }
   d->outbuf = d->outbuf_start;
-  d->outbuf_end = d->outbuf_start + inlen;
-  return d;
-
- errorexit:
-  free(d);
-  return NULL;
+  return 0;
 }
 
 void pypy_cjk_dec_free(struct pypy_cjk_dec_s *d)
@@ -112,34 +119,40 @@
 
 /************************************************************/
 
-struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
-                                         Py_UNICODE *inbuf, Py_ssize_t inlen)
+struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec)  /* returns NULL on failure */
 {
-  Py_ssize_t outlen;
   struct pypy_cjk_enc_s *d = malloc(sizeof(struct pypy_cjk_enc_s));
   if (!d)
     return NULL;
   if (codec->encinit != NULL && codec->encinit(&d->state, codec->config) != 0)
-    goto errorexit;
+    {
+      free(d);  /* encinit reported an error: release the encoder struct */
+      return NULL;
+    }
+  d->codec = codec;
+  d->outbuf_start = NULL;  /* no output buffer yet; pypy_cjk_enc_init() allocates it */
+  return d;
+}
 
-  d->codec = codec;
+Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
+                             Py_UNICODE *inbuf, Py_ssize_t inlen)  /* returns 0 on success, -1 on failure */
+{
+  Py_ssize_t outlen;
   d->inbuf_start = inbuf;
   d->inbuf = inbuf;
   d->inbuf_end = inbuf + inlen;
-
-  if (inlen > (PY_SSIZE_T_MAX - 16) / 2)
-    goto errorexit;
-  outlen = inlen * 2 + 16;
-  d->outbuf_start = malloc(outlen);
-  if (!d->outbuf_start)
-    goto errorexit;
+  if (d->outbuf_start == NULL)  /* skip the allocation when a buffer already exists */
+    {
+      if (inlen > (PY_SSIZE_T_MAX - 16) / 2)  /* inlen * 2 + 16 below would overflow */
+        return -1;
+      outlen = inlen * 2 + 16;  /* buffer size in bytes: twice the input length plus 16 */
+      d->outbuf_start = malloc(outlen);
+      if (d->outbuf_start == NULL)
+        return -1;  /* malloc() failed; d itself is not freed here */
+      d->outbuf_end = d->outbuf_start + outlen;
+    }
   d->outbuf = d->outbuf_start;
-  d->outbuf_end = d->outbuf_start + outlen;
-  return d;
-
- errorexit:
-  free(d);
-  return NULL;
+  return 0;
 }
 
 void pypy_cjk_enc_free(struct pypy_cjk_enc_s *d)
@@ -167,11 +180,8 @@
   return 0;
 }
 
-#define MBENC_RESET     MBENC_MAX<<1
-
-Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *d)
+Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *d, Py_ssize_t flags)
 {
-  int flags = MBENC_FLUSH | MBENC_RESET;   /* XXX always, for now */
   while (1)
     {
       Py_ssize_t r;
@@ -242,3 +252,8 @@
   d->inbuf = d->inbuf_start + in_offset;
   return 0;
 }
+
+const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *d)
+{
+  return d->codec;  /* the codec pointer stored by pypy_cjk_enc_new() */
+}
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -94,8 +94,9 @@
   Py_UNICODE *outbuf_start, *outbuf, *outbuf_end;
 };
 
-struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec,
-                                         char *inbuf, Py_ssize_t inlen);
+struct pypy_cjk_dec_s *pypy_cjk_dec_new(const MultibyteCodec *codec);  /* NULL on failure */
+Py_ssize_t pypy_cjk_dec_init(struct pypy_cjk_dec_s *d,
+                             char *inbuf, Py_ssize_t inlen);  /* 0 on success, -1 on failure */
 void pypy_cjk_dec_free(struct pypy_cjk_dec_s *);
 Py_ssize_t pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *);
 Py_UNICODE *pypy_cjk_dec_outbuf(struct pypy_cjk_dec_s *);
@@ -112,10 +113,11 @@
   unsigned char *outbuf_start, *outbuf, *outbuf_end;
 };
 
-struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
-                                         Py_UNICODE *inbuf, Py_ssize_t inlen);
+struct pypy_cjk_enc_s *pypy_cjk_enc_new(const MultibyteCodec *codec);  /* NULL on failure */
+Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
+                             Py_UNICODE *inbuf, Py_ssize_t inlen);  /* 0 on success, -1 on failure */
 void pypy_cjk_enc_free(struct pypy_cjk_enc_s *);
-Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *);
+Py_ssize_t pypy_cjk_enc_chunk(struct pypy_cjk_enc_s *, Py_ssize_t);  /* second argument: flags */
 Py_ssize_t pypy_cjk_enc_reset(struct pypy_cjk_enc_s *);
 char *pypy_cjk_enc_outbuf(struct pypy_cjk_enc_s *);
 Py_ssize_t pypy_cjk_enc_outlen(struct pypy_cjk_enc_s *);
@@ -123,6 +125,7 @@
 Py_ssize_t pypy_cjk_enc_inbuf_consumed(struct pypy_cjk_enc_s* d);
 Py_ssize_t pypy_cjk_enc_replace_on_error(struct pypy_cjk_enc_s* d,
                                          char *, Py_ssize_t, Py_ssize_t);
+const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *);  /* returns the encoder's codec pointer */
 
 /* list of codecs defined in the .c files */
 
diff --git a/pypy/translator/c/test/test_newgc.py b/pypy/translator/c/test/test_newgc.py
--- a/pypy/translator/c/test/test_newgc.py
+++ b/pypy/translator/c/test/test_newgc.py
@@ -1390,6 +1390,35 @@
     def test_gc_heap_stats(self):
         py.test.skip("not implemented")
 
+    def define_nongc_attached_to_gc(cls):  # returns f(); presumably what self.run("nongc_attached_to_gc") executes -- confirm
+        from pypy.rpython.lltypesystem import rffi
+        ARRAY = rffi.CArray(rffi.INT)
+        class A:
+            def __init__(self, n):
+                self.buf = lltype.malloc(ARRAY, n, flavor='raw',  # raw array of n C ints
+                                         add_memory_pressure=True)  # presumably reports the raw size to the GC -- confirm
+            def __del__(self):
+                lltype.free(self.buf, flavor='raw')  # the raw buffer is released only from the finalizer
+        A(6)  # NOTE(review): instance is created and discarded at definition time -- purpose not visible here
+        def f():
+            # allocate a total of ~77GB, but if the automatic gc'ing works,
+            # it should never need more than a few MBs at once
+            am1 = am2 = am3 = None
+            res = 0
+            for i in range(1, 100001):
+                if am3 is not None:
+                    res += rffi.cast(lltype.Signed, am3.buf[0])  # am3 is the A created three iterations earlier
+                am3 = am2
+                am2 = am1
+                am1 = A(i * 4)  # buffer of i*4 ints; at most the three newest A objects stay referenced
+                am1.buf[0] = rffi.cast(rffi.INT, i-50000)
+            return res  # sum of (j - 50000) for j = 1..99997, i.e. -99997
+        return f
+
+    def test_nongc_attached_to_gc(self):  # runs the function built by define_nongc_attached_to_gc
+        res = self.run("nongc_attached_to_gc")
+        assert res == -99997  # sum of (j - 50000) for j = 1..99997, the values f() reads back from buf[0]
+
 # ____________________________________________________________________
 
 class TaggedPointersTest(object):
diff --git a/pypy/translator/jvm/src/pypy/PyPy.java b/pypy/translator/jvm/src/pypy/PyPy.java
--- a/pypy/translator/jvm/src/pypy/PyPy.java
+++ b/pypy/translator/jvm/src/pypy/PyPy.java
@@ -1100,9 +1100,9 @@
         if (Double.isNaN(x))
             return interlink.recordFloatSigned(x, 0);
 
-        // Infinity: Python throws exception
+        // Infinity: Python returns (inf, 0)
         if (Double.isInfinite(x))
-            interlink.throwOverflowError();
+            return interlink.recordFloatSigned(x, 0);
 
         // Extract the various parts of the format:
         final long e=11, f=52; // number of bits in IEEE format