[pypy-commit] pypy numpy-fixes: merge default into branch

Sat May 16 20:42:59 CEST 2015

Author: mattip <matti.picus at gmail.com>
Branch: numpy-fixes
Changeset: r77347:d42df199eb45
Date: 2015-05-16 19:58 +0300
http://bitbucket.org/pypy/pypy/changeset/d42df199eb45/

Log:	merge default into branch

diff --git a/lib_pypy/datetime.py b/lib_pypy/datetime.py
--- a/lib_pypy/datetime.py
+++ b/lib_pypy/datetime.py
@@ -1507,8 +1507,13 @@
 
         converter = _time.localtime if tz is None else _time.gmtime
 
-        t, frac = divmod(t, 1.0)
-        us = _round(frac * 1e6)
+        if isinstance(t, int):
+            us = 0
+        else:
+            t_full = t
+            t = int(_math.floor(t))
+            frac = t_full - t
+            us = _round(frac * 1e6)
 
         # If timestamp is less than one microsecond smaller than a
         # full second, us can be rounded up to 1000000.  In this case,
@@ -1527,8 +1532,13 @@
     @classmethod
     def utcfromtimestamp(cls, t):
         "Construct a UTC datetime from a POSIX timestamp (like time.time())."
-        t, frac = divmod(t, 1.0)
-        us = _round(frac * 1e6)
+        if isinstance(t, int):
+            us = 0
+        else:
+            t_full = t
+            t = int(_math.floor(t))
+            frac = t_full - t
+            us = _round(frac * 1e6)
 
         # If timestamp is less than one microsecond smaller than a
         # full second, us can be rounded up to 1000000.  In this case,
diff --git a/pypy/doc/cpython_differences.rst b/pypy/doc/cpython_differences.rst
--- a/pypy/doc/cpython_differences.rst
+++ b/pypy/doc/cpython_differences.rst
@@ -320,6 +320,13 @@
   http://bugs.python.org/issue14621, some of us believe it has no
   purpose in CPython either.
 
+* You can't store non-string keys in type objects.  For example::
+
+    class A(object):
+        locals()[42] = 3
+
+  won't work.
+
 * ``sys.setrecursionlimit(n)`` sets the limit only approximately,
   by setting the usable stack space to ``n * 768`` bytes.  On Linux,
   depending on the compiler settings, the default of 768KB is enough
@@ -361,8 +368,13 @@
   opposed to a dict proxy like in CPython. Mutating the dict will change the
   type and vice versa. For builtin types, a dictionary will be returned that
   cannot be changed (but still looks and behaves like a normal dictionary).
+  
+* some functions and attributes of the ``gc`` module behave in a
+  slightly different way: for example, ``gc.enable`` and
+  ``gc.disable`` are supported, but instead of enabling and disabling
+  the GC, they just enable and disable the execution of finalizers.
 
 * PyPy prints a random line from past #pypy IRC topics at startup in
-  interactive mode. In a released version, this behaviour is supressed, but
+  interactive mode. In a released version, this behaviour is suppressed, but
   setting the environment variable PYPY_IRC_TOPIC will bring it back. Note that
   downstream package providers have been known to totally disable this feature.
diff --git a/pypy/doc/embedding.rst b/pypy/doc/embedding.rst
--- a/pypy/doc/embedding.rst
+++ b/pypy/doc/embedding.rst
@@ -51,6 +51,9 @@
    otherwise return 0.  You should really do your own error handling in the
    source. It'll acquire the GIL.
 
+   Note: this is meant to be called *only once* or a few times at most.  See
+   the `more complete example`_ below.
+
 .. function:: int pypy_execute_source_ptr(char* source, void* ptr);
 
    .. note:: Not available in PyPy <= 2.2.1
@@ -65,8 +68,9 @@
    Note that this function is not thread-safe itself, so you need to guard it
    with a mutex.
 
-Simple example
---------------
+
+Minimal example
+---------------
 
 Note that this API is a lot more minimal than say CPython C API, so at first
 it's obvious to think that you can't do much. However, the trick is to do
@@ -78,10 +82,10 @@
 
 .. code-block:: c
 
-    #include "include/PyPy.h"
+    #include "PyPy.h"
     #include <stdio.h>
 
-    const char source[] = "print 'hello from pypy'";
+    static char source[] = "print 'hello from pypy'";
 
     int main(void)
     {
@@ -103,154 +107,115 @@
 
 If we save it as ``x.c`` now, compile it and run it (on linux) with::
 
-    fijal@hermann:/opt/pypy$ gcc -o x x.c -lpypy-c -L.
-    fijal@hermann:/opt/pypy$ LD_LIBRARY_PATH=. ./x
+    $ gcc -g -o x x.c -lpypy-c -L/opt/pypy/bin -I/opt/pypy/include
+    $ LD_LIBRARY_PATH=/opt/pypy/bin ./x
     hello from pypy
 
-on OSX it is necessary to set the rpath of the binary if one wants to link to it::
+.. note:: If the compilation fails because of missing PyPy.h header file,
+          you are running PyPy <= 2.2.1.  Get it here__.
+
+.. __: https://bitbucket.org/pypy/pypy/raw/c4cd6eca9358066571500ac82aaacfdaa3889e8c/include/PyPy.h
+
+On OSX it is necessary to set the rpath of the binary if one wants to link to it,
+with a command like::
 
     gcc -o x x.c -lpypy-c -L. -Wl,-rpath -Wl,@executable_path
     ./x
     hello from pypy
 
-Worked!
 
-.. note:: If the compilation fails because of missing PyPy.h header file,
-          you are running PyPy <= 2.2.1, please see the section `Missing PyPy.h`_.
-
-Missing PyPy.h
---------------
-
-.. note:: PyPy.h is in the nightly builds and goes to new PyPy releases (>2.2.1).
-
-For PyPy <= 2.2.1, you can download PyPy.h from PyPy repository (it has been added in commit c4cd6ec):
-
-.. code-block:: bash
-
-    cd /opt/pypy/include
-    wget https://bitbucket.org/pypy/pypy/raw/c4cd6eca9358066571500ac82aaacfdaa3889e8c/include/PyPy.h
-
-
-More advanced example
+More complete example
 ---------------------
 
 .. note:: This example depends on pypy_execute_source_ptr which is not available
-          in PyPy <= 2.2.1. You might want to see the alternative example
-          below.
+          in PyPy <= 2.2.1.
 
 Typically we need something more to do than simply execute source. The following
 is a fully fledged example, please consult cffi documentation for details.
 It's a bit longish, but it captures a gist what can be done with the PyPy
 embedding interface:
 
+.. code-block:: python
+
+    # file "interface.py"
+    
+    import cffi
+
+    ffi = cffi.FFI()
+    ffi.cdef('''
+    struct API {
+        double (*add_numbers)(double x, double y);
+    };
+    ''')
+
+    # Better define callbacks at module scope, it's important to
+    # keep this object alive.
+    @ffi.callback("double (double, double)")
+    def add_numbers(x, y):
+        return x + y
+
+    def fill_api(ptr):
+        global api
+        api = ffi.cast("struct API*", ptr)
+        api.add_numbers = add_numbers
+
 .. code-block:: c
 
-    #include "include/PyPy.h"
+    /* C example */
+    #include "PyPy.h"
     #include <stdio.h>
 
-    char source[] = "from cffi import FFI\n\
-    ffi = FFI()\n\
-    @ffi.callback('int(int)')\n\
-    def func(a):\n\
-        print 'Got from C %d' % a\n\
-        return a * 2\n\
-    ffi.cdef('int callback(int (*func)(int));')\n\
-    c_func = ffi.cast('int(*)(int(*)(int))', c_argument)\n\
-    c_func(func)\n\
-    print 'finished the Python part'\n\
-    ";
+    struct API {
+        double (*add_numbers)(double x, double y);
+    };
 
-    int callback(int (*func)(int))
+    struct API api;   /* global var */
+
+    int initialize_api(void)
     {
-        printf("Calling to Python, result: %d\n", func(3));
-    }
-
-    int main()
-    {
+        static char source[] =
+            "import sys; sys.path.insert(0, '.'); "
+            "import interface; interface.fill_api(c_argument)";
         int res;
-        void *lib, *func;
 
         rpython_startup_code();
         res = pypy_setup_home("/opt/pypy/bin/libpypy-c.so", 1);
         if (res) {
-            printf("Error setting pypy home!\n");
+            fprintf(stderr, "Error setting pypy home!\n");
+            return -1;
+        }
+        res = pypy_execute_source_ptr(source, &api);
+        if (res) {
+            fprintf(stderr, "Error calling pypy_execute_source_ptr!\n");
+            return -1;
+        }
+        return 0;
+    }
+
+    int main(void)
+    {
+        if (initialize_api() < 0)
             return 1;
-        }
-        res = pypy_execute_source_ptr(source, (void*)callback);
-        if (res) {
-            printf("Error calling pypy_execute_source_ptr!\n");
-        }
-        return res;
+
+        printf("sum: %f\n", api.add_numbers(12.3, 45.6));
+
+        return 0;
     }
 
 you can compile and run it with::
 
-   fijal@hermann:/opt/pypy$ gcc -g -o x x.c -lpypy-c -L.
-   fijal@hermann:/opt/pypy$ LD_LIBRARY_PATH=. ./x
-   Got from C 3
-   Calling to Python, result: 6
-   finished the Python part
+    $ gcc -g -o x x.c -lpypy-c -L/opt/pypy/bin -I/opt/pypy/include
+    $ LD_LIBRARY_PATH=/opt/pypy/bin ./x
+    sum: 57.900000
 
-As you can see, we successfully managed to call Python from C and C from
-Python. Now having one callback might not be enough, so what typically happens
-is that we would pass a struct full of callbacks to ``pypy_execute_source_ptr``
-and fill the structure from Python side for the future use.
+As you can see, what we did is create a ``struct API`` that contains
+the custom API that we need in our particular case.  This struct is
+filled by Python to contain a function pointer that is then called
+from the C side.  It is also possible to have other function
+pointers that are filled by the C side and called by the Python side,
+or even non-function-pointer fields: basically, the two sides
+communicate via this single C structure that defines your API.
 
-Alternative example
--------------------
-
-As ``pypy_execute_source_ptr`` is not available in PyPy 2.2.1, you might want to try 
-an alternative approach which relies on -export-dynamic flag to the GNU linker. 
-The downside to this approach is that it is platform dependent.
-
-.. code-block:: c
-
-    #include "include/PyPy.h"
-    #include <stdio.h>
-
-    char source[] = "from cffi import FFI\n\
-    ffi = FFI()\n\
-    @ffi.callback('int(int)')\n\
-    def func(a):\n\
-        print 'Got from C %d' % a\n\
-        return a * 2\n\
-    ffi.cdef('int callback(int (*func)(int));')\n\
-    lib = ffi.verify('int callback(int (*func)(int));')\n\
-    lib.callback(func)\n\
-    print 'finished the Python part'\n\
-    ";
-
-    int callback(int (*func)(int))
-    {
-        printf("Calling to Python, result: %d\n", func(3));
-    }
-
-    int main()
-    {
-        int res;
-        void *lib, *func;
-
-        rpython_startup_code();
-        res = pypy_setup_home("/opt/pypy/bin/libpypy-c.so", 1);
-        if (res) {
-            printf("Error setting pypy home!\n");
-            return 1;
-        }
-        res = pypy_execute_source(source);
-        if (res) {
-            printf("Error calling pypy_execute_source!\n");
-        }
-        return res;
-    }
-
-
-Make sure to pass -export-dynamic flag when compiling::
-
-   $ gcc -g -o x x.c -lpypy-c -L. -export-dynamic
-   $ LD_LIBRARY_PATH=. ./x
-   Got from C 3
-   Calling to Python, result: 6
-   finished the Python part
 
 Finding pypy_home
 -----------------
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -59,6 +59,7 @@
 exactly like `f(a, b)`.
 
 .. branch: issue2018
+
 branch issue2018:
 Allow prebuilt rpython dict with function values
 
@@ -66,22 +67,41 @@
 .. Merged but then backed out, hopefully it will return as vmprof2
 
 .. branch: object-dtype2
+
+branch object-dtype2:
 Extend numpy dtypes to allow using objects with associated garbage collection hook
 
 .. branch: vmprof2
+
+branch vmprof2:
 Add backend support for vmprof - a lightweight statistical profiler -
 to linux64, see client at https://vmprof.readthedocs.org
 
 .. branch: jit_hint_docs
+
+branch jit_hint_docs:
 Add more detail to @jit.elidable and @jit.promote in rpython/rlib/jit.py
 
 .. branch: remove-frame-debug-attrs
+
+branch remove-frame-debug-attrs:
 Remove the debug attributes from frames only used for tracing and replace
 them with a debug object that is created on-demand
 
 .. branch: can_cast
+
+branch can_cast:
 Implement np.can_cast, np.min_scalar_type and missing dtype comparison operations.
 
-.. branch numpy-fixes
+.. branch: numpy-fixes
+
+branch numpy-fixes:
 Fix some error related to object dtype, non-contiguous arrays, inplement parts of 
 __array_interface__, __array_priority__, __array_wrap__
+
+.. branch: cells-local-stack
+
+branch cells-local-stack:
+Unify the PyFrame.cells and PyFrame.locals_stack_w lists, making frame objects
+1 or 3 words smaller.
+
diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py
--- a/pypy/interpreter/function.py
+++ b/pypy/interpreter/function.py
@@ -105,7 +105,7 @@
                                                    self)
                 for i in funccallunrolling:
                     if i < nargs:
-                        new_frame.locals_stack_w[i] = args_w[i]
+                        new_frame.locals_cells_stack_w[i] = args_w[i]
                 return new_frame.run()
         elif nargs >= 1 and fast_natural_arity == Code.PASSTHROUGHARGS1:
             assert isinstance(code, gateway.BuiltinCodePassThroughArguments1)
@@ -171,7 +171,7 @@
                                                    self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
-            new_frame.locals_stack_w[i] = w_arg
+            new_frame.locals_cells_stack_w[i] = w_arg
 
         return new_frame.run()
 
@@ -182,13 +182,13 @@
                                                    self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
-            new_frame.locals_stack_w[i] = w_arg
+            new_frame.locals_cells_stack_w[i] = w_arg
 
         ndefs = len(self.defs_w)
         start = ndefs - defs_to_load
         i = nargs
         for j in xrange(start, ndefs):
-            new_frame.locals_stack_w[i] = self.defs_w[j]
+            new_frame.locals_cells_stack_w[i] = self.defs_w[j]
             i += 1
         return new_frame.run()
 
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -209,7 +209,7 @@
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
                                       fresh_virtualizable=True)
-        args.parse_into_scope(None, fresh_frame.locals_stack_w, func.name,
+        args.parse_into_scope(None, fresh_frame.locals_cells_stack_w, func.name,
                               sig, func.defs_w)
         fresh_frame.init_cells()
         return frame.run()
@@ -221,7 +221,7 @@
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
                                       fresh_virtualizable=True)
-        args.parse_into_scope(w_obj, fresh_frame.locals_stack_w, func.name,
+        args.parse_into_scope(w_obj, fresh_frame.locals_cells_stack_w, func.name,
                               sig, func.defs_w)
         fresh_frame.init_cells()
         return frame.run()
diff --git a/pypy/interpreter/pyframe.py b/pypy/interpreter/pyframe.py
--- a/pypy/interpreter/pyframe.py
+++ b/pypy/interpreter/pyframe.py
@@ -69,10 +69,9 @@
 
     w_globals = None
     pycode = None # code object executed by that frame
-    locals_stack_w = None # the list of all locals and valuestack
+    locals_cells_stack_w = None # the list of all locals, cells and the valuestack
     valuestackdepth = 0 # number of items on valuestack
     lastblock = None
-    cells = None # cells
 
     # other fields:
     
@@ -93,9 +92,14 @@
         self.space = space
         self.w_globals = w_globals
         self.pycode = code
-        self.locals_stack_w = [None] * (code.co_nlocals + code.co_stacksize)
-        self.valuestackdepth = code.co_nlocals
-        make_sure_not_resized(self.locals_stack_w)
+        ncellvars = len(code.co_cellvars)
+        nfreevars = len(code.co_freevars)
+        size = code.co_nlocals + ncellvars + nfreevars + code.co_stacksize
+        # the layout of this list is as follows:
+        # | local vars | cells | stack |
+        self.locals_cells_stack_w = [None] * size
+        self.valuestackdepth = code.co_nlocals + ncellvars + nfreevars
+        make_sure_not_resized(self.locals_cells_stack_w)
         check_nonneg(self.valuestackdepth)
         #
         if space.config.objspace.honor__builtins__:
@@ -136,6 +140,11 @@
             self.__class__.__module__, self.__class__.__name__,
             self.pycode, self.get_last_lineno())
 
+    def _getcell(self, varindex):
+        cell = self.locals_cells_stack_w[varindex + self.pycode.co_nlocals]
+        assert isinstance(cell, Cell)
+        return cell
+
     def mark_as_escaped(self):
         """
         Must be called on frames that are exposed to applevel, e.g. by
@@ -181,8 +190,6 @@
         else:
             return self.space.builtin
 
-    _NO_CELLS = []
-
     @jit.unroll_safe
     def initialize_frame_scopes(self, outer_func, code):
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
@@ -201,8 +208,7 @@
         nfreevars = len(code.co_freevars)
         if not nfreevars:
             if not ncellvars:
-                self.cells = self._NO_CELLS
-                return            # no self.cells needed - fast path
+                return            # no cells needed - fast path
         elif outer_func is None:
             space = self.space
             raise OperationError(space.w_TypeError,
@@ -215,11 +221,13 @@
         if closure_size != nfreevars:
             raise ValueError("code object received a closure with "
                                  "an unexpected number of free variables")
-        self.cells = [None] * (ncellvars + nfreevars)
+        index = code.co_nlocals
         for i in range(ncellvars):
-            self.cells[i] = Cell()
+            self.locals_cells_stack_w[index] = Cell()
+            index += 1
         for i in range(nfreevars):
-            self.cells[i + ncellvars] = outer_func.closure[i]
+            self.locals_cells_stack_w[index] = outer_func.closure[i]
+            index += 1
 
     def run(self):
         """Start this frame's execution."""
@@ -283,14 +291,24 @@
     # stack manipulation helpers
     def pushvalue(self, w_object):
         depth = self.valuestackdepth
-        self.locals_stack_w[depth] = w_object
+        self.locals_cells_stack_w[depth] = w_object
         self.valuestackdepth = depth + 1
 
+    def _check_stack_index(self, index):
+        # will be completely removed by the optimizer if only used in an assert
+        # and if asserts are disabled
+        code = self.pycode
+        ncellvars = len(code.co_cellvars)
+        nfreevars = len(code.co_freevars)
+        stackstart = code.co_nlocals + ncellvars + nfreevars
+        return index >= stackstart
+
     def popvalue(self):
         depth = self.valuestackdepth - 1
-        assert depth >= self.pycode.co_nlocals, "pop from empty value stack"
-        w_object = self.locals_stack_w[depth]
-        self.locals_stack_w[depth] = None
+        assert self._check_stack_index(depth)
+        assert depth >= 0
+        w_object = self.locals_cells_stack_w[depth]
+        self.locals_cells_stack_w[depth] = None
         self.valuestackdepth = depth
         return w_object
 
@@ -316,25 +334,26 @@
     def peekvalues(self, n):
         values_w = [None] * n
         base = self.valuestackdepth - n
-        assert base >= self.pycode.co_nlocals
+        assert self._check_stack_index(base)
+        assert base >= 0
         while True:
             n -= 1
             if n < 0:
                 break
-            values_w[n] = self.locals_stack_w[base+n]
+            values_w[n] = self.locals_cells_stack_w[base+n]
         return values_w
 
     @jit.unroll_safe
     def dropvalues(self, n):
         n = hint(n, promote=True)
         finaldepth = self.valuestackdepth - n
-        assert finaldepth >= self.pycode.co_nlocals, (
-            "stack underflow in dropvalues()")
+        assert self._check_stack_index(finaldepth)
+        assert finaldepth >= 0
         while True:
             n -= 1
             if n < 0:
                 break
-            self.locals_stack_w[finaldepth+n] = None
+            self.locals_cells_stack_w[finaldepth+n] = None
         self.valuestackdepth = finaldepth
 
     @jit.unroll_safe
@@ -361,34 +380,27 @@
         # Contrast this with CPython where it's PEEK(-1).
         index_from_top = hint(index_from_top, promote=True)
         index = self.valuestackdepth + ~index_from_top
-        assert index >= self.pycode.co_nlocals, (
-            "peek past the bottom of the stack")
-        return self.locals_stack_w[index]
+        assert self._check_stack_index(index)
+        assert index >= 0
+        return self.locals_cells_stack_w[index]
 
     def settopvalue(self, w_object, index_from_top=0):
         index_from_top = hint(index_from_top, promote=True)
         index = self.valuestackdepth + ~index_from_top
-        assert index >= self.pycode.co_nlocals, (
-            "settop past the bottom of the stack")
-        self.locals_stack_w[index] = w_object
+        assert self._check_stack_index(index)
+        assert index >= 0
+        self.locals_cells_stack_w[index] = w_object
 
     @jit.unroll_safe
     def dropvaluesuntil(self, finaldepth):
         depth = self.valuestackdepth - 1
         finaldepth = hint(finaldepth, promote=True)
+        assert finaldepth >= 0
         while depth >= finaldepth:
-            self.locals_stack_w[depth] = None
+            self.locals_cells_stack_w[depth] = None
             depth -= 1
         self.valuestackdepth = finaldepth
 
-    def save_locals_stack(self):
-        return self.locals_stack_w[:self.valuestackdepth]
-
-    def restore_locals_stack(self, items_w):
-        self.locals_stack_w[:len(items_w)] = items_w
-        self.init_cells()
-        self.dropvaluesuntil(len(items_w))
-
     def make_arguments(self, nargs):
         return Arguments(self.space, self.peekvalues(nargs))
 
@@ -411,24 +423,16 @@
         w = space.wrap
         nt = space.newtuple
 
-        cells = self.cells
-        if cells is None:
-            w_cells = space.w_None
-        else:
-            w_cells = space.newlist([space.wrap(cell) for cell in cells])
-
         if self.get_w_f_trace() is None:
             f_lineno = self.get_last_lineno()
         else:
             f_lineno = self.getorcreatedebug().f_lineno
 
         nlocals = self.pycode.co_nlocals
-        values_w = self.locals_stack_w[nlocals:self.valuestackdepth]
-        w_valuestack = maker.slp_into_tuple_with_nulls(space, values_w)
+        values_w = self.locals_cells_stack_w
+        w_locals_cells_stack = maker.slp_into_tuple_with_nulls(space, values_w)
 
         w_blockstack = nt([block._get_state_(space) for block in self.get_blocklist()])
-        w_fastlocals = maker.slp_into_tuple_with_nulls(
-            space, self.locals_stack_w[:nlocals])
         if self.last_exception is None:
             w_exc_value = space.w_None
             w_tb = space.w_None
@@ -441,7 +445,7 @@
             w(self.f_backref()),
             w(self.get_builtin()),
             w(self.pycode),
-            w_valuestack,
+            w_locals_cells_stack,
             w_blockstack,
             w_exc_value, # last_exception
             w_tb,        #
@@ -449,7 +453,6 @@
             w(self.last_instr),
             w(self.frame_finished_execution),
             w(f_lineno),
-            w_fastlocals,
             space.w_None,           #XXX placeholder for f_locals
 
             #f_restricted requires no additional data!
@@ -458,7 +461,7 @@
             w(d.instr_lb),
             w(d.instr_ub),
             w(d.instr_prev_plus_one),
-            w_cells,
+            w(self.valuestackdepth),
             ]
         return nt(tup_state)
 
@@ -467,24 +470,20 @@
         from pypy.module._pickle_support import maker # helper fns
         from pypy.interpreter.pycode import PyCode
         from pypy.interpreter.module import Module
-        args_w = space.unpackiterable(w_args, 18)
-        w_f_back, w_builtin, w_pycode, w_valuestack, w_blockstack, w_exc_value, w_tb,\
-            w_globals, w_last_instr, w_finished, w_f_lineno, w_fastlocals, w_f_locals, \
-            w_f_trace, w_instr_lb, w_instr_ub, w_instr_prev_plus_one, w_cells = args_w
+        args_w = space.unpackiterable(w_args, 17)
+        w_f_back, w_builtin, w_pycode, w_locals_cells_stack, w_blockstack, w_exc_value, w_tb,\
+            w_globals, w_last_instr, w_finished, w_f_lineno, w_f_locals, \
+            w_f_trace, w_instr_lb, w_instr_ub, w_instr_prev_plus_one, w_stackdepth = args_w
 
         new_frame = self
         pycode = space.interp_w(PyCode, w_pycode)
 
-        if space.is_w(w_cells, space.w_None):
-            closure = None
-            cellvars = []
-        else:
-            from pypy.interpreter.nestedscope import Cell
-            cells_w = space.unpackiterable(w_cells)
-            cells = [space.interp_w(Cell, w_cell) for w_cell in cells_w]
-            ncellvars = len(pycode.co_cellvars)
-            cellvars = cells[:ncellvars]
-            closure = cells[ncellvars:]
+        values_w = maker.slp_from_tuple_with_nulls(space, w_locals_cells_stack)
+        nfreevars = len(pycode.co_freevars)
+        closure = None
+        if nfreevars:
+            base = pycode.co_nlocals + len(pycode.co_cellvars)
+            closure = values_w[base: base + nfreevars]
 
         # do not use the instance's __init__ but the base's, because we set
         # everything like cells from here
@@ -502,9 +501,12 @@
             assert space.interp_w(Module, w_builtin) is space.builtin
         new_frame.set_blocklist([unpickle_block(space, w_blk)
                                  for w_blk in space.unpackiterable(w_blockstack)])
-        values_w = maker.slp_from_tuple_with_nulls(space, w_valuestack)
-        for w_value in values_w:
-            new_frame.pushvalue(w_value)
+        self.locals_cells_stack_w = values_w[:]
+        valuestackdepth = space.int_w(w_stackdepth)
+        if not self._check_stack_index(valuestackdepth):
+            raise OperationError(space.w_ValueError, space.wrap("invalid stackdepth"))
+        assert valuestackdepth >= 0
+        self.valuestackdepth = valuestackdepth
         if space.is_w(w_exc_value, space.w_None):
             new_frame.last_exception = None
         else:
@@ -517,8 +519,6 @@
         new_frame.frame_finished_execution = space.is_true(w_finished)
         d = new_frame.getorcreatedebug()
         d.f_lineno = space.int_w(w_f_lineno)
-        fastlocals_w = maker.slp_from_tuple_with_nulls(space, w_fastlocals)
-        new_frame.locals_stack_w[:len(fastlocals_w)] = fastlocals_w
 
         if space.is_w(w_f_trace, space.w_None):
             d.w_f_trace = None
@@ -529,8 +529,6 @@
         d.instr_ub = space.int_w(w_instr_ub)
         d.instr_prev_plus_one = space.int_w(w_instr_prev_plus_one)
 
-        self._setcellvars(cellvars)
-
     def hide(self):
         return self.pycode.hidden_applevel
 
@@ -544,10 +542,10 @@
         scope_len = len(scope_w)
         if scope_len > self.pycode.co_nlocals:
             raise ValueError, "new fastscope is longer than the allocated area"
-        # don't assign directly to 'locals_stack_w[:scope_len]' to be
+        # don't assign directly to 'locals_cells_stack_w[:scope_len]' to be
         # virtualizable-friendly
         for i in range(scope_len):
-            self.locals_stack_w[i] = scope_w[i]
+            self.locals_cells_stack_w[i] = scope_w[i]
         self.init_cells()
 
     def getdictscope(self):
@@ -573,7 +571,7 @@
         varnames = self.getcode().getvarnames()
         for i in range(min(len(varnames), self.getcode().co_nlocals)):
             name = varnames[i]
-            w_value = self.locals_stack_w[i]
+            w_value = self.locals_cells_stack_w[i]
             if w_value is not None:
                 self.space.setitem_str(d.w_locals, name, w_value)
             else:
@@ -592,7 +590,7 @@
             freevarnames = freevarnames + self.pycode.co_freevars
         for i in range(len(freevarnames)):
             name = freevarnames[i]
-            cell = self.cells[i]
+            cell = self._getcell(i)
             try:
                 w_value = cell.get()
             except ValueError:
@@ -631,7 +629,7 @@
             # into the locals dict used by the class.
         for i in range(len(freevarnames)):
             name = freevarnames[i]
-            cell = self.cells[i]
+            cell = self._getcell(i)
             w_value = self.space.finditem_str(w_locals, name)
             if w_value is not None:
                 cell.set(w_value)
@@ -639,24 +637,21 @@
     @jit.unroll_safe
     def init_cells(self):
         """
-        Initialize cellvars from self.locals_stack_w.
+        Initialize cellvars from self.locals_cells_stack_w.
         """
         args_to_copy = self.pycode._args_as_cellvars
+        index = self.pycode.co_nlocals
         for i in range(len(args_to_copy)):
             argnum = args_to_copy[i]
             if argnum >= 0:
-                self.cells[i].set(self.locals_stack_w[argnum])
+                cell = self.locals_cells_stack_w[index]
+                assert isinstance(cell, Cell)
+                cell.set(self.locals_cells_stack_w[argnum])
+            index += 1
 
     def getclosure(self):
         return None
 
-    def _setcellvars(self, cellvars):
-        ncellvars = len(self.pycode.co_cellvars)
-        if len(cellvars) != ncellvars:
-            raise OperationError(self.space.w_TypeError,
-                                 self.space.wrap("bad cellvars"))
-        self.cells[:ncellvars] = cellvars
-
     def fget_code(self, space):
         return space.wrap(self.getcode())
 
diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py
--- a/pypy/interpreter/pyopcode.py
+++ b/pypy/interpreter/pyopcode.py
@@ -485,7 +485,7 @@
 
     def LOAD_FAST(self, varindex, next_instr):
         # access a local variable directly
-        w_value = self.locals_stack_w[varindex]
+        w_value = self.locals_cells_stack_w[varindex]
         if w_value is None:
             self._load_fast_failed(varindex)
         self.pushvalue(w_value)
@@ -505,7 +505,7 @@
     def STORE_FAST(self, varindex, next_instr):
         w_newvalue = self.popvalue()
         assert w_newvalue is not None
-        self.locals_stack_w[varindex] = w_newvalue
+        self.locals_cells_stack_w[varindex] = w_newvalue
 
     def getfreevarname(self, index):
         freevarnames = self.pycode.co_cellvars + self.pycode.co_freevars
@@ -517,7 +517,7 @@
 
     def LOAD_DEREF(self, varindex, next_instr):
         # nested scopes: access a variable through its cell object
-        cell = self.cells[varindex]
+        cell = self._getcell(varindex)
         try:
             w_value = cell.get()
         except ValueError:
@@ -536,12 +536,12 @@
     def STORE_DEREF(self, varindex, next_instr):
         # nested scopes: access a variable through its cell object
         w_newvalue = self.popvalue()
-        cell = self.cells[varindex]
+        cell = self._getcell(varindex)
         cell.set(w_newvalue)
 
     def LOAD_CLOSURE(self, varindex, next_instr):
         # nested scopes: access the cell object
-        cell = self.cells[varindex]
+        cell = self._getcell(varindex)
         w_value = self.space.wrap(cell)
         self.pushvalue(w_value)
 
@@ -911,12 +911,12 @@
     LOAD_GLOBAL._always_inline_ = True
 
     def DELETE_FAST(self, varindex, next_instr):
-        if self.locals_stack_w[varindex] is None:
+        if self.locals_cells_stack_w[varindex] is None:
             varname = self.getlocalvarname(varindex)
             raise oefmt(self.space.w_UnboundLocalError,
                         "local variable '%s' referenced before assignment",
                         varname)
-        self.locals_stack_w[varindex] = None
+        self.locals_cells_stack_w[varindex] = None
 
     def BUILD_TUPLE(self, itemcount, next_instr):
         items = self.popvalues(itemcount)
diff --git a/pypy/module/_cffi_backend/ccallback.py b/pypy/module/_cffi_backend/ccallback.py
--- a/pypy/module/_cffi_backend/ccallback.py
+++ b/pypy/module/_cffi_backend/ccallback.py
@@ -1,19 +1,21 @@
 """
 Callbacks.
 """
-import os
+import sys, os
 
-from rpython.rlib import clibffi, rweakref, jit
+from rpython.rlib import clibffi, rweakref, jit, jit_libffi
 from rpython.rlib.objectmodel import compute_unique_id, keepalive_until_here
 from rpython.rtyper.lltypesystem import lltype, rffi
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.module._cffi_backend import cerrno, misc
 from pypy.module._cffi_backend.cdataobj import W_CData
-from pypy.module._cffi_backend.ctypefunc import SIZE_OF_FFI_ARG, BIG_ENDIAN, W_CTypeFunc
+from pypy.module._cffi_backend.ctypefunc import SIZE_OF_FFI_ARG, W_CTypeFunc
 from pypy.module._cffi_backend.ctypeprim import W_CTypePrimitiveSigned
 from pypy.module._cffi_backend.ctypevoid import W_CTypeVoid
 
+BIG_ENDIAN = sys.byteorder == 'big'
+
 # ____________________________________________________________
 
 
diff --git a/pypy/module/_cffi_backend/ctypefunc.py b/pypy/module/_cffi_backend/ctypefunc.py
--- a/pypy/module/_cffi_backend/ctypefunc.py
+++ b/pypy/module/_cffi_backend/ctypefunc.py
@@ -188,7 +188,6 @@
 # ____________________________________________________________
 
 
-BIG_ENDIAN = sys.byteorder == 'big'
 USE_C_LIBFFI_MSVC = getattr(clibffi, 'USE_C_LIBFFI_MSVC', False)
 
 
@@ -399,16 +398,6 @@
         exchange_offset = rffi.sizeof(rffi.CCHARP) * nargs
         exchange_offset = self.align_arg(exchange_offset)
         cif_descr.exchange_result = exchange_offset
-        cif_descr.exchange_result_libffi = exchange_offset
-
-        if BIG_ENDIAN and self.fresult.is_primitive_integer:
-            # For results of precisely these types, libffi has a
-            # strange rule that they will be returned as a whole
-            # 'ffi_arg' if they are smaller.  The difference
-            # only matters on big-endian.
-            if self.fresult.size < SIZE_OF_FFI_ARG:
-                diff = SIZE_OF_FFI_ARG - self.fresult.size
-                cif_descr.exchange_result += diff
 
         # then enough room for the result, rounded up to sizeof(ffi_arg)
         exchange_offset += max(rffi.getintfield(self.rtype, 'c_size'),
diff --git a/pypy/module/_continuation/interp_continuation.py b/pypy/module/_continuation/interp_continuation.py
--- a/pypy/module/_continuation/interp_continuation.py
+++ b/pypy/module/_continuation/interp_continuation.py
@@ -35,10 +35,10 @@
         w_args, w_kwds = __args__.topacked()
         bottomframe = space.createframe(get_entrypoint_pycode(space),
                                         get_w_module_dict(space), None)
-        bottomframe.locals_stack_w[0] = space.wrap(self)
-        bottomframe.locals_stack_w[1] = w_callable
-        bottomframe.locals_stack_w[2] = w_args
-        bottomframe.locals_stack_w[3] = w_kwds
+        bottomframe.locals_cells_stack_w[0] = space.wrap(self)
+        bottomframe.locals_cells_stack_w[1] = w_callable
+        bottomframe.locals_cells_stack_w[2] = w_args
+        bottomframe.locals_cells_stack_w[3] = w_kwds
         bottomframe.last_exception = get_cleared_operation_error(space)
         self.bottomframe = bottomframe
         #
diff --git a/pypy/module/cppyy/interp_cppyy.py b/pypy/module/cppyy/interp_cppyy.py
--- a/pypy/module/cppyy/interp_cppyy.py
+++ b/pypy/module/cppyy/interp_cppyy.py
@@ -314,13 +314,6 @@
                 exchange_offset = rffi.sizeof(rffi.CCHARP) * nargs
                 exchange_offset = (exchange_offset + 7) & ~7     # alignment
                 cif_descr.exchange_result = exchange_offset
-                cif_descr.exchange_result_libffi = exchange_offset
-
-                # TODO: left this out while testing (see ctypefunc.py)
-                # For results of precisely these types, libffi has a
-                # strange rule that they will be returned as a whole
-                # 'ffi_arg' if they are smaller.  The difference
-                # only matters on big-endian.
 
                 # then enough room for the result, rounded up to sizeof(ffi_arg)
                 exchange_offset += max(rffi.getintfield(cif_descr.rtype, 'c_size'),
diff --git a/pypy/module/micronumpy/nditer.py b/pypy/module/micronumpy/nditer.py
--- a/pypy/module/micronumpy/nditer.py
+++ b/pypy/module/micronumpy/nditer.py
@@ -217,8 +217,8 @@
     backward = is_backward(imp, order)
     if arr.is_scalar():
         return ConcreteIter(imp, 1, [], [], [], op_flags, base)
-    if (imp.strides[0] < imp.strides[-1] and not backward) or \
-       (imp.strides[0] > imp.strides[-1] and backward):
+    if (abs(imp.strides[0]) < abs(imp.strides[-1]) and not backward) or \
+       (abs(imp.strides[0]) > abs(imp.strides[-1]) and backward):
         # flip the strides. Is this always true for multidimension?
         strides = imp.strides[:]
         backstrides = imp.backstrides[:]
diff --git a/pypy/module/micronumpy/test/test_ndarray.py b/pypy/module/micronumpy/test/test_ndarray.py
--- a/pypy/module/micronumpy/test/test_ndarray.py
+++ b/pypy/module/micronumpy/test/test_ndarray.py
@@ -1834,6 +1834,13 @@
         v = s.view(y.__class__)
         assert v.strides == (4, 24)
 
+        x = empty([12, 8, 8], 'float64')
+        y = x[::-4, :, :]
+        assert y.base is x
+        assert y.strides == (-2048, 64, 8)
+        y[:] = 1000
+        assert x[-1, 0, 0] == 1000 
+
         a = empty([3, 2, 1], dtype='float64')
         b = a.view(dtype('uint32'))
         assert b.strides == (16, 8, 4)
diff --git a/pypy/module/micronumpy/test/test_ufuncs.py b/pypy/module/micronumpy/test/test_ufuncs.py
--- a/pypy/module/micronumpy/test/test_ufuncs.py
+++ b/pypy/module/micronumpy/test/test_ufuncs.py
@@ -246,12 +246,17 @@
                             dtypes=[dtype(int), dtype(int)],
                             stack_inputs=True,
                           )
-        ai = arange(18, dtype=int).reshape(2,3,3)
+        ai = arange(12*3*3, dtype='int32').reshape(12,3,3)
         exc = raises(ValueError, ufunc, ai[:,:,0])
         assert "perand 0 has a mismatch in its core dimension 1" in exc.value.message
         ai3 = ufunc(ai[0,:,:])
         ai2 = ufunc(ai)
         assert (ai2 == ai * 2).all()
+        # view
+        aiV = ai[::-2, :, :]
+        assert aiV.strides == (-72, 12, 4)
+        ai2 = ufunc(aiV)
+        assert (ai2 == aiV * 2).all()
 
     def test_frompyfunc_needs_nditer(self):
         def summer(in0):
diff --git a/pypy/module/pypyjit/interp_jit.py b/pypy/module/pypyjit/interp_jit.py
--- a/pypy/module/pypyjit/interp_jit.py
+++ b/pypy/module/pypyjit/interp_jit.py
@@ -19,8 +19,8 @@
 
 
 PyFrame._virtualizable_ = ['last_instr', 'pycode',
-                           'valuestackdepth', 'locals_stack_w[*]',
-                           'cells[*]',
+                           'valuestackdepth',
+                           'locals_cells_stack_w[*]',
                            'debugdata',
                            'last_exception',
                            'lastblock',
diff --git a/pypy/module/pypyjit/test_pypy_c/model.py b/pypy/module/pypyjit/test_pypy_c/model.py
--- a/pypy/module/pypyjit/test_pypy_c/model.py
+++ b/pypy/module/pypyjit/test_pypy_c/model.py
@@ -450,6 +450,9 @@
             if self.try_match(op, until_op):
                 # it matched! The '...' operator ends here
                 return op
+            self._assert(op != '--end--',
+                         'nothing in the end of the loop matches %r' %
+                          (until_op,))
 
     def match_any_order(self, iter_exp_ops, iter_ops, ignore_ops):
         exp_ops = []
diff --git a/pypy/module/pypyjit/test_pypy_c/test_ffi.py b/pypy/module/pypyjit/test_pypy_c/test_ffi.py
--- a/pypy/module/pypyjit/test_pypy_c/test_ffi.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_ffi.py
@@ -207,6 +207,88 @@
             guard_no_exception(descr=...)
         """, ignore_ops=['guard_not_invalidated'])
 
+    def test__cffi_call_c_int(self):
+        def main():
+            import os
+            try:
+                import _cffi_backend
+            except ImportError:
+                sys.stderr.write('SKIP: cannot import _cffi_backend\n')
+                return 0
+
+            libc = _cffi_backend.load_library(None)
+            BInt = _cffi_backend.new_primitive_type("int")
+            BClose = _cffi_backend.new_function_type([BInt], BInt)
+            _dup = libc.load_function(BClose, 'dup')
+            i = 0
+            fd0, fd1 = os.pipe()
+            while i < 300:
+                tmp = _dup(fd0)   # ID: cfficall
+                os.close(tmp)
+                i += 1
+            os.close(fd0)
+            os.close(fd1)
+            BLong = _cffi_backend.new_primitive_type("long")
+            return 42
+        #
+        log = self.run(main, [])
+        assert log.result == 42
+        loop, = log.loops_by_filename(self.filepath)
+        if sys.maxint > 2**32:
+            extra = "i98 = int_signext(i97, 4)"
+        else:
+            extra = ""
+        assert loop.match_by_id('cfficall', """
+            p96 = force_token()
+            setfield_gc(p0, p96, descr=<FieldP pypy.interpreter.pyframe.PyFrame.vable_token .>)
+            i97 = call_release_gil(91, i59, i50, descr=<Calli 4 i EF=7 OS=62>)
+            guard_not_forced(descr=...)
+            guard_no_exception(descr=...)
+            %s
+        """ % extra, ignore_ops=['guard_not_invalidated'])
+
+    def test__cffi_call_size_t(self):
+        def main():
+            import os
+            try:
+                import _cffi_backend
+            except ImportError:
+                sys.stderr.write('SKIP: cannot import _cffi_backend\n')
+                return 0
+
+            libc = _cffi_backend.load_library(None)
+            BInt = _cffi_backend.new_primitive_type("int")
+            BSizeT = _cffi_backend.new_primitive_type("size_t")
+            BChar = _cffi_backend.new_primitive_type("char")
+            BCharP = _cffi_backend.new_pointer_type(BChar)
+            BWrite = _cffi_backend.new_function_type([BInt, BCharP, BSizeT],
+                                                     BSizeT)  # not signed here!
+            _write = libc.load_function(BWrite, 'write')
+            i = 0
+            fd0, fd1 = os.pipe()
+            buffer = _cffi_backend.newp(BCharP, 'A')
+            while i < 300:
+                tmp = _write(fd1, buffer, 1)   # ID: cfficall
+                assert tmp == 1
+                assert os.read(fd0, 2) == 'A'
+                i += 1
+            os.close(fd0)
+            os.close(fd1)
+            return 42
+        #
+        log = self.run(main, [])
+        assert log.result == 42
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match_by_id('cfficall', """
+            p96 = force_token()
+            setfield_gc(p0, p96, descr=<FieldP pypy.interpreter.pyframe.PyFrame.vable_token .>)
+            i97 = call_release_gil(91, i59, i10, i12, 1, descr=<Calli . iii EF=7 OS=62>)
+            guard_not_forced(descr=...)
+            guard_no_exception(descr=...)
+            p98 = call(ConstClass(fromrarith_int__r_uint), i97, descr=<Callr . i EF=4>)
+            guard_no_exception(descr=...)
+        """, ignore_ops=['guard_not_invalidated'])
+
     def test_cffi_call_guard_not_forced_fails(self):
         # this is the test_pypy_c equivalent of
         # rpython/jit/metainterp/test/test_fficall::test_guard_not_forced_fails
diff --git a/rpython/jit/codewriter/jtransform.py b/rpython/jit/codewriter/jtransform.py
--- a/rpython/jit/codewriter/jtransform.py
+++ b/rpython/jit/codewriter/jtransform.py
@@ -1953,11 +1953,6 @@
             assert False, 'unsupported oopspec: %s' % oopspec_name
         return self._handle_oopspec_call(op, args, oopspecindex, extraeffect)
 
-    def rewrite_op_jit_ffi_save_result(self, op):
-        kind = op.args[0].value
-        assert kind in ('int', 'float', 'longlong', 'singlefloat')
-        return SpaceOperation('libffi_save_result_%s' % kind, op.args[1:], None)
-
     def rewrite_op_jit_force_virtual(self, op):
         op0 = SpaceOperation('-live-', [], None)
         op1 = self._do_builtin_call(op)
diff --git a/rpython/jit/metainterp/blackhole.py b/rpython/jit/metainterp/blackhole.py
--- a/rpython/jit/metainterp/blackhole.py
+++ b/rpython/jit/metainterp/blackhole.py
@@ -1431,41 +1431,6 @@
     def bhimpl_copyunicodecontent(cpu, src, dst, srcstart, dststart, length):
         cpu.bh_copyunicodecontent(src, dst, srcstart, dststart, length)
 
-    def _libffi_save_result(self, cif_description, exchange_buffer, result):
-        ARRAY = lltype.Ptr(rffi.CArray(lltype.typeOf(result)))
-        cast_int_to_ptr = self.cpu.cast_int_to_ptr
-        cif_description = cast_int_to_ptr(cif_description, CIF_DESCRIPTION_P)
-        exchange_buffer = cast_int_to_ptr(exchange_buffer, rffi.CCHARP)
-        #
-        data_out = rffi.ptradd(exchange_buffer, cif_description.exchange_result)
-        rffi.cast(ARRAY, data_out)[0] = result
-    _libffi_save_result._annspecialcase_ = 'specialize:argtype(3)'
-
-    @arguments("self", "i", "i", "i")
-    def bhimpl_libffi_save_result_int(self, cif_description,
-                                      exchange_buffer, result):
-        self._libffi_save_result(cif_description, exchange_buffer, result)
-
-    @arguments("self", "i", "i", "f")
-    def bhimpl_libffi_save_result_float(self, cif_description,
-                                        exchange_buffer, result):
-        result = longlong.getrealfloat(result)
-        self._libffi_save_result(cif_description, exchange_buffer, result)
-
-    @arguments("self", "i", "i", "f")
-    def bhimpl_libffi_save_result_longlong(self, cif_description,
-                                           exchange_buffer, result):
-        # 32-bit only: 'result' is here a LongLong
-        assert longlong.is_longlong(lltype.typeOf(result))
-        self._libffi_save_result(cif_description, exchange_buffer, result)
-
-    @arguments("self", "i", "i", "i")
-    def bhimpl_libffi_save_result_singlefloat(self, cif_description,
-                                              exchange_buffer, result):
-        result = longlong.int2singlefloat(result)
-        self._libffi_save_result(cif_description, exchange_buffer, result)
-
-
     # ----------
     # helpers to resume running in blackhole mode when a guard failed
 
diff --git a/rpython/jit/metainterp/pyjitpl.py b/rpython/jit/metainterp/pyjitpl.py
--- a/rpython/jit/metainterp/pyjitpl.py
+++ b/rpython/jit/metainterp/pyjitpl.py
@@ -1331,34 +1331,6 @@
             metainterp.history.record(rop.VIRTUAL_REF_FINISH,
                                       [vrefbox, nullbox], None)
 
-    @arguments("box", "box", "box")
-    def _opimpl_libffi_save_result(self, box_cif_description,
-                                   box_exchange_buffer, box_result):
-        from rpython.rtyper.lltypesystem import llmemory
-        from rpython.rlib.jit_libffi import CIF_DESCRIPTION_P
-        from rpython.jit.backend.llsupport.ffisupport import get_arg_descr
-
-        cif_description = box_cif_description.getint()
-        cif_description = llmemory.cast_int_to_adr(cif_description)
-        cif_description = llmemory.cast_adr_to_ptr(cif_description,
-                                                   CIF_DESCRIPTION_P)
-
-        kind, descr, itemsize = get_arg_descr(self.metainterp.cpu, cif_description.rtype)
-
-        if kind != 'v':
-            ofs = cif_description.exchange_result
-            assert ofs % itemsize == 0     # alignment check (result)
-            self.metainterp.history.record(rop.SETARRAYITEM_RAW,
-                                           [box_exchange_buffer,
-                                            ConstInt(ofs // itemsize),
-                                            box_result],
-                                           None, descr)
-
-    opimpl_libffi_save_result_int         = _opimpl_libffi_save_result
-    opimpl_libffi_save_result_float       = _opimpl_libffi_save_result
-    opimpl_libffi_save_result_longlong    = _opimpl_libffi_save_result
-    opimpl_libffi_save_result_singlefloat = _opimpl_libffi_save_result
-
     # ------------------------------
 
     def setup_call(self, argboxes):
@@ -2910,7 +2882,7 @@
         self.history.operations.extend(extra_guards)
         #
         # note that the result is written back to the exchange_buffer by the
-        # special op libffi_save_result_{int,float}
+        # following operation, which should be a raw_store
 
     def direct_call_release_gil(self):
         op = self.history.operations.pop()
diff --git a/rpython/jit/metainterp/test/test_fficall.py b/rpython/jit/metainterp/test/test_fficall.py
--- a/rpython/jit/metainterp/test/test_fficall.py
+++ b/rpython/jit/metainterp/test/test_fficall.py
@@ -9,7 +9,7 @@
 from rpython.rlib import jit
 from rpython.rlib import jit_libffi
 from rpython.rlib.jit_libffi import (types, CIF_DESCRIPTION, FFI_TYPE_PP,
-                                     jit_ffi_call, jit_ffi_save_result)
+                                     jit_ffi_call)
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.rlib.rarithmetic import intmask, r_longlong, r_singlefloat
 from rpython.rlib.longlong2float import float2longlong
@@ -48,13 +48,20 @@
     def _run(self, atypes, rtype, avalues, rvalue,
              expected_call_release_gil=1,
              supports_floats=True,
-             supports_longlong=True,
-             supports_singlefloats=True):
+             supports_longlong=False,
+             supports_singlefloats=False):
 
         cif_description = get_description(atypes, rtype)
 
+        expected_args = []
+        for avalue in avalues:
+            if lltype.typeOf(avalue) == rffi.ULONG:
+                avalue = intmask(avalue)
+            expected_args.append(avalue)
+        expected_args = tuple(expected_args)
+
         def verify(*args):
-            assert args == tuple(avalues)
+            assert args == expected_args
             return rvalue
         FUNC = lltype.FuncType([lltype.typeOf(avalue) for avalue in avalues],
                                lltype.typeOf(rvalue))
@@ -76,6 +83,10 @@
                 if lltype.typeOf(avalue) is lltype.SingleFloat:
                     got = float(got)
                     avalue = float(avalue)
+                elif (lltype.typeOf(avalue) is rffi.SIGNEDCHAR or
+                      lltype.typeOf(avalue) is rffi.UCHAR):
+                    got = intmask(got)
+                    avalue = intmask(avalue)
                 assert got == avalue
                 ofs += 16
             if rvalue is not None:
@@ -115,6 +126,9 @@
                 return res == 654321
             if isinstance(rvalue, r_singlefloat):
                 rvalue = float(rvalue)
+            if lltype.typeOf(rvalue) is rffi.ULONG:
+                res = intmask(res)
+                rvalue = intmask(rvalue)
             return res == rvalue
 
         with FakeFFI(fake_call_impl_any):
@@ -156,20 +170,24 @@
                       -42434445)
 
     def test_simple_call_float(self, **kwds):
+        kwds.setdefault('supports_floats', True)
         self._run([types.double] * 2, types.double, [45.6, 78.9], -4.2, **kwds)
 
     def test_simple_call_longlong(self, **kwds):
+        kwds.setdefault('supports_longlong', True)
         maxint32 = 2147483647
         a = r_longlong(maxint32) + 1
         b = r_longlong(maxint32) + 2
         self._run([types.slonglong] * 2, types.slonglong, [a, b], a, **kwds)
 
-    def test_simple_call_singlefloat_args(self):
+    def test_simple_call_singlefloat_args(self, **kwds):
+        kwds.setdefault('supports_singlefloats', True)
         self._run([types.float] * 2, types.double,
                   [r_singlefloat(10.5), r_singlefloat(31.5)],
                   -4.5)
 
     def test_simple_call_singlefloat(self, **kwds):
+        kwds.setdefault('supports_singlefloats', True)
         self._run([types.float] * 2, types.float,
                   [r_singlefloat(10.5), r_singlefloat(31.5)],
                   r_singlefloat(-4.5), **kwds)
@@ -183,9 +201,20 @@
         self._run([types.signed] * 2, types.void, [456, 789], None)
 
     def test_returns_signedchar(self):
-        self._run([types.signed], types.sint8, [456],
+        self._run([types.sint8], types.sint8,
+                  [rffi.cast(rffi.SIGNEDCHAR, -28)],
                   rffi.cast(rffi.SIGNEDCHAR, -42))
 
+    def test_handle_unsigned(self):
+        self._run([types.ulong], types.ulong,
+                  [rffi.cast(rffi.ULONG, sys.maxint + 91348)],
+                  rffi.cast(rffi.ULONG, sys.maxint + 4242))
+
+    def test_handle_unsignedchar(self):
+        self._run([types.uint8], types.uint8,
+                  [rffi.cast(rffi.UCHAR, 191)],
+                  rffi.cast(rffi.UCHAR, 180))
+
     def _add_libffi_types_to_ll2types_maybe(self):
         # not necessary on the llgraph backend, but needed for x86.
         # see rpython/jit/backend/x86/test/test_fficall.py
@@ -255,7 +284,7 @@
                 # when n==50, fn() will force the frame, so guard_not_forced
                 # fails and we enter blackholing: this test makes sure that
                 # the result of call_release_gil is kept alive before the
-                # libffi_save_result, and that the corresponding box is passed
+                # raw_store, and that the corresponding box is passed
                 # in the fail_args. Before the fix, the result of
                 # call_release_gil was simply lost and when guard_not_forced
                 # failed, and the value of "res" was unpredictable.
@@ -291,7 +320,6 @@
         cd.atypes = atypes
         cd.exchange_size = 64    # 64 bytes of exchange data
         cd.exchange_result = 24
-        cd.exchange_result_libffi = 24
         cd.exchange_args[0] = 16
 
         def f():
@@ -324,8 +352,3 @@
     def test_simple_call_singlefloat_unsupported(self):
         self.test_simple_call_singlefloat(supports_singlefloats=False,
                                           expected_call_release_gil=0)
-
-    def test_simple_call_float_even_if_other_unsupported(self):
-        self.test_simple_call_float(supports_longlong=False,
-                                    supports_singlefloats=False)
-        # this is the default:      expected_call_release_gil=1
diff --git a/rpython/rlib/jit_libffi.py b/rpython/rlib/jit_libffi.py
--- a/rpython/rlib/jit_libffi.py
+++ b/rpython/rlib/jit_libffi.py
@@ -1,10 +1,9 @@
-
-from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rtyper.extregistry import ExtRegistryEntry
+import sys
+from rpython.rtyper.lltypesystem import lltype, llmemory, rffi
+from rpython.rtyper.lltypesystem.lloperation import llop
 from rpython.rlib import clibffi, jit
 from rpython.rlib.rarithmetic import r_longlong, r_singlefloat
-from rpython.rlib.nonconst import NonConstant
-
+from rpython.rlib.unroll import unrolling_iterable
 
 FFI_CIF = clibffi.FFI_CIFP.TO
 FFI_TYPE = clibffi.FFI_TYPE_P.TO
@@ -13,6 +12,8 @@
 FFI_ABI = clibffi.FFI_ABI
 FFI_TYPE_STRUCT = clibffi.FFI_TYPE_STRUCT
 SIZE_OF_FFI_ARG = rffi.sizeof(clibffi.ffi_arg)
+SIZE_OF_SIGNED = rffi.sizeof(lltype.Signed)
+FFI_ARG_P = rffi.CArrayPtr(clibffi.ffi_arg)
 
 # Usage: for each C function, make one CIF_DESCRIPTION block of raw
 # memory.  Initialize it by filling all its fields apart from 'cif'.
@@ -33,11 +34,12 @@
 #  - 'exchange_result': the offset in that buffer for the result of the call.
 #    (this and the other offsets must be at least NARGS * sizeof(void*).)
 #
-#  - 'exchange_result_libffi': the actual offset passed to ffi_call().
-#    Differs on big-endian machines if the result is an integer type smaller
-#    than SIZE_OF_FFI_ARG (blame libffi).
+#  - 'exchange_args[nargs]': the offset in that buffer for each argument.
 #
-#  - 'exchange_args[nargs]': the offset in that buffer for each argument.
+# Each argument and the result should have enough room for at least
+# SIZE_OF_FFI_ARG bytes, even if they may be smaller.  (Unlike ffi_call,
+# we don't have any special rule about results that are integers smaller
+# than SIZE_OF_FFI_ARG).
 
 CIF_DESCRIPTION = lltype.Struct(
     'CIF_DESCRIPTION',
@@ -48,7 +50,6 @@
     ('atypes', FFI_TYPE_PP),   #
     ('exchange_size', lltype.Signed),
     ('exchange_result', lltype.Signed),
-    ('exchange_result_libffi', lltype.Signed),
     ('exchange_args', lltype.Array(lltype.Signed,
                           hints={'nolength': True, 'immutable': True})),
     hints={'immutable': True})
@@ -93,12 +94,16 @@
 ##
 ## The result is that now the jitcode looks like this:
 ##
-##     %i0 = libffi_call_int(...)
+##     %i0 = direct_call(libffi_call_int, ...)
 ##     -live-
-##     libffi_save_result_int(..., %i0)
+##     raw_store(exchange_result, %i0)
 ##
 ## the "-live-" is the key, because it make sure that the value is not lost if
 ## guard_not_forced fails.
+##
+## The value of %i0 is stored back in the exchange_buffer at the offset
+## exchange_result, which is usually where functions like jit_ffi_call_impl_int
+## have just read it from when called *in interpreter mode* only.
 
 
 def jit_ffi_call(cif_description, func_addr, exchange_buffer):
@@ -108,8 +113,10 @@
     reskind = types.getkind(cif_description.rtype)
     if reskind == 'v':
         jit_ffi_call_impl_void(cif_description, func_addr, exchange_buffer)
-    elif reskind == 'i' or reskind == 'u':
-        _do_ffi_call_int(cif_description, func_addr, exchange_buffer)
+    elif reskind == 'i':
+        _do_ffi_call_sint(cif_description, func_addr, exchange_buffer)
+    elif reskind == 'u':
+        _do_ffi_call_uint(cif_description, func_addr, exchange_buffer)
     elif reskind == 'f':
         _do_ffi_call_float(cif_description, func_addr, exchange_buffer)
     elif reskind == 'L': # L is for longlongs, on 32bit
@@ -126,54 +133,97 @@
         jit_ffi_call_impl_any(cif_description, func_addr, exchange_buffer)
 
 
-def _do_ffi_call_int(cif_description, func_addr, exchange_buffer):
+_short_sint_types = unrolling_iterable([rffi.SIGNEDCHAR, rffi.SHORT, rffi.INT])
+_short_uint_types = unrolling_iterable([rffi.UCHAR, rffi.USHORT, rffi.UINT])
+
+def _do_ffi_call_sint(cif_description, func_addr, exchange_buffer):
     result = jit_ffi_call_impl_int(cif_description, func_addr,
                                    exchange_buffer)
-    jit_ffi_save_result('int', cif_description, exchange_buffer, result)
+    size = types.getsize(cif_description.rtype)
+    for TP in _short_sint_types:     # short **signed** types
+        if size == rffi.sizeof(TP):
+            llop.raw_store(lltype.Void,
+                           llmemory.cast_ptr_to_adr(exchange_buffer),
+                           cif_description.exchange_result,
+                           rffi.cast(TP, result))
+            break
+    else:
+        # default case: expect a full signed number
+        llop.raw_store(lltype.Void,
+                       llmemory.cast_ptr_to_adr(exchange_buffer),
+                       cif_description.exchange_result,
+                       result)
+
+def _do_ffi_call_uint(cif_description, func_addr, exchange_buffer):
+    result = jit_ffi_call_impl_int(cif_description, func_addr,
+                                   exchange_buffer)
+    size = types.getsize(cif_description.rtype)
+    for TP in _short_uint_types:     # short **unsigned** types
+        if size == rffi.sizeof(TP):
+            llop.raw_store(lltype.Void,
+                           llmemory.cast_ptr_to_adr(exchange_buffer),
+                           cif_description.exchange_result,
+                           rffi.cast(TP, result))
+            break
+    else:
+        # default case: expect a full unsigned number
+        llop.raw_store(lltype.Void,
+                       llmemory.cast_ptr_to_adr(exchange_buffer),
+                       cif_description.exchange_result,
+                       rffi.cast(lltype.Unsigned, result))
 
 def _do_ffi_call_float(cif_description, func_addr, exchange_buffer):
     # a separate function in case the backend doesn't support floats
     result = jit_ffi_call_impl_float(cif_description, func_addr,
                                      exchange_buffer)
-    jit_ffi_save_result('float', cif_description, exchange_buffer, result)
+    llop.raw_store(lltype.Void,
+                   llmemory.cast_ptr_to_adr(exchange_buffer),
+                   cif_description.exchange_result,
+                   result)
 
 def _do_ffi_call_longlong(cif_description, func_addr, exchange_buffer):
     # a separate function in case the backend doesn't support longlongs
     result = jit_ffi_call_impl_longlong(cif_description, func_addr,
                                         exchange_buffer)
-    jit_ffi_save_result('longlong', cif_description, exchange_buffer, result)
+    llop.raw_store(lltype.Void,
+                   llmemory.cast_ptr_to_adr(exchange_buffer),
+                   cif_description.exchange_result,
+                   result)
 
 def _do_ffi_call_singlefloat(cif_description, func_addr, exchange_buffer):
     # a separate function in case the backend doesn't support singlefloats
     result = jit_ffi_call_impl_singlefloat(cif_description, func_addr,
                                            exchange_buffer)
-    jit_ffi_save_result('singlefloat', cif_description, exchange_buffer,result)
+    llop.raw_store(lltype.Void,
+                   llmemory.cast_ptr_to_adr(exchange_buffer),
+                   cif_description.exchange_result,
+                   result)
 
 
-# we must return a NonConstant else we get the constant -1 as the result of
-# the flowgraph, and the codewriter does not produce a box for the
-# result. Note that when not-jitted, the result is unused, but when jitted the
-# box of the result contains the actual value returned by the C function.
-
 @jit.oopspec("libffi_call(cif_description,func_addr,exchange_buffer)")
 def jit_ffi_call_impl_int(cif_description, func_addr, exchange_buffer):
     jit_ffi_call_impl_any(cif_description, func_addr, exchange_buffer)
-    return NonConstant(-1)
+    # read a complete 'ffi_arg' word
+    resultdata = rffi.ptradd(exchange_buffer, cif_description.exchange_result)
+    return rffi.cast(lltype.Signed, rffi.cast(FFI_ARG_P, resultdata)[0])
 
 @jit.oopspec("libffi_call(cif_description,func_addr,exchange_buffer)")
 def jit_ffi_call_impl_float(cif_description, func_addr, exchange_buffer):
     jit_ffi_call_impl_any(cif_description, func_addr, exchange_buffer)
-    return NonConstant(-1.0)
+    resultdata = rffi.ptradd(exchange_buffer, cif_description.exchange_result)
+    return rffi.cast(rffi.DOUBLEP, resultdata)[0]
 
 @jit.oopspec("libffi_call(cif_description,func_addr,exchange_buffer)")
 def jit_ffi_call_impl_longlong(cif_description, func_addr, exchange_buffer):
     jit_ffi_call_impl_any(cif_description, func_addr, exchange_buffer)
-    return r_longlong(-1)
+    resultdata = rffi.ptradd(exchange_buffer, cif_description.exchange_result)
+    return rffi.cast(rffi.LONGLONGP, resultdata)[0]
 
 @jit.oopspec("libffi_call(cif_description,func_addr,exchange_buffer)")
 def jit_ffi_call_impl_singlefloat(cif_description, func_addr, exchange_buffer):
     jit_ffi_call_impl_any(cif_description, func_addr, exchange_buffer)
-    return r_singlefloat(-1.0)
+    resultdata = rffi.ptradd(exchange_buffer, cif_description.exchange_result)
+    return rffi.cast(rffi.FLOATP, resultdata)[0]
 
 @jit.oopspec("libffi_call(cif_description,func_addr,exchange_buffer)")
 def jit_ffi_call_impl_void(cif_description, func_addr, exchange_buffer):
@@ -191,36 +241,12 @@
         data = rffi.ptradd(exchange_buffer, cif_description.exchange_args[i])
         buffer_array[i] = data
     resultdata = rffi.ptradd(exchange_buffer,
-                             cif_description.exchange_result_libffi)
+                             cif_description.exchange_result)
     clibffi.c_ffi_call(cif_description.cif, func_addr,
                        rffi.cast(rffi.VOIDP, resultdata),
                        buffer_array)
-    return -1
 
 
-
-def jit_ffi_save_result(kind, cif_description, exchange_buffer, result):
-    """
-    This is a no-op during normal execution, but actually fills the buffer
-    when jitted
-    """
-    pass
-
-class Entry(ExtRegistryEntry):
-    _about_ = jit_ffi_save_result
-
-    def compute_result_annotation(self, kind_s, *args_s):
-        from rpython.annotator import model as annmodel
-        assert isinstance(kind_s, annmodel.SomeString)
-        assert kind_s.const in ('int', 'float', 'longlong', 'singlefloat')
-
-    def specialize_call(self, hop):
-        hop.exception_cannot_occur()
-        vlist = hop.inputargs(lltype.Void, *hop.args_r[1:])
-        return hop.genop('jit_ffi_save_result', vlist,
-                         resulttype=lltype.Void)
-    
-
 # ____________________________________________________________
 
 class types(object):
@@ -282,6 +308,11 @@
 
     @staticmethod
     @jit.elidable
+    def getsize(ffi_type):
+        return rffi.getintfield(ffi_type, 'c_size')
+
+    @staticmethod
+    @jit.elidable
     def is_struct(ffi_type):
         return rffi.getintfield(ffi_type, 'c_type') == FFI_TYPE_STRUCT
 
diff --git a/rpython/rlib/rawstorage.py b/rpython/rlib/rawstorage.py
--- a/rpython/rlib/rawstorage.py
+++ b/rpython/rlib/rawstorage.py
@@ -19,9 +19,9 @@
 def raw_storage_getitem(TP, storage, index):
     "NOT_RPYTHON"
     _check_alignment(TP, index)
-    return raw_storage_getitem_unchecked(TP, storage, index)
+    return _raw_storage_getitem_unchecked(TP, storage, index)
 
-def raw_storage_getitem_unchecked(TP, storage, index):
+def _raw_storage_getitem_unchecked(TP, storage, index):
     "NOT_RPYTHON"
     return rffi.cast(rffi.CArrayPtr(TP), rffi.ptradd(storage, index))[0]
 
@@ -29,9 +29,9 @@
     "NOT_RPYTHON"
     TP = lltype.typeOf(item)
     _check_alignment(TP, index)
-    raw_storage_setitem_unchecked(storage, index, item)
+    _raw_storage_setitem_unchecked(storage, index, item)
 
-def raw_storage_setitem_unchecked(storage, index, item):
+def _raw_storage_setitem_unchecked(storage, index, item):
     "NOT_RPYTHON"
     TP = lltype.typeOf(item)
     rffi.cast(rffi.CArrayPtr(TP), rffi.ptradd(storage, index))[0] = item
@@ -80,13 +80,13 @@
         if we_are_translated():
             return raw_storage_getitem(TP, storage, index)
         else:
-            return raw_storage_getitem_unchecked(TP, storage, index)
+            return _raw_storage_getitem_unchecked(TP, storage, index)
     mask = _get_alignment_mask(TP)
     if (index & mask) == 0:
         if we_are_translated():
             return raw_storage_getitem(TP, storage, index)
         else:
-            return raw_storage_getitem_unchecked(TP, storage, index)
+            return _raw_storage_getitem_unchecked(TP, storage, index)
     ptr = rffi.ptradd(storage, index)
     with lltype.scoped_alloc(rffi.CArray(TP), 1) as s_array:
         rffi.c_memcpy(rffi.cast(rffi.VOIDP, s_array),
@@ -100,7 +100,7 @@
         if we_are_translated():
             raw_storage_setitem(storage, index, item)
         else:
-            raw_storage_setitem_unchecked(storage, index, item)
+            _raw_storage_setitem_unchecked(storage, index, item)
         return
     TP = lltype.typeOf(item)
     mask = _get_alignment_mask(TP)
@@ -108,7 +108,7 @@
         if we_are_translated():
             raw_storage_setitem(storage, index, item)
         else:
-            raw_storage_setitem_unchecked(storage, index, item)
+            _raw_storage_setitem_unchecked(storage, index, item)
         return
     ptr = rffi.ptradd(storage, index)
     with lltype.scoped_alloc(rffi.CArray(TP), 1) as s_array:
diff --git a/rpython/rlib/test/test_jit_libffi.py b/rpython/rlib/test/test_jit_libffi.py
--- a/rpython/rlib/test/test_jit_libffi.py
+++ b/rpython/rlib/test/test_jit_libffi.py
@@ -24,7 +24,6 @@
     cd.atypes = atypes
     cd.exchange_size = 64    # 64 bytes of exchange data
     cd.exchange_result = 24
-    cd.exchange_result_libffi = 24
     cd.exchange_args[0] = 16
     #
     jit_ffi_prep_cif(cd)