[pypy-commit] pypy unroll-if-alt: merged in default.

Thu Sep 1 20:17:31 CEST 2011

Author: Alex Gaynor <alex.gaynor at gmail.com>
Branch: unroll-if-alt
Changeset: r46995:bb553b7cded1
Date: 2011-09-01 12:33 -0400
http://bitbucket.org/pypy/pypy/changeset/bb553b7cded1/

Log:	merged in default.

diff --git a/lib_pypy/_ctypes/basics.py b/lib_pypy/_ctypes/basics.py
--- a/lib_pypy/_ctypes/basics.py
+++ b/lib_pypy/_ctypes/basics.py
@@ -166,7 +166,8 @@
     return tp._alignmentofinstances()
 
 def byref(cdata):
-    from ctypes import pointer
+    # "pointer" is imported at the end of this module to avoid circular
+    # imports
     return pointer(cdata)
 
 def cdata_from_address(self, address):
@@ -226,3 +227,6 @@
     'v' : _ffi.types.sshort,
     }
 
+
+# used by "byref"
+from _ctypes.pointer import pointer
diff --git a/lib_pypy/_sqlite3.py b/lib_pypy/_sqlite3.py
--- a/lib_pypy/_sqlite3.py
+++ b/lib_pypy/_sqlite3.py
@@ -24,6 +24,7 @@
 from ctypes import c_void_p, c_int, c_double, c_int64, c_char_p, cdll
 from ctypes import POINTER, byref, string_at, CFUNCTYPE, cast
 from ctypes import sizeof, c_ssize_t
+from collections import OrderedDict
 import datetime
 import sys
 import time
@@ -274,6 +275,28 @@
 def unicode_text_factory(x):
     return unicode(x, 'utf-8')
 
+
+class StatementCache(object):
+    def __init__(self, connection, maxcount):
+        self.connection = connection
+        self.maxcount = maxcount
+        self.cache = OrderedDict()
+
+    def get(self, sql, cursor, row_factory):
+        try:
+            stat = self.cache[sql]
+        except KeyError:
+            stat = Statement(self.connection, sql)
+            self.cache[sql] = stat
+            if len(self.cache) > self.maxcount:
+                self.cache.popitem(0)
+        #
+        if stat.in_use:
+            stat = Statement(self.connection, sql)
+        stat.set_cursor_and_factory(cursor, row_factory)
+        return stat
+
+
 class Connection(object):
     def __init__(self, database, timeout=5.0, detect_types=0, isolation_level="",
                  check_same_thread=True, factory=None, cached_statements=100):
@@ -291,6 +314,7 @@
         self.row_factory = None
         self._isolation_level = isolation_level
         self.detect_types = detect_types
+        self.statement_cache = StatementCache(self, cached_statements)
 
         self.cursors = []
 
@@ -399,7 +423,7 @@
         cur = Cursor(self)
         if not isinstance(sql, (str, unicode)):
             raise Warning("SQL is of wrong type. Must be string or unicode.")
-        statement = Statement(cur, sql, self.row_factory)
+        statement = self.statement_cache.get(sql, cur, self.row_factory)
         return statement
 
     def _get_isolation_level(self):
@@ -708,7 +732,7 @@
         if type(sql) is unicode:
             sql = sql.encode("utf-8")
         self._check_closed()
-        self.statement = Statement(self, sql, self.row_factory)
+        self.statement = self.connection.statement_cache.get(sql, self, self.row_factory)
 
         if self.connection._isolation_level is not None:
             if self.statement.kind == "DDL":
@@ -746,7 +770,8 @@
         if type(sql) is unicode:
             sql = sql.encode("utf-8")
         self._check_closed()
-        self.statement = Statement(self, sql, self.row_factory)
+        self.statement = self.connection.statement_cache.get(sql, self, self.row_factory)
+        
         if self.statement.kind == "DML":
             self.connection._begin()
         else:
@@ -871,14 +896,12 @@
     lastrowid = property(_getlastrowid)
 
 class Statement(object):
-    def __init__(self, cur, sql, row_factory):
+    def __init__(self, connection, sql):
         self.statement = None
         if not isinstance(sql, str):
             raise ValueError, "sql must be a string"
-        self.con = cur.connection
-        self.cur = weakref.ref(cur)
+        self.con = connection
         self.sql = sql # DEBUG ONLY
-        self.row_factory = row_factory
         first_word = self._statement_kind = sql.lstrip().split(" ")[0].upper()
         if first_word in ("INSERT", "UPDATE", "DELETE", "REPLACE"):
             self.kind = "DML"
@@ -887,6 +910,11 @@
         else:
             self.kind = "DDL"
         self.exhausted = False
+        self.in_use = False
+        #
+        # set by set_cursor_and_factory
+        self.cur = None
+        self.row_factory = None
 
         self.statement = c_void_p()
         next_char = c_char_p()
@@ -907,6 +935,10 @@
 
         self._build_row_cast_map()
 
+    def set_cursor_and_factory(self, cur, row_factory):
+        self.cur = weakref.ref(cur)
+        self.row_factory = row_factory
+
     def _build_row_cast_map(self):
         self.row_cast_map = []
         for i in xrange(sqlite.sqlite3_column_count(self.statement)):
@@ -976,6 +1008,7 @@
         ret = sqlite.sqlite3_reset(self.statement)
         if ret != SQLITE_OK:
             raise self.con._get_exception(ret)
+        self.mark_dirty()
 
         if params is None:
             if sqlite.sqlite3_bind_parameter_count(self.statement) != 0:
@@ -1068,11 +1101,17 @@
 
     def reset(self):
         self.row_cast_map = None
-        return sqlite.sqlite3_reset(self.statement)
+        ret = sqlite.sqlite3_reset(self.statement)
+        self.in_use = False
+        return ret
 
     def finalize(self):
         sqlite.sqlite3_finalize(self.statement)
         self.statement = None
+        self.in_use = False
+
+    def mark_dirty(self):
+        self.in_use = True
 
     def __del__(self):
         sqlite.sqlite3_finalize(self.statement)
diff --git a/lib_pypy/pyrepl/reader.py b/lib_pypy/pyrepl/reader.py
--- a/lib_pypy/pyrepl/reader.py
+++ b/lib_pypy/pyrepl/reader.py
@@ -401,13 +401,19 @@
             return "(arg: %s) "%self.arg
         if "\n" in self.buffer:
             if lineno == 0:
-                return self._ps2
+                res = self.ps2
             elif lineno == self.buffer.count("\n"):
-                return self._ps4
+                res = self.ps4
             else:
-                return self._ps3
+                res = self.ps3
         else:
-            return self._ps1
+            res = self.ps1
+        # Lazily call str() on self.psN, and cache the results using as key
+        # the object on which str() was called.  This ensures that even if the
+        # same object is used e.g. for ps1 and ps2, str() is called only once.
+        if res not in self._pscache:
+            self._pscache[res] = str(res)
+        return self._pscache[res]
 
     def push_input_trans(self, itrans):
         self.input_trans_stack.append(self.input_trans)
@@ -473,8 +479,7 @@
             self.pos = 0
             self.dirty = 1
             self.last_command = None
-            self._ps1, self._ps2, self._ps3, self._ps4 = \
-                           map(str, [self.ps1, self.ps2, self.ps3, self.ps4])
+            self._pscache = {}
         except:
             self.restore()
             raise
diff --git a/pypy/doc/stackless.rst b/pypy/doc/stackless.rst
--- a/pypy/doc/stackless.rst
+++ b/pypy/doc/stackless.rst
@@ -209,6 +209,8 @@
 
 * Automatic unlimited stack (must be emulated__ so far)
 
+* Support for other CPUs than x86 and x86-64
+
 .. __: `recursion depth limit`_
 
 (*) Pickling, as well as changing threads, could be implemented by using
@@ -217,9 +219,8 @@
 "hard" switch (like now) when the C stack contains non-trivial C frames
 to save, and a "soft" switch (like previously) when it contains only
 simple calls from Python to Python.  Soft-switched continulets would
-also consume a bit less RAM, at the possible expense of making the
-switch a bit slower (unsure about that; what is the Stackless Python
-experience?).
+also consume a bit less RAM, and the switch might be a bit faster too
+(unsure about that; what is the Stackless Python experience?).
 
 
 Recursion depth limit
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -626,9 +626,9 @@
             self.default_compiler = compiler
             return compiler
 
-    def createframe(self, code, w_globals, closure=None):
+    def createframe(self, code, w_globals, outer_func=None):
         "Create an empty PyFrame suitable for this code object."
-        return self.FrameClass(self, code, w_globals, closure)
+        return self.FrameClass(self, code, w_globals, outer_func)
 
     def allocate_lock(self):
         """Return an interp-level Lock object if threads are enabled,
diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py
--- a/pypy/interpreter/function.py
+++ b/pypy/interpreter/function.py
@@ -30,7 +30,7 @@
     can_change_code = True
     _immutable_fields_ = ['code?',
                           'w_func_globals?',
-                          'closure?',
+                          'closure?[*]',
                           'defs_w?[*]',
                           'name?']
 
@@ -96,7 +96,7 @@
             assert isinstance(code, PyCode)
             if nargs < 5:
                 new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
                 for i in funccallunrolling:
                     if i < nargs:
                         new_frame.locals_stack_w[i] = args_w[i]
@@ -156,7 +156,7 @@
     def _flat_pycall(self, code, nargs, frame):
         # code is a PyCode
         new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
             new_frame.locals_stack_w[i] = w_arg
@@ -167,7 +167,7 @@
     def _flat_pycall_defaults(self, code, nargs, frame, defs_to_load):
         # code is a PyCode
         new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
             new_frame.locals_stack_w[i] = w_arg
diff --git a/pypy/interpreter/nestedscope.py b/pypy/interpreter/nestedscope.py
--- a/pypy/interpreter/nestedscope.py
+++ b/pypy/interpreter/nestedscope.py
@@ -8,7 +8,7 @@
 
 class Cell(Wrappable):
     "A simple container for a wrapped value."
-    
+
     def __init__(self, w_value=None):
         self.w_value = w_value
 
@@ -90,32 +90,33 @@
     #     variables coming from a parent function in which i'm nested
     # 'closure' is a list of Cell instances: the received free vars.
 
-    cells = None
-
     @jit.unroll_safe
-    def initialize_frame_scopes(self, closure, code):
-        super_initialize_frame_scopes(self, closure, code)
+    def initialize_frame_scopes(self, outer_func, code):
+        super_initialize_frame_scopes(self, outer_func, code)
         ncellvars = len(code.co_cellvars)
         nfreevars = len(code.co_freevars)
         if not nfreevars:
             if not ncellvars:
+                self.cells = []
                 return            # no self.cells needed - fast path
-            if closure is None:
-                closure = []
-        elif closure is None:
+        elif outer_func is None:
             space = self.space
             raise OperationError(space.w_TypeError,
                                  space.wrap("directly executed code object "
                                             "may not contain free variables"))
-        if len(closure) != nfreevars:
+        if outer_func and outer_func.closure:
+            closure_size = len(outer_func.closure)
+        else:
+            closure_size = 0
+        if closure_size != nfreevars:
             raise ValueError("code object received a closure with "
                                  "an unexpected number of free variables")
         self.cells = [None] * (ncellvars + nfreevars)
         for i in range(ncellvars):
             self.cells[i] = Cell()
         for i in range(nfreevars):
-            self.cells[i + ncellvars] = closure[i]
-    
+            self.cells[i + ncellvars] = outer_func.closure[i]
+
     def _getcells(self):
         return self.cells
 
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -198,7 +198,7 @@
 
     def funcrun(self, func, args):
         frame = self.space.createframe(self, func.w_func_globals,
-                                  func.closure)
+                                  func)
         sig = self._signature
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
@@ -211,7 +211,7 @@
 
     def funcrun_obj(self, func, w_obj, args):
         frame = self.space.createframe(self, func.w_func_globals,
-                                  func.closure)
+                                  func)
         sig = self._signature
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
diff --git a/pypy/interpreter/pyframe.py b/pypy/interpreter/pyframe.py
--- a/pypy/interpreter/pyframe.py
+++ b/pypy/interpreter/pyframe.py
@@ -51,7 +51,7 @@
     is_being_profiled        = False
     escaped                  = False  # see mark_as_escaped()
 
-    def __init__(self, space, code, w_globals, closure):
+    def __init__(self, space, code, w_globals, outer_func):
         if not we_are_translated():
             assert type(self) in (space.FrameClass, CPythonFrame), (
                 "use space.FrameClass(), not directly PyFrame()")
@@ -70,7 +70,7 @@
             self.builtin = space.builtin.pick_builtin(w_globals)
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
         # class bodies only have CO_NEWLOCALS.
-        self.initialize_frame_scopes(closure, code)
+        self.initialize_frame_scopes(outer_func, code)
         self.f_lineno = code.co_firstlineno
 
     def mark_as_escaped(self):
@@ -117,8 +117,8 @@
             return self.builtin
         else:
             return self.space.builtin
-        
-    def initialize_frame_scopes(self, closure, code): 
+
+    def initialize_frame_scopes(self, outer_func, code):
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
         # class bodies only have CO_NEWLOCALS.
         # CO_NEWLOCALS: make a locals dict unless optimized is also set
@@ -385,7 +385,11 @@
         
         # do not use the instance's __init__ but the base's, because we set
         # everything like cells from here
-        PyFrame.__init__(self, space, pycode, w_globals, closure)
+        # XXX hack
+        from pypy.interpreter.function import Function
+        outer_func = Function(space, None, closure=closure,
+                             forcename="fake")
+        PyFrame.__init__(self, space, pycode, w_globals, outer_func)
         f_back = space.interp_w(PyFrame, w_f_back, can_be_None=True)
         new_frame.f_backref = jit.non_virtual_ref(f_back)
 
diff --git a/pypy/jit/backend/llsupport/descr.py b/pypy/jit/backend/llsupport/descr.py
--- a/pypy/jit/backend/llsupport/descr.py
+++ b/pypy/jit/backend/llsupport/descr.py
@@ -291,7 +291,7 @@
 
     def get_call_conv(self):
         from pypy.rlib.clibffi import get_call_conv
-        return get_call_conv(self.ffi_flags)
+        return get_call_conv(self.ffi_flags, True)
 
     def get_arg_types(self):
         return self.arg_classes
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -2070,16 +2070,21 @@
                                             ffi_flags=FUNCFLAG_STDCALL)
         i1 = BoxInt()
         i2 = BoxInt()
-        i3 = BoxInt()
-        tok = BoxInt()
         faildescr = BasicFailDescr(1)
-        ops = [
-        ResOperation(rop.CALL_RELEASE_GIL, [funcbox, i1, i2], i3,
-                     descr=calldescr),
-        ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
-        ResOperation(rop.FINISH, [i3], None, descr=BasicFailDescr(0))
+        # if the stdcall convention is ignored, then ESP is wrong after the
+        # call: 8 bytes too much.  If we repeat the call often enough, crash.
+        ops = []
+        for i in range(50):
+            i3 = BoxInt()
+            ops += [
+                ResOperation(rop.CALL_RELEASE_GIL, [funcbox, i1, i2], i3,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ]
+            ops[-1].setfailargs([])
+        ops += [
+            ResOperation(rop.FINISH, [i3], None, descr=BasicFailDescr(0))
         ]
-        ops[1].setfailargs([])
         looptoken = LoopToken()
         self.cpu.compile_loop([i1, i2], ops, looptoken)
 
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -34,6 +34,7 @@
 from pypy.rlib.debug import (debug_print, debug_start, debug_stop,
                              have_debug_prints)
 from pypy.rlib import rgc
+from pypy.rlib.clibffi import FFI_DEFAULT_ABI
 from pypy.jit.backend.x86.jump import remap_frame_layout
 from pypy.jit.metainterp.history import ConstInt, BoxInt
 from pypy.jit.codewriter.effectinfo import EffectInfo
@@ -1120,7 +1121,7 @@
         return genop_cmp_guard_float
 
     def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
-                   argtypes=None):
+                   argtypes=None, callconv=FFI_DEFAULT_ABI):
         if IS_X86_64:
             return self._emit_call_64(force_index, x, arglocs, start, argtypes)
 
@@ -1149,6 +1150,16 @@
         # x is a location
         self.mc.CALL(x)
         self.mark_gc_roots(force_index)
+        #
+        if callconv != FFI_DEFAULT_ABI:
+            self._fix_stdcall(callconv, p)
+
+    def _fix_stdcall(self, callconv, p):
+        from pypy.rlib.clibffi import FFI_STDCALL
+        assert callconv == FFI_STDCALL
+        # it's a bit stupid, but we're just going to cancel the fact that
+        # the called function just added 'p' to ESP, by subtracting it again.
+        self.mc.SUB_ri(esp.value, p)
 
     def _emit_call_64(self, force_index, x, arglocs, start, argtypes):
         src_locs = []
@@ -2127,7 +2138,8 @@
             tmp = eax
 
         self._emit_call(force_index, x, arglocs, 3, tmp=tmp,
-                        argtypes=op.getdescr().get_arg_types())
+                        argtypes=op.getdescr().get_arg_types(),
+                        callconv=op.getdescr().get_call_conv())
 
         if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.width == 8:
             # a float or a long long return
diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -527,6 +527,7 @@
 
     NOP = insn('\x90')
     RET = insn('\xC3')
+    RET16_i = insn('\xC2', immediate(1, 'h'))
 
     PUSH_r = insn(rex_nw, register(1), '\x50')
     PUSH_b = insn(rex_nw, '\xFF', orbyte(6<<3), stack_bp(1))
diff --git a/pypy/jit/backend/x86/test/test_runner.py b/pypy/jit/backend/x86/test/test_runner.py
--- a/pypy/jit/backend/x86/test/test_runner.py
+++ b/pypy/jit/backend/x86/test/test_runner.py
@@ -433,6 +433,88 @@
                 ops_offset[operations[2]] <=
                 ops_offset[None])
 
+    def test_calling_convention(self, monkeypatch):
+        if WORD != 4:
+            py.test.skip("32-bit only test")
+        from pypy.jit.backend.x86.regloc import eax, edx
+        from pypy.jit.backend.x86 import codebuf
+        from pypy.jit.codewriter.effectinfo import EffectInfo
+        from pypy.rlib.libffi import types, clibffi
+        had_stdcall = hasattr(clibffi, 'FFI_STDCALL')
+        if not had_stdcall:    # not running on Windows, but we can still test
+            monkeypatch.setattr(clibffi, 'FFI_STDCALL', 12345, raising=False)
+        #
+        for ffi in [clibffi.FFI_DEFAULT_ABI, clibffi.FFI_STDCALL]:
+            cpu = self.cpu
+            mc = codebuf.MachineCodeBlockWrapper()
+            mc.MOV_rs(eax.value, 4)      # argument 1
+            mc.MOV_rs(edx.value, 40)     # argument 10
+            mc.SUB_rr(eax.value, edx.value)     # return arg1 - arg10
+            if ffi == clibffi.FFI_DEFAULT_ABI:
+                mc.RET()
+            else:
+                mc.RET16_i(40)
+            rawstart = mc.materialize(cpu.asmmemmgr, [])
+            #
+            calldescr = cpu.calldescrof_dynamic([types.slong] * 10,
+                                                types.slong,
+                                                EffectInfo.MOST_GENERAL,
+                                                ffi_flags=-1)
+            calldescr.get_call_conv = lambda: ffi      # <==== hack
+            funcbox = ConstInt(rawstart)
+            i1 = BoxInt()
+            i2 = BoxInt()
+            i3 = BoxInt()
+            i4 = BoxInt()
+            i5 = BoxInt()
+            i6 = BoxInt()
+            c = ConstInt(-1)
+            faildescr = BasicFailDescr(1)
+            # we must call it repeatedly: if the stack pointer gets increased
+            # by 40 bytes by the STDCALL call, and if we don't expect it,
+            # then we are going to get our stack emptied unexpectedly by
+            # several repeated calls
+            ops = [
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i3, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i4, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i5, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i6, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.FINISH, [i3, i4, i5, i6], None,
+                         descr=BasicFailDescr(0))
+            ]
+            ops[1].setfailargs([])
+            ops[3].setfailargs([])
+            ops[5].setfailargs([])
+            ops[7].setfailargs([])
+            looptoken = LoopToken()
+            self.cpu.compile_loop([i1, i2], ops, looptoken)
+
+            self.cpu.set_future_value_int(0, 123450)
+            self.cpu.set_future_value_int(1, 123408)
+            fail = self.cpu.execute_token(looptoken)
+            assert fail.identifier == 0
+            assert self.cpu.get_latest_value_int(0) == 42
+            assert self.cpu.get_latest_value_int(1) == 42
+            assert self.cpu.get_latest_value_int(2) == 42
+            assert self.cpu.get_latest_value_int(3) == 42
+
+
 class TestDebuggingAssembler(object):
     def setup_method(self, meth):
         self.cpu = CPU(rtyper=None, stats=FakeStats())
diff --git a/pypy/module/__builtin__/functional.py b/pypy/module/__builtin__/functional.py
--- a/pypy/module/__builtin__/functional.py
+++ b/pypy/module/__builtin__/functional.py
@@ -292,7 +292,7 @@
                 raise
             break
         new_frame = space.createframe(code, w_func.w_func_globals,
-                                      w_func.closure)
+                                      w_func)
         new_frame.locals_stack_w[0] = w_item
         w_res = new_frame.run()
         result_w.append(w_res)
diff --git a/pypy/module/bz2/test/test_bz2_file.py b/pypy/module/bz2/test/test_bz2_file.py
--- a/pypy/module/bz2/test/test_bz2_file.py
+++ b/pypy/module/bz2/test/test_bz2_file.py
@@ -274,14 +274,14 @@
             pass
         del bz2f   # delete from this frame, which is captured in the traceback
 
-    def test_read_chunk10(self):
+    def test_read_chunk9(self):
         from bz2 import BZ2File
         self.create_temp_file()
         
         bz2f = BZ2File(self.temppath)
         text_read = ""
         while True:
-            data = bz2f.read(10)
+            data = bz2f.read(9) # 9 doesn't divide evenly into data length
             if not data:
                 break
             text_read = "%s%s" % (text_read, data)
diff --git a/pypy/module/cpyext/frameobject.py b/pypy/module/cpyext/frameobject.py
--- a/pypy/module/cpyext/frameobject.py
+++ b/pypy/module/cpyext/frameobject.py
@@ -57,7 +57,7 @@
     code = space.interp_w(PyCode, w_code)
     w_globals = from_ref(space, py_frame.c_f_globals)
 
-    frame = space.FrameClass(space, code, w_globals, closure=None)
+    frame = space.FrameClass(space, code, w_globals, outer_func=None)
     frame.f_lineno = py_frame.c_f_lineno
     w_obj = space.wrap(frame)
     track_reference(space, py_obj, w_obj)
diff --git a/pypy/module/micronumpy/interp_dtype.py b/pypy/module/micronumpy/interp_dtype.py
--- a/pypy/module/micronumpy/interp_dtype.py
+++ b/pypy/module/micronumpy/interp_dtype.py
@@ -53,7 +53,9 @@
 
 VOID_TP = lltype.Ptr(lltype.Array(lltype.Void, hints={'nolength': True, "uncast_on_llgraph": True}))
 
-def create_low_level_dtype(num, kind, name, aliases, applevel_types, T, valtype):
+def create_low_level_dtype(num, kind, name, aliases, applevel_types, T, valtype,
+    expected_size=None):
+
     class Box(BaseBox):
         def __init__(self, val):
             self.val = val
@@ -113,6 +115,8 @@
     W_LowLevelDtype.aliases = aliases
     W_LowLevelDtype.applevel_types = applevel_types
     W_LowLevelDtype.num_bytes = rffi.sizeof(T)
+    if expected_size is not None:
+        assert W_LowLevelDtype.num_bytes == expected_size
     return W_LowLevelDtype
 
 
@@ -282,10 +286,21 @@
     applevel_types = [],
     T = rffi.SIGNEDCHAR,
     valtype = rffi.SIGNEDCHAR._type,
+    expected_size = 1,
 )
 class W_Int8Dtype(IntegerArithmeticDtype, W_Int8Dtype):
-    def unwrap(self, space, w_item):
-        return self.adapt_val(space.int_w(space.int(w_item)))
+    pass
+
+W_Int16Dtype = create_low_level_dtype(
+    num = 3, kind = SIGNEDLTR, name = "int16",
+    aliases = ["int16"],
+    applevel_types = [],
+    T = rffi.SHORT,
+    valtype = rffi.SHORT._type,
+    expected_size = 2,
+)
+class W_Int16Dtype(IntegerArithmeticDtype, W_Int16Dtype):
+    pass
 
 W_Int32Dtype = create_low_level_dtype(
     num = 5, kind = SIGNEDLTR, name = "int32",
@@ -293,6 +308,7 @@
     applevel_types = [],
     T = rffi.INT,
     valtype = rffi.INT._type,
+    expected_size = 4,
 )
 class W_Int32Dtype(IntegerArithmeticDtype, W_Int32Dtype):
     pass
@@ -303,6 +319,7 @@
     applevel_types = ["long"],
     T = rffi.LONGLONG,
     valtype = rffi.LONGLONG._type,
+    expected_size = 8,
 )
 class W_Int64Dtype(IntegerArithmeticDtype, W_Int64Dtype):
     pass
@@ -313,6 +330,7 @@
     applevel_types = ["float"],
     T = lltype.Float,
     valtype = float,
+    expected_size = 8,
 )
 class W_Float64Dtype(FloatArithmeticDtype, W_Float64Dtype):
     def unwrap(self, space, w_item):
@@ -323,7 +341,7 @@
 
 ALL_DTYPES = [
     W_BoolDtype,
-    W_Int8Dtype, W_Int32Dtype, W_Int64Dtype,
+    W_Int8Dtype, W_Int16Dtype, W_Int32Dtype, W_Int64Dtype,
     W_Float64Dtype
 ]
 
@@ -353,4 +371,4 @@
     kind = interp_attrproperty("kind", cls=W_Dtype),
     shape = GetSetProperty(W_Dtype.descr_get_shape),
 )
-W_Dtype.typedef.acceptable_as_base_class = False
\ No newline at end of file
+W_Dtype.typedef.acceptable_as_base_class = False
diff --git a/pypy/module/micronumpy/interp_numarray.py b/pypy/module/micronumpy/interp_numarray.py
--- a/pypy/module/micronumpy/interp_numarray.py
+++ b/pypy/module/micronumpy/interp_numarray.py
@@ -217,7 +217,15 @@
         return space.wrap("[" + " ".join(concrete._getnums(True)) + "]")
 
     def descr_getitem(self, space, w_idx):
-        # TODO: indexing by tuples
+        # TODO: indexing by arrays and lists
+        if space.isinstance_w(w_idx, space.w_tuple):
+            length = space.len_w(w_idx)
+            if length == 0:
+                return space.wrap(self)
+            if length > 1: # only one dimension for now.
+                raise OperationError(space.w_IndexError,
+                                     space.wrap("invalid index"))
+            w_idx = space.getitem(w_idx, space.wrap(0))
         start, stop, step, slice_length = space.decode_index4(w_idx, self.find_size())
         if step == 0:
             # Single index
@@ -231,8 +239,19 @@
             return space.wrap(res)
 
     def descr_setitem(self, space, w_idx, w_value):
-        # TODO: indexing by tuples and lists
+        # TODO: indexing by arrays and lists
         self.invalidated()
+        if space.isinstance_w(w_idx, space.w_tuple):
+            length = space.len_w(w_idx)
+            if length > 1: # only one dimension for now.
+                raise OperationError(space.w_IndexError,
+                                     space.wrap("invalid index"))
+            if length == 0:
+                w_idx = space.newslice(space.wrap(0),
+                                      space.wrap(self.find_size()),
+                                      space.wrap(1))
+            else:
+                w_idx = space.getitem(w_idx, space.wrap(0))
         start, stop, step, slice_length = space.decode_index4(w_idx,
                                                               self.find_size())
         if step == 0:
diff --git a/pypy/module/micronumpy/test/test_dtypes.py b/pypy/module/micronumpy/test/test_dtypes.py
--- a/pypy/module/micronumpy/test/test_dtypes.py
+++ b/pypy/module/micronumpy/test/test_dtypes.py
@@ -82,10 +82,20 @@
             assert a[1] == 1
 
     def test_add_int8(self):
-        from numpy import array
+        from numpy import array, dtype
 
         a = array(range(5), dtype="int8")
         b = a + a
+        assert b.dtype is dtype("int8")
+        for i in range(5):
+            assert b[i] == i * 2
+
+    def test_add_int16(self):
+        from numpy import array, dtype
+
+        a = array(range(5), dtype="int16")
+        b = a + a
+        assert b.dtype is dtype("int16")
         for i in range(5):
             assert b[i] == i * 2
 
@@ -98,4 +108,4 @@
         from numpy import dtype
 
         # You can't subclass dtype
-        raises(TypeError, type, "Foo", (dtype,), {})
\ No newline at end of file
+        raises(TypeError, type, "Foo", (dtype,), {})
diff --git a/pypy/module/micronumpy/test/test_numarray.py b/pypy/module/micronumpy/test/test_numarray.py
--- a/pypy/module/micronumpy/test/test_numarray.py
+++ b/pypy/module/micronumpy/test/test_numarray.py
@@ -84,6 +84,9 @@
         a = array(range(5), dtype="int8")
         assert str(a) == "[0 1 2 3 4]"
 
+        a = array(range(5), dtype="int16")
+        assert str(a) == "[0 1 2 3 4]"
+
     def test_str_slice(self):
         from numpy import array, zeros
         a = array(range(5), float)
@@ -102,6 +105,16 @@
         assert a[-1] == 8
         raises(IndexError, "a[-6]")
 
+    def test_getitem_tuple(self):
+        from numpy import array
+        a = array(range(5))
+        raises(IndexError, "a[(1,2)]")
+        for i in xrange(5):
+            assert a[(i,)] == i
+        b = a[()]
+        for i in xrange(5):
+            assert a[i] == b[i]
+
     def test_setitem(self):
         from numpy import array
         a = array(range(5))
@@ -110,6 +123,17 @@
         raises(IndexError, "a[5] = 0.0")
         raises(IndexError, "a[-6] = 3.0")
 
+    def test_setitem_tuple(self):
+        from numpy import array
+        a = array(range(5))
+        raises(IndexError, "a[(1,2)] = [0,1]")
+        for i in xrange(5):
+            a[(i,)] = i+1
+            assert a[i] == i+1
+        a[()] = range(5)
+        for i in xrange(5):
+            assert a[i] == i
+
     def test_setslice_array(self):
         from numpy import array
         a = array(range(5))
@@ -541,4 +565,4 @@
         a = fromstring(self.data)
         for i in range(4):
             assert a[i] == i + 1
-        raises(ValueError, fromstring, "abc")
\ No newline at end of file
+        raises(ValueError, fromstring, "abc")
diff --git a/pypy/module/pypyjit/interp_jit.py b/pypy/module/pypyjit/interp_jit.py
--- a/pypy/module/pypyjit/interp_jit.py
+++ b/pypy/module/pypyjit/interp_jit.py
@@ -21,6 +21,7 @@
 
 PyFrame._virtualizable2_ = ['last_instr', 'pycode',
                             'valuestackdepth', 'locals_stack_w[*]',
+                            'cells[*]',
                             'last_exception',
                             'lastblock',
                             'is_being_profiled',
diff --git a/pypy/module/pypyjit/test_pypy_c/test_call.py b/pypy/module/pypyjit/test_pypy_c/test_call.py
--- a/pypy/module/pypyjit/test_pypy_c/test_call.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_call.py
@@ -174,7 +174,7 @@
             guard_no_overflow(descr=...)
             i18 = force_token()
             --TICK--
-            jump(p0, p1, p2, p3, p4, i8, p7, i17, p8, i9, i17, p10, p11, p12, descr=<Loop0>)
+            jump(..., descr=<Loop0>)
         """)
 
     def test_default_and_kw(self):
@@ -396,3 +396,70 @@
             --TICK--
             jump(..., descr=<Loop0>)
         """)
+
+    def test_global_closure_has_constant_cells(self):
+        log = self.run("""
+            def make_adder(n):
+                def add(x):
+                    return x + n
+                return add
+            add5 = make_adder(5)
+            def main():
+                i = 0
+                while i < 5000:
+                    i = add5(i) # ID: call
+            """, [])
+        loop, = log.loops_by_id('call', is_entry_bridge=True)
+        assert loop.match("""
+            guard_value(i6, 1, descr=...)
+            guard_nonnull_class(p8, ConstClass(W_IntObject), descr=...)
+            guard_value(i4, 0, descr=...)
+            guard_value(p3, ConstPtr(ptr14), descr=...)
+            i15 = getfield_gc_pure(p8, descr=<SignedFieldDescr pypy.objspace.std.intobject.W_IntObject.inst_intval 8>)
+            i17 = int_lt(i15, 5000)
+            guard_true(i17, descr=...)
+            p18 = getfield_gc(p0, descr=<GcPtrFieldDescr pypy.interpreter.eval.Frame.inst_w_globals 8>)
+            guard_value(p18, ConstPtr(ptr19), descr=...)
+            p20 = getfield_gc(p18, descr=<GcPtrFieldDescr pypy.objspace.std.dictmultiobject.W_DictMultiObject.inst_strategy 12>)
+            guard_value(p20, ConstPtr(ptr21), descr=...)
+            guard_not_invalidated(descr=...)
+            # most importantly, there is no getarrayitem_gc here
+            p23 = call(ConstClass(getexecutioncontext), descr=<GcPtrCallDescr>)
+            p24 = getfield_gc(p23, descr=<GcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_topframeref 36>)
+            i25 = force_token()
+            p26 = getfield_gc(p23, descr=<GcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_w_tracefunc 44>)
+            guard_isnull(p26, descr=...)
+            i27 = getfield_gc(p23, descr=<NonGcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_profilefunc 24>)
+            i28 = int_is_zero(i27)
+            guard_true(i28, descr=...)
+            p30 = getfield_gc(ConstPtr(ptr29), descr=<GcPtrFieldDescr pypy.interpreter.nestedscope.Cell.inst_w_value 8>)
+            guard_nonnull_class(p30, ConstClass(W_IntObject), descr=...)
+            i32 = getfield_gc_pure(p30, descr=<SignedFieldDescr pypy.objspace.std.intobject.W_IntObject.inst_intval 8>)
+            i33 = int_add_ovf(i15, i32)
+            guard_no_overflow(descr=...)
+            --TICK--
+            jump(p0, p1, p2, p5, i33, i32, p23, p30, p24, descr=<Loop0>)
+        """)
+
+    def test_local_closure_is_virtual(self):
+        log = self.run("""
+            def main():
+                i = 0
+                while i < 5000:
+                    def add():
+                        return i + 1
+                    i = add() # ID: call
+            """, [])
+        loop, = log.loops_by_id('call')
+        assert loop.match("""
+            i8 = getfield_gc_pure(p6, descr=<SignedFieldDescr pypy.objspace.std.intobject.W_IntObject.inst_intval 8>)
+            i10 = int_lt(i8, 5000)
+            guard_true(i10, descr=...)
+            i11 = force_token()
+            i13 = int_add(i8, 1)
+            --TICK--
+            p22 = new_with_vtable(ConstClass(W_IntObject))
+            setfield_gc(p22, i13, descr=<SignedFieldDescr pypy.objspace.std.intobject.W_IntObject.inst_intval 8>)
+            setfield_gc(p4, p22, descr=<GcPtrFieldDescr pypy.interpreter.nestedscope.Cell.inst_w_value 8>)
+            jump(p0, p1, p2, p3, p4, p7, p22, p7, descr=<Loop0>)
+        """)
diff --git a/pypy/objspace/flow/flowcontext.py b/pypy/objspace/flow/flowcontext.py
--- a/pypy/objspace/flow/flowcontext.py
+++ b/pypy/objspace/flow/flowcontext.py
@@ -184,7 +184,7 @@
 
 class FlowExecutionContext(ExecutionContext):
 
-    def __init__(self, space, code, globals, constargs={}, closure=None,
+    def __init__(self, space, code, globals, constargs={}, outer_func=None,
                  name=None):
         ExecutionContext.__init__(self, space)
         self.code = code
@@ -193,11 +193,11 @@
 
         self.crnt_offset = -1
         self.crnt_frame = None
-        if closure is None:
+        if outer_func and outer_func.closure:
+            self.closure = [nestedscope.Cell(Constant(value))
+                            for value in outer_func.closure]
+        else:
             self.closure = None
-        else:
-            self.closure = [nestedscope.Cell(Constant(value))
-                            for value in closure]
         frame = self.create_frame()
         formalargcount = code.getformalargcount()
         arg_list = [Variable() for i in range(formalargcount)]
@@ -216,7 +216,7 @@
         # while ignoring any operation like the creation of the locals dict
         self.recorder = []
         frame = FlowSpaceFrame(self.space, self.code,
-                               self.w_globals, self.closure)
+                               self.w_globals, self)
         frame.last_instr = 0
         return frame
 
diff --git a/pypy/objspace/flow/objspace.py b/pypy/objspace/flow/objspace.py
--- a/pypy/objspace/flow/objspace.py
+++ b/pypy/objspace/flow/objspace.py
@@ -252,9 +252,9 @@
             raise TypeError("%r is a generator" % (func,))
         code = PyCode._from_code(self, code)
         if func.func_closure is None:
-            closure = None
+            cl = None
         else:
-            closure = [extract_cell_content(c) for c in func.func_closure]
+            cl = [extract_cell_content(c) for c in func.func_closure]
         # CallableFactory.pycall may add class_ to functions that are methods
         name = func.func_name
         class_ = getattr(func, 'class_', None)
@@ -262,8 +262,10 @@
             name = '%s.%s' % (class_.__name__, name)
         for c in "<>&!":
             name = name.replace(c, '_')
+        class outerfunc: # hack
+            closure = cl
         ec = flowcontext.FlowExecutionContext(self, code, func.func_globals,
-                                              constargs, closure, name)
+                                              constargs, outerfunc, name)
         graph = ec.graph
         graph.func = func
         # attach a signature and defaults to the graph
diff --git a/pypy/objspace/std/fake.py b/pypy/objspace/std/fake.py
--- a/pypy/objspace/std/fake.py
+++ b/pypy/objspace/std/fake.py
@@ -142,7 +142,7 @@
 
     def funcrun(self, func, args):
         frame = func.space.createframe(self, func.w_func_globals,
-                                        func.closure)
+                                       func)
         sig = self.signature()
         scope_w = args.parse_obj(None, func.name, sig, func.defs_w)
         frame.setfastscope(scope_w)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -129,12 +129,12 @@
         ec._py_repr = None
         return ec
 
-    def createframe(self, code, w_globals, closure=None):
+    def createframe(self, code, w_globals, outer_func=None):
         from pypy.objspace.std.fake import CPythonFakeCode, CPythonFakeFrame
         if not we_are_translated() and isinstance(code, CPythonFakeCode):
             return CPythonFakeFrame(self, code, w_globals)
         else:
-            return ObjSpace.createframe(self, code, w_globals, closure)
+            return ObjSpace.createframe(self, code, w_globals, outer_func)
 
     def gettypefor(self, cls):
         return self.gettypeobject(cls.typedef)
diff --git a/pypy/rlib/clibffi.py b/pypy/rlib/clibffi.py
--- a/pypy/rlib/clibffi.py
+++ b/pypy/rlib/clibffi.py
@@ -286,10 +286,10 @@
 
 FFI_OK = cConfig.FFI_OK
 FFI_BAD_TYPEDEF = cConfig.FFI_BAD_TYPEDEF
-FFI_DEFAULT_ABI = rffi.cast(rffi.USHORT, cConfig.FFI_DEFAULT_ABI)
+FFI_DEFAULT_ABI = cConfig.FFI_DEFAULT_ABI
 if _WIN32:
-    FFI_STDCALL = rffi.cast(rffi.USHORT, cConfig.FFI_STDCALL)
-FFI_TYPE_STRUCT = rffi.cast(rffi.USHORT, cConfig.FFI_TYPE_STRUCT)
+    FFI_STDCALL = cConfig.FFI_STDCALL
+FFI_TYPE_STRUCT = cConfig.FFI_TYPE_STRUCT
 FFI_CIFP = rffi.COpaquePtr('ffi_cif', compilation_info=eci)
 
 FFI_CLOSUREP = lltype.Ptr(cConfig.ffi_closure)
@@ -319,7 +319,7 @@
        which the 'ffistruct' member is a regular FFI_TYPE.
     """
     tpe = lltype.malloc(FFI_STRUCT_P.TO, len(field_types)+1, flavor='raw')
-    tpe.ffistruct.c_type = FFI_TYPE_STRUCT
+    tpe.ffistruct.c_type = rffi.cast(rffi.USHORT, FFI_TYPE_STRUCT)
     tpe.ffistruct.c_size = rffi.cast(rffi.SIZE_T, size)
     tpe.ffistruct.c_alignment = rffi.cast(rffi.USHORT, aligment)
     tpe.ffistruct.c_elements = rffi.cast(FFI_TYPE_PP,
@@ -408,11 +408,12 @@
 FUNCFLAG_USE_ERRNO = 8
 FUNCFLAG_USE_LASTERROR = 16
 
-def get_call_conv(flags):
+def get_call_conv(flags, from_jit):
     if _WIN32 and (flags & FUNCFLAG_CDECL == 0):
         return FFI_STDCALL
     else:
         return FFI_DEFAULT_ABI
+get_call_conv._annspecialcase_ = 'specialize:arg(1)'     # hack :-/
 
 
 class AbstractFuncPtr(object):
@@ -437,13 +438,14 @@
         if _MSVC:
             # This little trick works correctly with MSVC.
             # It returns small structures in registers
-            if r_uint(restype.c_type) == FFI_TYPE_STRUCT:
+            if intmask(restype.c_type) == FFI_TYPE_STRUCT:
                 if restype.c_size <= 4:
                     restype = ffi_type_sint32
                 elif restype.c_size <= 8:
                     restype = ffi_type_sint64
 
-        res = c_ffi_prep_cif(self.ll_cif, get_call_conv(flags),
+        res = c_ffi_prep_cif(self.ll_cif,
+                             rffi.cast(rffi.USHORT, get_call_conv(flags,False)),
                              rffi.cast(rffi.UINT, argnum), restype,
                              self.ll_argtypes)
         if not res == FFI_OK:
diff --git a/pypy/rlib/libffi.py b/pypy/rlib/libffi.py
--- a/pypy/rlib/libffi.py
+++ b/pypy/rlib/libffi.py
@@ -75,7 +75,7 @@
     @staticmethod
     @jit.elidable
     def is_struct(ffi_type):
-        return intmask(ffi_type.c_type) == intmask(FFI_TYPE_STRUCT)
+        return intmask(ffi_type.c_type) == FFI_TYPE_STRUCT
 
 types._import()
 
diff --git a/pypy/tool/release/package.py b/pypy/tool/release/package.py
--- a/pypy/tool/release/package.py
+++ b/pypy/tool/release/package.py
@@ -52,9 +52,14 @@
             pypy_c_dir = basedir.join('pypy', 'translator', 'goal')
         pypy_c = pypy_c_dir.join('pypy-c.exe')
         libpypy_c = pypy_c_dir.join('libpypy-c.dll')
+        libexpat = pypy_c_dir.join('libexpat.dll')
+        if not libexpat.check():
+            libexpat = py.path.local.sysfind('libexpat.dll')
+            assert libexpat, "libexpat.dll not found"
+            print "Picking %s" % libexpat
         binaries = [(pypy_c, pypy_c.basename),
                     (libpypy_c, libpypy_c.basename),
-                    (pypy_c_dir.join('libexpat.dll'), 'libexpat.dll')]
+                    (libexpat, libexpat.basename)]
     else:
         basename = 'pypy-c'
         if override_pypy_c is None: