[pypy-commit] pypy win32-fixes3: merge default into branch

Sat Apr 20 20:45:34 CEST 2013

Author: mattip <matti.picus at gmail.com>
Branch: win32-fixes3
Changeset: r63531:7e95e169bc14
Date: 2013-04-20 21:45 +0300
http://bitbucket.org/pypy/pypy/changeset/7e95e169bc14/

Log:	merge default into branch

diff too long, truncating to 2000 out of 2200 lines

diff --git a/Makefile b/Makefile
new file mode 100644
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,7 @@
+
+all: pypy-c
+
+pypy-c:
+	@echo "Building PyPy with JIT, it'll take about 40 minutes and 4G of RAM"
+	@sleep 3
+	rpython/bin/rpython -Ojit pypy/goal/targetpypystandalone.py
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -22,3 +22,16 @@
 and send us feedback!
 
     the pypy-dev team <pypy-dev at python.org>
+
+Building
+========
+
+build with::
+
+  rpython/bin/rpython -Ojit pypy/goal/targetpypystandalone.py
+
+This ends up with ``pypy-c`` binary in the main pypy directory. We suggest
+to use virtualenv with the resulting pypy-c as the interpreter, you can
+find more details about various installation schemes here:
+
+http://doc.pypy.org/en/latest/getting-started.html#installing-pypy
diff --git a/lib-python/2.7/json/decoder.py b/lib-python/2.7/json/decoder.py
--- a/lib-python/2.7/json/decoder.py
+++ b/lib-python/2.7/json/decoder.py
@@ -162,7 +162,7 @@
         if nextchar == '}':
             if object_pairs_hook is not None:
                 result = object_pairs_hook(pairs)
-                return result, end
+                return result, end + 1
             pairs = {}
             if object_hook is not None:
                 pairs = object_hook(pairs)
diff --git a/lib-python/2.7/json/tests/test_decode.py b/lib-python/2.7/json/tests/test_decode.py
--- a/lib-python/2.7/json/tests/test_decode.py
+++ b/lib-python/2.7/json/tests/test_decode.py
@@ -44,6 +44,7 @@
                                     object_pairs_hook=OrderedDict,
                                     object_hook=lambda x: None),
                          OrderedDict(p))
+        self.assertEqual(self.loads("{}", object_pairs_hook=list), [])
 
 
 class TestPyDecode(TestDecode, PyTest): pass
diff --git a/pypy/doc/arm.rst b/pypy/doc/arm.rst
--- a/pypy/doc/arm.rst
+++ b/pypy/doc/arm.rst
@@ -153,7 +153,7 @@
 
 ::
 
-  pypy <path to rpython>/rpython/bin/rpython -Ojit --platform=arm --gcrootfinder=shadowstack --jit-backend=armv7 targetpypystandalone.py
+  pypy <path to rpython>/rpython/bin/rpython -Ojit --platform=arm --gcrootfinder=shadowstack --jit-backend=arm targetpypystandalone.py
 
 The gcrootfinder option is needed to work around `issue 1377`_ and the jit-backend works around `issue 1376`_
 
diff --git a/pypy/doc/config/translation.gcrootfinder.txt b/pypy/doc/config/translation.gcrootfinder.txt
--- a/pypy/doc/config/translation.gcrootfinder.txt
+++ b/pypy/doc/config/translation.gcrootfinder.txt
@@ -9,7 +9,9 @@
 - ``--gcrootfinder=asmgcc``: use assembler hackery to find the
   roots directly from the normal stack.  This is a bit faster,
   but platform specific.  It works so far with GCC or MSVC,
-  on i386 and x86-64.
+  on i386 and x86-64.  It is tested only on Linux (where it is
+  the default) so other platforms (as well as MSVC) may need
+  various fixes before they can be used.
 
 You may have to force the use of the shadowstack root finder if
 you are running into troubles or if you insist on translating
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -43,11 +43,11 @@
 Rudimentary support for bytearray in RPython
 
 .. branch: refactor-call_release_gil
-Fix a bug which casused cffi to return the wrong result when calling a C
+Fix a bug which caused cffi to return the wrong result when calling a C
 function which calls a Python callback which forces the frames
 
 .. branch: virtual-raw-mallocs
-JIT optimizations which makes cffi calls even faster, by removing the need to
+JIT optimizations which make cffi calls even faster, by removing the need to
 allocate a temporary buffer where to store the arguments.
 
 .. branch: improve-docs-2
diff --git a/pypy/module/thread/threadlocals.py b/pypy/module/thread/threadlocals.py
--- a/pypy/module/thread/threadlocals.py
+++ b/pypy/module/thread/threadlocals.py
@@ -52,7 +52,7 @@
 
     def signals_enabled(self):
         ec = self.getvalue()
-        return ec._signals_enabled
+        return ec is not None and ec._signals_enabled
 
     def enable_signals(self, space):
         ec = self.getvalue()
@@ -72,10 +72,12 @@
     def leave_thread(self, space):
         "Notification that the current thread is about to stop."
         from pypy.module.thread.os_local import thread_is_stopping
-        try:
-            thread_is_stopping(self.getvalue())
-        finally:
-            self.setvalue(None)
+        ec = self.getvalue()
+        if ec is not None:
+            try:
+                thread_is_stopping(ec)
+            finally:
+                self.setvalue(None)
 
     def reinit_threads(self, space):
         "Called in the child process after a fork()"
diff --git a/rpython/annotator/model.py b/rpython/annotator/model.py
--- a/rpython/annotator/model.py
+++ b/rpython/annotator/model.py
@@ -527,6 +527,7 @@
 s_Int = SomeInteger()
 s_ImpossibleValue = SomeImpossibleValue()
 s_Str0 = SomeString(no_nul=True)
+s_Unicode0 = SomeUnicodeString(no_nul=True)
 
 
 # ____________________________________________________________
diff --git a/rpython/config/translationoption.py b/rpython/config/translationoption.py
--- a/rpython/config/translationoption.py
+++ b/rpython/config/translationoption.py
@@ -115,7 +115,7 @@
                          ("translation.gcrootfinder", DEFL_ROOTFINDER_WITHJIT),
                          ("translation.list_comprehension_operations", True)]),
     ChoiceOption("jit_backend", "choose the backend for the JIT",
-                 ["auto", "x86", "x86-without-sse2", 'armv7', 'armv7hf', 'armv6hf'],
+                 ["auto", "x86", "x86-without-sse2", 'arm'],
                  default="auto", cmdline="--jit-backend"),
     ChoiceOption("jit_profiler", "integrate profiler support into the JIT",
                  ["off", "oprofile"],
diff --git a/rpython/jit/backend/arm/assembler.py b/rpython/jit/backend/arm/assembler.py
--- a/rpython/jit/backend/arm/assembler.py
+++ b/rpython/jit/backend/arm/assembler.py
@@ -3,6 +3,7 @@
 import os
 
 from rpython.jit.backend.arm import conditions as c, registers as r
+from rpython.jit.backend.arm import shift
 from rpython.jit.backend.arm.arch import (WORD, DOUBLE_WORD, FUNC_ALIGN,
     JITFRAME_FIXED_SIZE)
 from rpython.jit.backend.arm.codebuilder import InstrBuilder, OverwritingBuilder
@@ -12,8 +13,9 @@
     CoreRegisterManager, check_imm_arg, VFPRegisterManager,
     operations as regalloc_operations,
     operations_with_guard as regalloc_operations_with_guard)
-from rpython.jit.backend.llsupport import jitframe
+from rpython.jit.backend.llsupport import jitframe, rewrite
 from rpython.jit.backend.llsupport.assembler import DEBUG_COUNTER, debug_bridge, BaseAssembler
+from rpython.jit.backend.llsupport.regalloc import get_scale, valid_addressing_size
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.jit.backend.model import CompiledLoopToken
 from rpython.jit.codewriter.effectinfo import EffectInfo
@@ -25,7 +27,6 @@
 from rpython.rlib.rarithmetic import r_uint
 from rpython.rtyper.annlowlevel import llhelper, cast_instance_to_gcref
 from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.jit.backend.arm.detect import detect_hardfloat
 
 class AssemblerARM(ResOpAssembler):
 
@@ -51,14 +52,13 @@
 
     def setup_once(self):
         BaseAssembler.setup_once(self)
-        self.hf_abi = detect_hardfloat()
 
     def setup(self, looptoken):
         assert self.memcpy_addr != 0, 'setup_once() not called?'
         if we_are_translated():
             self.debug = False
         self.current_clt = looptoken.compiled_loop_token
-        self.mc = InstrBuilder(self.cpu.arch_version)
+        self.mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         self.pending_guards = []
         assert self.datablockwrapper is None
         allblocks = self.get_asmmemmgr_blocks(looptoken)
@@ -82,20 +82,20 @@
         if not self.cpu.propagate_exception_descr:
             return      # not supported (for tests, or non-translated)
         #
-        mc = InstrBuilder(self.cpu.arch_version)
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         self._store_and_reset_exception(mc, r.r0)
         ofs = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
         # make sure ofs fits into a register
         assert check_imm_arg(ofs)
-        mc.LDR_ri(r.r0.value, r.fp.value, imm=ofs)
+        self.store_reg(mc, r.r0, r.fp, ofs)
         propagate_exception_descr = rffi.cast(lltype.Signed,
                   cast_instance_to_gcref(self.cpu.propagate_exception_descr))
         # put propagate_exception_descr into frame
         ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
         # make sure ofs fits into a register
         assert check_imm_arg(ofs)
-        mc.gen_load_int(r.r0.value, propagate_exception_descr)
-        mc.STR_ri(r.r0.value, r.fp.value, imm=ofs)
+        mc.gen_load_int(r.r1.value, propagate_exception_descr)
+        self.store_reg(mc, r.r0, r.fp, ofs)
         mc.MOV_rr(r.r0.value, r.fp.value)
         self.gen_func_epilog(mc)
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
@@ -167,7 +167,7 @@
         #    |  my own retaddr       |    <-- sp
         #    +-----------------------+
         #
-        mc = InstrBuilder(self.cpu.arch_version)
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         # save argument registers and return address
         mc.PUSH([reg.value for reg in r.argument_regs] + [r.ip.value, r.lr.value])
         # stack is aligned here
@@ -208,7 +208,7 @@
         # write barriers.  It must save all registers, and optionally
         # all vfp registers.  It takes a single argument which is in r0.
         # It must keep stack alignment accordingly.
-        mc = InstrBuilder(self.cpu.arch_version)
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         #
         exc0 = exc1 = None
         mc.PUSH([r.ip.value, r.lr.value]) # push two words to keep alignment
@@ -251,26 +251,60 @@
         else:
             self.wb_slowpath[withcards + 2 * withfloats] = rawstart
 
-    def _build_malloc_slowpath(self):
-        mc = InstrBuilder(self.cpu.arch_version)
+    def _build_malloc_slowpath(self, kind):
+        """ While arriving on slowpath, we have a gcpattern on stack 0.
+        The arguments are passed in r0 and r10, as follows:
+
+        kind == 'fixed': nursery_head in r0 and the size in r1 - r0.
+
+        kind == 'str/unicode': length of the string to allocate in r0.
+
+        kind == 'var': length to allocate in r1, tid in r0,
+                       and itemsize on the stack.
+
+        This function must preserve all registers apart from r0 and r1.
+        """
+        assert kind in ['fixed', 'str', 'unicode', 'var']
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
+        #
         self._push_all_regs_to_jitframe(mc, [r.r0, r.r1], self.cpu.supports_floats)
+        #
+        if kind == 'fixed':
+            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
+        elif kind == 'str':
+            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_str')
+        elif kind == 'unicode':
+            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_unicode')
+        else:
+            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_array_addr()
+        if kind == 'fixed':
+            # stack layout: [gcmap]
+            # At this point we know that the values we need to compute the size
+            # are stored in r0 and r1.
+            mc.SUB_rr(r.r0.value, r.r1.value, r.r0.value) # compute the size we want
+
+            if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
+                mc.MOV_rr(r.r1.value, r.fp.value)
+        elif kind == 'str' or kind == 'unicode':
+            # stack layout: [gcmap]
+            mc.MOV_rr(r.r0.value, r.r1.value)
+        else:  # var
+            # stack layout: [gcmap][itemsize]...
+            # tid is in r0
+            # length is in r1
+            mc.MOV_rr(r.r2.value, r.r1.value)
+            mc.MOV_rr(r.r1.value, r.r0.value)
+            mc.POP([r.r0.value])  # load itemsize
+        # store the gc pattern
+        mc.POP([r.r4.value])
         ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
-        # store the gc pattern
-        mc.POP([r.r2.value])
-        self.store_reg(mc, r.r2, r.fp, ofs)
+        self.store_reg(mc, r.r4, r.fp, ofs)
+        #
         # We need to push two registers here because we are going to make a
         # call an therefore the stack needs to be 8-byte aligned
         mc.PUSH([r.ip.value, r.lr.value])
-        # At this point we know that the values we need to compute the size
-        # are stored in r0 and r1.
-        mc.SUB_rr(r.r0.value, r.r1.value, r.r0.value) # compute the size we want
-
-        if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
-            mc.MOV_rr(r.r1.value, r.fp.value)
-
-        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
+        #
         mc.BL(addr)
-
         #
         # If the slowpath malloc failed, we raise a MemoryError that
         # always interrupts the current loop, as a "good enough"
@@ -290,8 +324,9 @@
         # return
         mc.POP([r.ip.value, r.pc.value])
 
+        #
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
-        self.malloc_slowpath = rawstart
+        return rawstart
 
     def _reload_frame_if_necessary(self, mc):
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
@@ -364,7 +399,7 @@
                 self.load_reg(mc, vfpr, r.fp, ofs)
 
     def _build_failure_recovery(self, exc, withfloats=False):
-        mc = InstrBuilder(self.cpu.arch_version)
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         self._push_all_regs_to_jitframe(mc, [], withfloats)
 
         if exc:
@@ -647,7 +682,7 @@
                                 expected_size=expected_size)
 
     def _patch_frame_depth(self, adr, allocated_depth):
-        mc = InstrBuilder(self.cpu.arch_version)
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         mc.gen_load_int(r.lr.value, allocated_depth)
         mc.copy_to_raw_memory(adr)
 
@@ -723,7 +758,7 @@
         # f) store the address of the new jitframe in the shadowstack
         # c) set the gcmap field to 0 in the new jitframe
         # g) restore registers and return
-        mc = InstrBuilder(self.cpu.arch_version)
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         self._push_all_regs_to_jitframe(mc, [], self.cpu.supports_floats)
         # this is the gcmap stored by push_gcmap(mov=True) in _check_stack_frame
         # and the expected_size pushed in _check_stack_frame
@@ -783,7 +818,7 @@
         self.target_tokens_currently_compiling = None
 
     def _patch_stackadjust(self, adr, allocated_depth):
-        mc = InstrBuilder(self.cpu.arch_version)
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         mc.gen_load_int(r.lr.value, allocated_depth)
         mc.copy_to_raw_memory(adr)
 
@@ -823,7 +858,7 @@
                 # patch the guard jumpt to the stub
                 # overwrite the generate NOP with a B_offs to the pos of the
                 # stub
-                mc = InstrBuilder(self.cpu.arch_version)
+                mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
                 mc.B_offs(relative_offset, c.get_opposite_of(tok.fcond))
                 mc.copy_to_raw_memory(guard_pos)
             else:
@@ -902,7 +937,7 @@
                 self.mc.ASR_ri(resloc.value, resloc.value, 16)
 
     def patch_trace(self, faildescr, looptoken, bridge_addr, regalloc):
-        b = InstrBuilder(self.cpu.arch_version)
+        b = InstrBuilder(self.cpu.cpuinfo.arch_version)
         patch_addr = faildescr._arm_failure_recovery_block
         assert patch_addr != 0
         b.B(bridge_addr)
@@ -1170,21 +1205,17 @@
         else:
             raise AssertionError('Trying to pop to an invalid location')
 
-    def malloc_cond(self, nursery_free_adr, nursery_top_adr, sizeloc, gcmap):
-        if sizeloc.is_imm():     # must be correctly aligned
-            assert sizeloc.value & (WORD-1) == 0
+    def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap):
+        assert size & (WORD-1) == 0
 
         self.mc.gen_load_int(r.r0.value, nursery_free_adr)
         self.mc.LDR_ri(r.r0.value, r.r0.value)
 
-        if sizeloc.is_imm():
-            if check_imm_arg(sizeloc.value):
-                self.mc.ADD_ri(r.r1.value, r.r0.value, sizeloc.value)
-            else:
-                self.mc.gen_load_int(r.r1.value, sizeloc.value)
-                self.mc.ADD_rr(r.r1.value, r.r0.value, r.r1.value)
+        if check_imm_arg(size):
+            self.mc.ADD_ri(r.r1.value, r.r0.value, size)
         else:
-            self.mc.ADD_rr(r.r1.value, r.r0.value, sizeloc.value)
+            self.mc.gen_load_int(r.r1.value, size)
+            self.mc.ADD_rr(r.r1.value, r.r0.value, r.r1.value)
 
         self.mc.gen_load_int(r.ip.value, nursery_top_adr)
         self.mc.LDR_ri(r.ip.value, r.ip.value)
@@ -1205,6 +1236,128 @@
         self.mc.gen_load_int(r.ip.value, nursery_free_adr)
         self.mc.STR_ri(r.r1.value, r.ip.value)
 
+    def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr,
+                                  sizeloc, gcmap):
+        if sizeloc is r.r0:
+            self.mc.MOV_rr(r.r1.value, r.r0.value)
+            sizeloc = r.r1
+        self.mc.gen_load_int(r.r0.value, nursery_free_adr)
+        self.mc.LDR_ri(r.r0.value, r.r0.value)
+        #
+        self.mc.ADD_rr(r.r1.value, r.r0.value, sizeloc.value)
+        #
+        self.mc.gen_load_int(r.ip.value, nursery_top_adr)
+        self.mc.LDR_ri(r.ip.value, r.ip.value)
+
+        self.mc.CMP_rr(r.r1.value, r.ip.value)
+        #
+        self.push_gcmap(self.mc, gcmap, push=True, cond=c.HI)
+
+        self.mc.BL(self.malloc_slowpath, c=c.HI)
+
+        self.mc.gen_load_int(r.ip.value, nursery_free_adr)
+        self.mc.STR_ri(r.r1.value, r.ip.value)
+
+    def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr,
+                            lengthloc, itemsize, maxlength, gcmap,
+                            arraydescr):
+        from rpython.jit.backend.llsupport.descr import ArrayDescr
+        assert isinstance(arraydescr, ArrayDescr)
+
+        # lengthloc is the length of the array, which we must not modify!
+        assert lengthloc is not r.r0 and lengthloc is not r.r1
+        if lengthloc.is_reg():
+            varsizeloc = lengthloc
+        else:
+            assert lengthloc.is_stack()
+            self.regalloc_mov(lengthloc, r.r1)
+            varsizeloc = r.r1
+        #
+        if check_imm_arg(maxlength):
+            self.mc.CMP_ri(varsizeloc.value, maxlength)
+        else:
+            self.mc.gen_load_int(r.ip.value, maxlength)
+            self.mc.CMP_rr(varsizeloc.value, r.ip.value)
+        jmp_adr0 = self.mc.currpos()  # jump to (large)
+        self.mc.BKPT()
+        #
+        self.mc.gen_load_int(r.r0.value, nursery_free_adr)
+        self.mc.LDR_ri(r.r0.value, r.r0.value)
+
+
+        if valid_addressing_size(itemsize):
+            shiftsize = get_scale(itemsize)
+        else:
+            shiftsize = self._mul_const_scaled(self.mc, r.lr, varsizeloc,
+                                                itemsize)
+            varsizeloc = r.lr
+        # now varsizeloc is a register != r0.  The size of
+        # the variable part of the array is (varsizeloc << shiftsize)
+        assert arraydescr.basesize >= self.gc_minimal_size_in_nursery
+        constsize = arraydescr.basesize + self.gc_size_of_header
+        force_realignment = (itemsize % WORD) != 0
+        if force_realignment:
+            constsize += WORD - 1
+        self.mc.gen_load_int(r.ip.value, constsize)
+        # constsize + (varsizeloc << shiftsize)
+        self.mc.ADD_rr(r.r1.value, r.ip.value, varsizeloc.value,
+                                imm=shiftsize, shifttype=shift.LSL)
+        self.mc.ADD_rr(r.r1.value, r.r1.value, r.r0.value)
+        if force_realignment:
+            self.mc.MVN_ri(r.ip.value, imm=(WORD - 1))
+            self.mc.AND_rr(r.r1.value, r.r1.value, r.ip.value)
+        # now r1 contains the total size in bytes, rounded up to a multiple
+        # of WORD, plus nursery_free_adr
+        #
+        self.mc.gen_load_int(r.ip.value, nursery_top_adr)
+        self.mc.LDR_ri(r.ip.value, r.ip.value)
+
+        self.mc.CMP_rr(r.r1.value, r.ip.value)
+        jmp_adr1 = self.mc.currpos()  # jump to (after-call)
+        self.mc.BKPT()
+        #
+        # (large)
+        currpos = self.mc.currpos()
+        pmc = OverwritingBuilder(self.mc, jmp_adr0, WORD)
+        pmc.B_offs(currpos, c.GT)
+        #
+        # save the gcmap
+        self.push_gcmap(self.mc, gcmap, push=True)
+        #
+        if kind == rewrite.FLAG_ARRAY:
+            self.mc.gen_load_int(r.r0.value, arraydescr.tid)
+            self.regalloc_mov(lengthloc, r.r1)
+            self.regalloc_push(imm(itemsize))
+            addr = self.malloc_slowpath_varsize
+        else:
+            if kind == rewrite.FLAG_STR:
+                addr = self.malloc_slowpath_str
+            else:
+                assert kind == rewrite.FLAG_UNICODE
+                addr = self.malloc_slowpath_unicode
+            self.regalloc_mov(lengthloc, r.r1)
+        self.mc.BL(addr)
+        #
+        jmp_location = self.mc.currpos()  # jump to (done)
+        self.mc.BKPT()
+        # (after-call)
+        currpos = self.mc.currpos()
+        pmc = OverwritingBuilder(self.mc, jmp_adr1, WORD)
+        pmc.B_offs(currpos, c.LS)
+        #
+        # write down the tid, but not if it's the result of the CALL
+        self.mc.gen_load_int(r.ip.value, arraydescr.tid)
+        self.mc.STR_ri(r.ip.value, r.r0.value)
+
+        # while we're at it, this line is not needed if we've done the CALL
+        self.mc.gen_load_int(r.ip.value, nursery_free_adr)
+        self.mc.STR_ri(r.r1.value, r.ip.value)
+        # (done)
+        # skip instructions after call
+        currpos = self.mc.currpos()
+        pmc = OverwritingBuilder(self.mc, jmp_location, WORD)
+        pmc.B_offs(currpos)
+
     def push_gcmap(self, mc, gcmap, push=False, store=False, cond=c.AL):
         ptr = rffi.cast(lltype.Signed, gcmap)
         if push:
@@ -1222,6 +1375,32 @@
         mc.gen_load_int(r.ip.value, 0)
         self.store_reg(mc, r.ip, r.fp, ofs)
 
+    def _mul_const_scaled(self, mc, targetreg, sourcereg, itemsize):
+        """Produce one operation to do roughly
+               targetreg = sourcereg * itemsize
+           except that the targetreg may still need shifting by 0,1,2,3.
+        """
+        if (itemsize & 7) == 0:
+            shiftsize = 3
+        elif (itemsize & 3) == 0:
+            shiftsize = 2
+        elif (itemsize & 1) == 0:
+            shiftsize = 1
+        else:
+            shiftsize = 0
+        itemsize >>= shiftsize
+        #
+        if valid_addressing_size(itemsize - 1):
+            self.mc.ADD_rr(targetreg.value, sourcereg.value, sourcereg.value,
+                                imm=get_scale(itemsize - 1), shifttype=shift.LSL)
+        elif valid_addressing_size(itemsize):
+            self.mc.LSL_ri(targetreg.value, sourcereg.value,
+                    get_scale(itemsize))
+        else:
+            mc.gen_load_int(targetreg.value, itemsize)
+            mc.MUL(targetreg.value, sourcereg.value, targetreg.value)
+        #
+        return shiftsize
 
 def not_implemented(msg):
     os.write(2, '[ARM/asm] %s\n' % msg)
diff --git a/rpython/jit/backend/arm/detect.py b/rpython/jit/backend/arm/detect.py
--- a/rpython/jit/backend/arm/detect.py
+++ b/rpython/jit/backend/arm/detect.py
@@ -1,7 +1,10 @@
+import os
+
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
+from rpython.rtyper.tool import rffi_platform
 from rpython.rlib.clibffi import FFI_DEFAULT_ABI, FFI_SYSV, FFI_VFP
-from rpython.rtyper.tool import rffi_platform
 from rpython.translator.platform import CompilationError
+from rpython.rlib.debug import debug_print, debug_start, debug_stop
 
 eci = ExternalCompilationInfo(
     post_include_bits=["""
@@ -26,3 +29,37 @@
         return True
     except CompilationError:
         return False
+
+
+def detect_arch_version(filename="/proc/cpuinfo"):
+    fd = os.open(filename, os.O_RDONLY, 0644)
+    n = 0
+    debug_start("jit-backend-arch")
+    try:
+        buf = os.read(fd, 2048)
+        if not buf:
+            debug_print("Could not detect ARM architecture "
+                        "version, assuming", "ARMv%d" % n)
+            n = 6  # we asume ARMv6 as base case
+    finally:
+        os.close(fd)
+    # "Processor       : ARMv%d-compatible processor rev 7 (v6l)"
+    i = buf.find('ARMv')
+    if i == -1:
+        n = 6
+        debug_print("Could not detect architecture version, "
+                    "falling back to", "ARMv%d" % n)
+    else:
+        n = int(buf[i + 4])
+
+    if n < 6:
+        raise ValueError("Unsupported ARM architecture version")
+
+    debug_print("Detected", "ARMv%d" % n)
+
+    if n > 7:
+        n = 7
+        debug_print("Architecture version not explicitly supported, "
+                    "falling back to", "ARMv%d" % n)
+    debug_stop("jit-backend-arch")
+    return n
diff --git a/rpython/jit/backend/arm/opassembler.py b/rpython/jit/backend/arm/opassembler.py
--- a/rpython/jit/backend/arm/opassembler.py
+++ b/rpython/jit/backend/arm/opassembler.py
@@ -191,7 +191,7 @@
         return fcond
 
     def _emit_guard(self, op, arglocs, fcond, save_exc,
-                                    is_guard_not_invalidated=False, 
+                                    is_guard_not_invalidated=False,
                                     is_guard_not_forced=False):
         assert isinstance(save_exc, bool)
         assert isinstance(fcond, int)
@@ -297,7 +297,7 @@
         return self._emit_guard(op, locs, fcond, save_exc=False,
                                             is_guard_not_invalidated=True)
 
-    def emit_op_label(self, op, arglocs, regalloc, fcond): 
+    def emit_op_label(self, op, arglocs, regalloc, fcond):
         self._check_frame_depth_debug(self.mc)
         return fcond
 
@@ -354,7 +354,7 @@
                     # whether to worry about a CALL that can collect; this
                     # is always true except in call_release_gil
                     can_collect=True):
-        if self.hf_abi:
+        if self.cpu.cpuinfo.hf_abi:
             stack_args, adr = self._setup_call_hf(adr, arglocs, fcond,
                                             resloc, result_info)
         else:
@@ -382,7 +382,7 @@
 
         # ensure the result is wellformed and stored in the correct location
         if resloc is not None:
-            if resloc.is_vfp_reg() and not self.hf_abi:
+            if resloc.is_vfp_reg() and not self.cpu.cpuinfo.hf_abi:
                 # move result to the allocated register
                 self.mov_to_vfp_loc(r.r0, r.r1, resloc)
             elif resloc.is_reg() and result_info != (-1, -1):
@@ -1230,7 +1230,7 @@
         baseofs = self.cpu.get_baseofs_of_frame_field()
         newlooptoken.compiled_loop_token.update_frame_info(
             oldlooptoken.compiled_loop_token, baseofs)
-        mc = InstrBuilder(self.cpu.arch_version)
+        mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
         mc.B(target)
         mc.copy_to_raw_memory(oldadr)
 
@@ -1320,6 +1320,7 @@
         with saved_registers(self.mc, regs_to_save, vfp_regs_to_save):
             self._emit_call(imm(self.reacqgil_addr), [], fcond,
                     can_collect=False)
+        self._reload_frame_if_necessary(self.mc)
 
     def _store_force_index(self, guard_op):
         faildescr = guard_op.getdescr()
@@ -1334,23 +1335,6 @@
         self._alignment_check()
         return fcond
 
-    def emit_op_call_malloc_nursery(self, op, arglocs, regalloc, fcond):
-        # registers r0 and r1 are allocated for this call
-        assert len(arglocs) == 1
-        sizeloc = arglocs[0]
-        gc_ll_descr = self.cpu.gc_ll_descr
-        gcmap = regalloc.get_gcmap([r.r0, r.r1])
-        self.malloc_cond(
-            gc_ll_descr.get_nursery_free_addr(),
-            gc_ll_descr.get_nursery_top_addr(),
-            sizeloc,
-            gcmap
-            )
-        self._alignment_check()
-        return fcond
-    emit_op_call_malloc_nursery_varsize_frame = emit_op_call_malloc_nursery
-
-
     def _alignment_check(self):
         if not self.debug:
             return
@@ -1436,7 +1420,7 @@
         self.mc.VCVT_f64_f32(r.vfp_ip.value, arg.value)
         self.mc.VMOV_rc(res.value, r.ip.value, r.vfp_ip.value)
         return fcond
-    
+
     def emit_op_cast_singlefloat_to_float(self, op, arglocs, regalloc, fcond):
         arg, res = arglocs
         assert res.is_vfp_reg()
diff --git a/rpython/jit/backend/arm/regalloc.py b/rpython/jit/backend/arm/regalloc.py
--- a/rpython/jit/backend/arm/regalloc.py
+++ b/rpython/jit/backend/arm/regalloc.py
@@ -2,7 +2,8 @@
 from rpython.rlib import rgc
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
 from rpython.jit.backend.llsupport.regalloc import FrameManager, \
-        RegisterManager, TempBox, compute_vars_longevity, BaseRegalloc
+        RegisterManager, TempBox, compute_vars_longevity, BaseRegalloc, \
+        get_scale
 from rpython.jit.backend.arm import registers as r
 from rpython.jit.backend.arm import locations
 from rpython.jit.backend.arm.locations import imm, get_fp_offset
@@ -1011,20 +1012,72 @@
         self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
         t = TempInt()
         self.rm.force_allocate_reg(t, selected_reg=r.r1)
+
+        sizeloc = size_box.getint()
+        gc_ll_descr = self.cpu.gc_ll_descr
+        gcmap = self.get_gcmap([r.r0, r.r1])
         self.possibly_free_var(t)
-        return [imm(size)]
+        self.assembler.malloc_cond(
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            sizeloc,
+            gcmap
+            )
+        self.assembler._alignment_check()
 
     def prepare_op_call_malloc_nursery_varsize_frame(self, op, fcond):
         size_box = op.getarg(0)
-        assert isinstance(size_box, BoxInt)
-
+        assert isinstance(size_box, BoxInt) # we cannot have a const here!
+        # sizeloc must be in a register, but we can free it now
+        # (we take care explicitly of conflicts with r0 or r1)
+        sizeloc = self.rm.make_sure_var_in_reg(size_box)
+        self.rm.possibly_free_var(size_box)
+        #
         self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
+        #
         t = TempInt()
         self.rm.force_allocate_reg(t, selected_reg=r.r1)
-        argloc = self.make_sure_var_in_reg(size_box,
-                                            forbidden_vars=[op.result, t])
+        #
+        gcmap = self.get_gcmap([r.r0, r.r1])
         self.possibly_free_var(t)
-        return [argloc]
+        #
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        self.assembler.malloc_cond_varsize_frame(
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            sizeloc,
+            gcmap
+            )
+        self.assembler._alignment_check()
+
+    def prepare_op_call_malloc_nursery_varsize(self, op, fcond):
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        if not hasattr(gc_ll_descr, 'max_size_of_young_obj'):
+            raise Exception("unreachable code")
+            # for boehm, this function should never be called
+        arraydescr = op.getdescr()
+        length_box = op.getarg(2)
+        assert isinstance(length_box, BoxInt) # we cannot have a const here!
+        # the result will be in r0
+        self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
+        # we need r1 as a temporary
+        tmp_box = TempBox()
+        self.rm.force_allocate_reg(tmp_box, selected_reg=r.r1)
+        gcmap = self.get_gcmap([r.r0, r.r1]) # allocate the gcmap *before*
+        self.rm.possibly_free_var(tmp_box)
+        # length_box always survives: it's typically also present in the
+        # next operation that will copy it inside the new array.  It's
+        # fine to load it from the stack too, as long as it's != r0, r1.
+        lengthloc = self.rm.loc(length_box)
+        self.rm.possibly_free_var(length_box)
+        #
+        itemsize = op.getarg(1).getint()
+        maxlength = (gc_ll_descr.max_size_of_young_obj - WORD * 2) / itemsize
+        self.assembler.malloc_cond_varsize(
+            op.getarg(0).getint(),
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            lengthloc, itemsize, maxlength, gcmap, arraydescr)
 
     prepare_op_debug_merge_point = void
     prepare_op_jit_debug = void
@@ -1211,13 +1264,6 @@
 operations_with_guard = [notimplemented_with_guard] * (rop._LAST + 1)
 
 
-def get_scale(size):
-    scale = 0
-    while (1 << scale) < size:
-        scale += 1
-    assert (1 << scale) == size
-    return scale
-
 for key, value in rop.__dict__.items():
     key = key.lower()
     if key.startswith('_'):
diff --git a/rpython/jit/backend/arm/runner.py b/rpython/jit/backend/arm/runner.py
--- a/rpython/jit/backend/arm/runner.py
+++ b/rpython/jit/backend/arm/runner.py
@@ -7,9 +7,14 @@
 from rpython.rlib.jit_hooks import LOOP_RUN_CONTAINER
 from rpython.rtyper.lltypesystem import lltype, llmemory
 from rpython.jit.backend.arm.detect import detect_hardfloat
+from rpython.jit.backend.arm.detect import detect_arch_version
 
 jitframe.STATICSIZE = JITFRAME_FIXED_SIZE
 
+class CPUInfo(object):
+    hf_abi = False
+    arch_version = 6
+
 class AbstractARMCPU(AbstractLLCPU):
 
     IS_64_BIT = False
@@ -18,6 +23,7 @@
     supports_longlong = False # XXX requires an implementation of
                               # read_timestamp that works in user mode
     supports_singlefloats = not detect_hardfloat()
+    can_inline_varsize_malloc = True
 
     from rpython.jit.backend.arm.arch import JITFRAME_FIXED_SIZE
     all_reg_indexes = range(len(all_regs))
@@ -25,13 +31,11 @@
     float_regs = VFPRegisterManager.all_regs
     frame_reg = fp
 
-    hf_abi = False        # use hard float abi flag
-    arch_version = 7
-
     def __init__(self, rtyper, stats, opts=None, translate_support_code=False,
                  gcdescr=None):
         AbstractLLCPU.__init__(self, rtyper, stats, opts,
                                translate_support_code, gcdescr)
+        self.cpuinfo = CPUInfo()
 
     def set_debug(self, flag):
         return self.assembler.set_debug(flag)
@@ -46,6 +50,8 @@
         self.assembler = AssemblerARM(self, self.translate_support_code)
 
     def setup_once(self):
+        self.cpuinfo.arch_version = detect_arch_version()
+        self.cpuinfo.hf_abi = detect_hardfloat()
         self.assembler.setup_once()
 
     def finish_once(self):
@@ -89,7 +95,7 @@
         from rpython.jit.backend.arm.codebuilder import InstrBuilder
 
         for jmp, tgt in looptoken.compiled_loop_token.invalidate_positions:
-            mc = InstrBuilder(self.arch_version)
+            mc = InstrBuilder(self.cpuinfo.arch_version)
             mc.B_offs(tgt)
             mc.copy_to_raw_memory(jmp)
         # positions invalidated
@@ -113,10 +119,5 @@
 
 
 class CPU_ARM(AbstractARMCPU):
-    """ARM v7"""
-    backend_name = "armv7"
-
-class CPU_ARMv6(AbstractARMCPU):
-    """ ARM v6, uses hardfp ABI, requires vfp"""
-    arch_version = 6
-    backend_name = "armv6"
+    """ARM"""
+    backend_name = "arm"
diff --git a/rpython/jit/backend/arm/test/test_detect.py b/rpython/jit/backend/arm/test/test_detect.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/arm/test/test_detect.py
@@ -0,0 +1,48 @@
+import py
+from rpython.tool.udir import udir
+from rpython.jit.backend.arm.detect import detect_arch_version
+
+cpuinfo = "Processor : ARMv%d-compatible processor rev 7 (v6l)"""
+cpuinfo2 = """processor       : 0
+vendor_id       : GenuineIntel
+cpu family      : 6
+model           : 23
+model name      : Intel(R) Core(TM)2 Duo CPU     E8400  @ 3.00GHz
+stepping        : 10
+microcode       : 0xa07
+cpu MHz         : 2997.000
+cache size      : 6144 KB
+physical id     : 0
+siblings        : 2
+core id         : 0
+cpu cores       : 2
+apicid          : 0
+initial apicid  : 0
+fpu             : yes
+fpu_exception   : yes
+cpuid level     : 13
+wp              : yes
+flags           : fpu vme ...
+bogomips        : 5993.08
+clflush size    : 64
+cache_alignment : 64
+address sizes   : 36 bits physical, 48 bits virtual
+power management:
+"""
+
+def write_cpuinfo(info):
+    filepath = udir.join('get_arch_version')
+    filepath.write(info)
+    return str(filepath)
+
+
+def test_detect_arch_version():
+    # currently supported cases
+    for i in (6, 7, ):
+        filepath = write_cpuinfo(cpuinfo % i)
+        assert detect_arch_version(filepath) == i
+    # unsupported cases
+    assert detect_arch_version(write_cpuinfo(cpuinfo % 8)) == 7
+    py.test.raises(ValueError,
+            'detect_arch_version(write_cpuinfo(cpuinfo % 5))')
+    assert detect_arch_version(write_cpuinfo(cpuinfo2)) == 6
diff --git a/rpython/jit/backend/arm/test/test_runner.py b/rpython/jit/backend/arm/test/test_runner.py
--- a/rpython/jit/backend/arm/test/test_runner.py
+++ b/rpython/jit/backend/arm/test/test_runner.py
@@ -10,6 +10,7 @@
 from rpython.rtyper.annlowlevel import llhelper
 from rpython.jit.codewriter.effectinfo import EffectInfo
 from rpython.jit.metainterp.history import JitCellToken, TargetToken
+from rpython.jit.backend.arm.detect import detect_arch_version
 
 
 CPU = getcpuclass()
@@ -27,7 +28,8 @@
     bridge_loop_instructions = ['ldr', 'mov', 'nop', 'cmp', 'bge',
                                 'push', 'mov', 'mov', 'push', 'mov', 'mov',
                                 'blx', 'mov', 'mov', 'bx']
-    if CPU.backend_name.startswith('armv7'):
+    arch_version = detect_arch_version()
+    if arch_version == 7:
         bridge_loop_instructions = ['ldr', 'mov', 'nop', 'cmp', 'bge',
                                     'push', 'mov', 'mov', 'push', 'mov', 'mov',
                                     'blx', 'mov', 'mov', 'bx']
diff --git a/rpython/jit/backend/detect_cpu.py b/rpython/jit/backend/detect_cpu.py
--- a/rpython/jit/backend/detect_cpu.py
+++ b/rpython/jit/backend/detect_cpu.py
@@ -33,8 +33,8 @@
                 'x86_64': 'x86',
                 'amd64': 'x86',    # freebsd
                 'AMD64': 'x86',    # win64
-                'armv7l': 'armv7',
-                'armv6l': 'armv6',
+                'armv7l': 'arm',
+                'armv6l': 'arm',
                 }[mach]
     except KeyError:
         return mach
@@ -75,9 +75,7 @@
         return "rpython.jit.backend.x86.runner", "CPU_X86_64"
     elif backend_name == 'cli':
         return "rpython.jit.backend.cli.runner", "CliCPU"
-    elif backend_name.startswith('armv6'):
-        return "rpython.jit.backend.arm.runner", "CPU_ARMv6"
-    elif backend_name.startswith('armv7'):
+    elif backend_name.startswith('arm'):
         return "rpython.jit.backend.arm.runner", "CPU_ARM"
     else:
         raise ProcessorAutodetectError, (
diff --git a/rpython/jit/backend/llsupport/regalloc.py b/rpython/jit/backend/llsupport/regalloc.py
--- a/rpython/jit/backend/llsupport/regalloc.py
+++ b/rpython/jit/backend/llsupport/regalloc.py
@@ -740,6 +740,16 @@
     op = Fake(None)
     return op.is_comparison() or op.is_ovf()
 
+def valid_addressing_size(size):
+    return size == 1 or size == 2 or size == 4 or size == 8
+
+def get_scale(size):
+    assert valid_addressing_size(size)
+    if size < 4:
+        return size - 1         # 1, 2 => 0, 1
+    else:
+        return (size >> 2) + 1  # 4, 8 => 2, 3
+
 
 def not_implemented(msg):
     os.write(2, '[llsupport/regalloc] %s\n' % msg)
diff --git a/rpython/jit/backend/test/runner_test.py b/rpython/jit/backend/test/runner_test.py
--- a/rpython/jit/backend/test/runner_test.py
+++ b/rpython/jit/backend/test/runner_test.py
@@ -1057,6 +1057,28 @@
         r = self.cpu.bh_getinteriorfield_gc_i(a_box.getref_base(), 4, vsdescr)
         assert r == 4
 
+    def test_array_of_structs_all_sizes(self):
+        # x86 has special support that can be used for sizes
+        #   1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 16, 18, 20, 24, 32, 36, 40, 64, 72
+        for length in range(1, 75):
+            ITEM = lltype.FixedSizeArray(lltype.Char, length)
+            a_box, A = self.alloc_array_of(ITEM, 5)
+            a = a_box.getref(lltype.Ptr(A))
+            middle = length // 2
+            a[3][middle] = chr(65 + length)
+            fdescr = self.cpu.interiorfielddescrof(A, 'item%d' % middle)
+            r = self.execute_operation(rop.GETINTERIORFIELD_GC,
+                                       [a_box, BoxInt(3)],
+                                       'int', descr=fdescr)
+            r = r.getint()
+            assert r == 65 + length
+            self.execute_operation(rop.SETINTERIORFIELD_GC,
+                                   [a_box, BoxInt(2), BoxInt(r + 1)],
+                                   'void', descr=fdescr)
+            r1 = self.cpu.bh_getinteriorfield_gc_i(a_box.getref_base(), 2,
+                                                  fdescr)
+            assert r1 == r + 1
+
     def test_string_basic(self):
         s_box = self.alloc_string("hello\xfe")
         r = self.execute_operation(rop.STRLEN, [s_box], 'int')
@@ -3913,3 +3935,20 @@
         descr = self.cpu.get_latest_descr(frame)
         assert descr.identifier == 42
         assert not self.cpu.grab_exc_value(frame)
+
+    def test_setarrayitem_raw_short(self):
+        # setarrayitem_raw(140737353744432, 0, 30583, descr=<ArrayS 2>)
+        A = rffi.CArray(rffi.SHORT)
+        arraydescr = self.cpu.arraydescrof(A)
+        a = lltype.malloc(A, 2, flavor='raw')
+        a[0] = rffi.cast(rffi.SHORT, 666)
+        a[1] = rffi.cast(rffi.SHORT, 777)
+        addr = llmemory.cast_ptr_to_adr(a)
+        a_int = heaptracker.adr2int(addr)
+        print 'a_int:', a_int
+        self.execute_operation(rop.SETARRAYITEM_RAW,
+                               [ConstInt(a_int), ConstInt(0), ConstInt(-7654)],
+                               'void', descr=arraydescr)
+        assert rffi.cast(lltype.Signed, a[0]) == -7654
+        assert rffi.cast(lltype.Signed, a[1]) == 777
+        lltype.free(a, flavor='raw')
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -13,8 +13,9 @@
 from rpython.rtyper.annlowlevel import llhelper, cast_instance_to_gcref
 from rpython.rlib.jit import AsmInfo
 from rpython.jit.backend.model import CompiledLoopToken
-from rpython.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs, _get_scale,
-    gpr_reg_mgr_cls, xmm_reg_mgr_cls, _valid_addressing_size)
+from rpython.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs,
+    gpr_reg_mgr_cls, xmm_reg_mgr_cls)
+from rpython.jit.backend.llsupport.regalloc import (get_scale, valid_addressing_size)
 from rpython.jit.backend.x86.arch import (FRAME_FIXED_SIZE, WORD, IS_X86_64,
                                        JITFRAME_FIXED_SIZE, IS_X86_32,
                                        PASS_ON_MY_FRAME)
@@ -1523,7 +1524,7 @@
         base_loc, ofs_loc, size_loc, ofs, sign_loc = arglocs
         assert isinstance(ofs, ImmedLoc)
         assert isinstance(size_loc, ImmedLoc)
-        scale = _get_scale(size_loc.value)
+        scale = get_scale(size_loc.value)
         src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
         self.load_from_mem(resloc, src_addr, size_loc, sign_loc)
 
@@ -1537,22 +1538,50 @@
         src_addr = addr_add(base_loc, ofs_loc, ofs.value, 0)
         self.load_from_mem(resloc, src_addr, size_loc, sign_loc)
 
+    def _imul_const_scaled(self, mc, targetreg, sourcereg, itemsize):
+        """Produce one operation to do roughly
+               targetreg = sourcereg * itemsize
+           except that the targetreg may still need shifting by 0,1,2,3.
+        """
+        if (itemsize & 7) == 0:
+            shift = 3
+        elif (itemsize & 3) == 0:
+            shift = 2
+        elif (itemsize & 1) == 0:
+            shift = 1
+        else:
+            shift = 0
+        itemsize >>= shift
+        #
+        if valid_addressing_size(itemsize - 1):
+            mc.LEA_ra(targetreg, (sourcereg, sourcereg,
+                                  get_scale(itemsize - 1), 0))
+        elif valid_addressing_size(itemsize):
+            mc.LEA_ra(targetreg, (rx86.NO_BASE_REGISTER, sourcereg,
+                                  get_scale(itemsize), 0))
+        else:
+            mc.IMUL_rri(targetreg, sourcereg, itemsize)
+        #
+        return shift
+
     def _get_interiorfield_addr(self, temp_loc, index_loc, itemsize_loc,
                                 base_loc, ofs_loc):
         assert isinstance(itemsize_loc, ImmedLoc)
+        itemsize = itemsize_loc.value
         if isinstance(index_loc, ImmedLoc):
-            temp_loc = imm(index_loc.value * itemsize_loc.value)
-        elif _valid_addressing_size(itemsize_loc.value):
-            return AddressLoc(base_loc, index_loc, _get_scale(itemsize_loc.value), ofs_loc.value)
+            temp_loc = imm(index_loc.value * itemsize)
+            shift = 0
+        elif valid_addressing_size(itemsize):
+            temp_loc = index_loc
+            shift = get_scale(itemsize)
         else:
-            # XXX should not use IMUL in more cases, it can use a clever LEA
+            assert isinstance(index_loc, RegLoc)
             assert isinstance(temp_loc, RegLoc)
-            assert isinstance(index_loc, RegLoc)
             assert not temp_loc.is_xmm
-            self.mc.IMUL_rri(temp_loc.value, index_loc.value,
-                             itemsize_loc.value)
+            shift = self._imul_const_scaled(self.mc, temp_loc.value,
+                                            index_loc.value, itemsize)
         assert isinstance(ofs_loc, ImmedLoc)
-        return AddressLoc(base_loc, temp_loc, 0, ofs_loc.value)
+        return AddressLoc(base_loc, temp_loc, shift, ofs_loc.value)
 
     def genop_getinteriorfield_gc(self, op, arglocs, resloc):
         (base_loc, ofs_loc, itemsize_loc, fieldsize_loc,
@@ -1582,7 +1611,7 @@
         base_loc, ofs_loc, value_loc, size_loc, baseofs = arglocs
         assert isinstance(baseofs, ImmedLoc)
         assert isinstance(size_loc, ImmedLoc)
-        scale = _get_scale(size_loc.value)
+        scale = get_scale(size_loc.value)
         dest_addr = AddressLoc(base_loc, ofs_loc, scale, baseofs.value)
         self.save_into_mem(dest_addr, value_loc, size_loc)
 
@@ -2415,11 +2444,12 @@
         jmp_adr0 = self.mc.get_relative_pos()
 
         self.mc.MOV(eax, heap(nursery_free_adr))
-        shift = size2shift(itemsize)
-        if shift < 0:
-            self.mc.IMUL_rri(edi.value, varsizeloc.value, itemsize)
+        if valid_addressing_size(itemsize):
+            shift = get_scale(itemsize)
+        else:
+            shift = self._imul_const_scaled(self.mc, edi.value,
+                                            varsizeloc.value, itemsize)
             varsizeloc = edi
-            shift = 0
         # now varsizeloc is a register != eax.  The size of
         # the variable part of the array is (varsizeloc << shift)
         assert arraydescr.basesize >= self.gc_minimal_size_in_nursery
@@ -2523,13 +2553,5 @@
     os.write(2, '[x86/asm] %s\n' % msg)
     raise NotImplementedError(msg)
 
-def size2shift(size):
-    "Return a result 0..3 such that (1<<result) == size, or -1 if impossible"
-    if size == 1: return 0
-    if size == 2: return 1
-    if size == 4: return 2
-    if size == 8: return 3
-    return -1
-
 class BridgeAlreadyCompiled(Exception):
     pass
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1380,16 +1380,6 @@
     # i.e. the n'th word beyond the fixed frame size.
     return base_ofs + WORD * (position + JITFRAME_FIXED_SIZE)
 
-def _valid_addressing_size(size):
-    return size == 1 or size == 2 or size == 4 or size == 8
-
-def _get_scale(size):
-    assert _valid_addressing_size(size)
-    if size < 4:
-        return size - 1         # 1, 2 => 0, 1
-    else:
-        return (size >> 2) + 1  # 4, 8 => 2, 3
-
 def not_implemented(msg):
     os.write(2, '[x86/regalloc] %s\n' % msg)
     raise NotImplementedError(msg)
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -521,17 +521,6 @@
 
         return func_with_new_name(INSN, "INSN_" + name)
 
-    def _16_bit_binaryop(name):
-        def INSN(self, loc1, loc2):
-            # Select 16-bit operand mode
-            self.writechar('\x66')
-            # XXX: Hack to let immediate() in rx86 know to do a 16-bit encoding
-            self._use_16_bit_immediate = True
-            getattr(self, name)(loc1, loc2)
-            self._use_16_bit_immediate = False
-
-        return INSN
-
     def _addr_as_reg_offset(self, addr):
         # Encodes a (64-bit) address as an offset from the scratch register.
         # If we are within a "reuse_scratch_register" block, we remember the
@@ -616,10 +605,10 @@
     NEG = _unaryop('NEG')
 
     CMP = _binaryop('CMP')
-    CMP16 = _16_bit_binaryop('CMP')
+    CMP16 = _binaryop('CMP16')
     MOV = _binaryop('MOV')
     MOV8 = _binaryop('MOV8')
-    MOV16 = _16_bit_binaryop('MOV')
+    MOV16 = _binaryop('MOV16')
     MOVZX8 = _binaryop('MOVZX8')
     MOVSX8 = _binaryop('MOVSX8')
     MOVZX16 = _binaryop('MOVZX16')
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -125,10 +125,7 @@
     elif width == 'q' and mc.WORD == 8:
         mc.writeimm64(immediate)
     else:
-        if mc._use_16_bit_immediate:
-            mc.writeimm16(immediate)
-        else:
-            mc.writeimm32(immediate)
+        mc.writeimm32(immediate)
     return 0
 
 def immediate(argnum, width='i'):
@@ -282,16 +279,20 @@
 # (the immediate address itself must be explicitely encoded as well,
 # with immediate(argnum)).
 
-def encode_abs(mc, _1, _2, orbyte):
+ at specialize.arg(2)
+def encode_abs(mc, immediate, _, orbyte):
     # expands to either '\x05' on 32-bit, or '\x04\x25' on 64-bit
     if mc.WORD == 8:
         mc.writechar(chr(0x04 | orbyte))
         mc.writechar(chr(0x25))
     else:
         mc.writechar(chr(0x05 | orbyte))
+    # followed by an immediate, always 32 bits
+    mc.writeimm32(immediate)
     return 0
 
-abs_ = encode_abs, 0, None, None
+def abs_(argnum):
+    return encode_abs, argnum, None, None
 
 # ____________________________________________________________
 # For 64-bits mode: the REX.W, REX.R, REX.X, REG.B prefixes
@@ -305,9 +306,6 @@
 def encode_rex(mc, rexbyte, basevalue, orbyte):
     if mc.WORD == 8:
         assert 0 <= rexbyte < 8
-        # XXX: Hack. Ignore REX.W if we are using 16-bit operands
-        if mc._use_16_bit_immediate:
-            basevalue &= ~REX_W
         if basevalue != 0 or rexbyte != 0:
             if basevalue == 0:
                 basevalue = 0x40
@@ -374,9 +372,8 @@
     INSN_br = insn(rex_w, chr(base+1), register(2,8), stack_bp(1))
     INSN_rb = insn(rex_w, chr(base+3), register(1,8), stack_bp(2))
     INSN_rm = insn(rex_w, chr(base+3), register(1,8), mem_reg_plus_const(2))
-    INSN_rj = insn(rex_w, chr(base+3), register(1,8), abs_, immediate(2))
-    INSN_ji8 = insn(rex_w, '\x83', orbyte(base), abs_, immediate(1),
-                    immediate(2,'b'))
+    INSN_rj = insn(rex_w, chr(base+3), register(1,8), abs_(2))
+    INSN_ji8 = insn(rex_w, '\x83', orbyte(base), abs_(1), immediate(2,'b'))
     INSN_mi8 = insn(rex_w, '\x83', orbyte(base), mem_reg_plus_const(1),
                     immediate(2,'b'))
     INSN_bi8 = insn(rex_w, '\x83', orbyte(base), stack_bp(1), immediate(2,'b'))
@@ -449,9 +446,6 @@
 class AbstractX86CodeBuilder(object):
     """Abstract base class."""
 
-    # Used by the 16-bit version of instructions
-    _use_16_bit_immediate = False
-
     def writechar(self, char):
         raise NotImplementedError
 
@@ -489,15 +483,13 @@
     CMP_mi = select_8_or_32_bit_immed(CMP_mi8, CMP_mi32)
     CMP_mr = insn(rex_w, '\x39', register(2, 8), mem_reg_plus_const(1))
 
-    CMP_ji8 = insn(rex_w, '\x83', orbyte(7<<3), abs_,
-                   immediate(1), immediate(2, 'b'))
-    CMP_ji32 = insn(rex_w, '\x81', orbyte(7<<3), abs_,
-                    immediate(1), immediate(2))
+    CMP_ji8 = insn(rex_w, '\x83', orbyte(7<<3), abs_(1), immediate(2, 'b'))
+    CMP_ji32 = insn(rex_w, '\x81', orbyte(7<<3), abs_(1), immediate(2))
     CMP_ji = select_8_or_32_bit_immed(CMP_ji8, CMP_ji32)
-    CMP_jr = insn(rex_w, '\x39', register(2, 8), abs_, immediate(1))
+    CMP_jr = insn(rex_w, '\x39', register(2, 8), abs_(1))
 
     CMP32_mi = insn(rex_nw, '\x81', orbyte(7<<3), mem_reg_plus_const(1), immediate(2))
-
+    CMP16_mi = insn('\x66', rex_nw, '\x81', orbyte(7<<3), mem_reg_plus_const(1), immediate(2, 'h'))
     CMP8_ri = insn(rex_fw, '\x80', byte_register(1), '\xF8', immediate(2, 'b'))
 
     AND8_rr = insn(rex_fw, '\x20', byte_register(1), byte_register(2,8), '\xC0')
@@ -505,7 +497,7 @@
     OR8_rr = insn(rex_fw, '\x08', byte_register(1), byte_register(2,8), '\xC0')
     OR8_mi = insn(rex_nw, '\x80', orbyte(1<<3), mem_reg_plus_const(1),
                   immediate(2, 'b'))
-    OR8_ji = insn(rex_nw, '\x80', orbyte(1<<3), abs_, immediate(1),
+    OR8_ji = insn(rex_nw, '\x80', orbyte(1<<3), abs_(1),
                   immediate(2, 'b'))
 
     NEG_r = insn(rex_w, '\xF7', register(1), '\xD8')
@@ -556,7 +548,7 @@
     LEA32_rb = insn(rex_w, '\x8D', register(1,8),stack_bp(2,force_32bits=True))
     LEA_ra = insn(rex_w, '\x8D', register(1, 8), mem_reg_plus_scaled_reg_plus_const(2))
     LEA_rm = insn(rex_w, '\x8D', register(1, 8), mem_reg_plus_const(2))
-    LEA_rj = insn(rex_w, '\x8D', register(1, 8), abs_, immediate(2))
+    LEA_rj = insn(rex_w, '\x8D', register(1, 8), abs_(2))
 
     CALL_l = insn('\xE8', relative(1))
     CALL_r = insn(rex_nw, '\xFF', register(1), chr(0xC0 | (2<<3)))
@@ -583,11 +575,11 @@
 
     TEST8_mi = insn(rex_nw, '\xF6', orbyte(0<<3), mem_reg_plus_const(1), immediate(2, 'b'))
     TEST8_bi = insn(rex_nw, '\xF6', orbyte(0<<3), stack_bp(1), immediate(2, 'b'))
-    TEST8_ji = insn(rex_nw, '\xF6', orbyte(0<<3), abs_, immediate(1), immediate(2, 'b'))
+    TEST8_ji = insn(rex_nw, '\xF6', orbyte(0<<3), abs_(1), immediate(2, 'b'))
     TEST_rr = insn(rex_w, '\x85', register(2,8), register(1), '\xC0')
 
     BTS_mr = insn(rex_w, '\x0F\xAB', register(2,8), mem_reg_plus_const(1))
-    BTS_jr = insn(rex_w, '\x0F\xAB', register(2,8), abs_, immediate(1))
+    BTS_jr = insn(rex_w, '\x0F\xAB', register(2,8), abs_(1))
 
     # x87 instructions
     FSTPL_b = insn('\xDD', orbyte(3<<3), stack_bp(1)) # rffi.DOUBLE ('as' wants L??)
@@ -690,10 +682,12 @@
 
 def define_modrm_modes(insnname_template, before_modrm, after_modrm=[], regtype='GPR'):
     def add_insn(code, *modrm):
-        args = before_modrm + list(modrm) + after_modrm
+        args = before_modrm + list(modrm)
         methname = insnname_template.replace('*', code)
-        if methname.endswith('_rr') or methname.endswith('_xx'):
+        if (methname.endswith('_rr') or methname.endswith('_xx')
+                or methname.endswith('_ri')):
             args.append('\xC0')
+        args += after_modrm
 
         if regtype == 'XMM':
             insn_func = xmminsn(*args)
@@ -718,7 +712,7 @@
     add_insn('s', stack_sp(modrm_argnum))
     add_insn('m', mem_reg_plus_const(modrm_argnum))
     add_insn('a', mem_reg_plus_scaled_reg_plus_const(modrm_argnum))
-    add_insn('j', abs_, immediate(modrm_argnum))
+    add_insn('j', abs_(modrm_argnum))
 
 # Define a regular MOV, and a variant MOV32 that only uses the low 4 bytes of a
 # register
@@ -729,6 +723,8 @@
 
 define_modrm_modes('MOV8_*r', [rex_fw, '\x88', byte_register(2, 8)], regtype='BYTE')
 define_modrm_modes('MOV8_*i', [rex_fw, '\xC6', orbyte(0<<3)], [immediate(2, 'b')], regtype='BYTE')
+define_modrm_modes('MOV16_*r', ['\x66', rex_nw, '\x89', register(2, 8)])
+define_modrm_modes('MOV16_*i', ['\x66', rex_nw, '\xC7', orbyte(0<<3)], [immediate(2, 'h')])
 
 define_modrm_modes('MOVZX8_r*', [rex_w, '\x0F\xB6', register(1, 8)], regtype='BYTE')
 define_modrm_modes('MOVSX8_r*', [rex_w, '\x0F\xBE', register(1, 8)], regtype='BYTE')
@@ -766,7 +762,7 @@
     #
     assert insnname_template.count('*') == 1
     add_insn('x', register(2), '\xC0')
-    add_insn('j', abs_, immediate(2))
+    add_insn('j', abs_(2))
     add_insn('m', mem_reg_plus_const(2))
 
 define_pxmm_insn('PADDQ_x*',     '\xD4')
diff --git a/rpython/jit/backend/x86/test/test_regloc.py b/rpython/jit/backend/x86/test/test_regloc.py
--- a/rpython/jit/backend/x86/test/test_regloc.py
+++ b/rpython/jit/backend/x86/test/test_regloc.py
@@ -1,4 +1,5 @@
 import struct, sys
+from rpython.jit.backend.x86.rx86 import R
 from rpython.jit.backend.x86.regloc import *
 from rpython.jit.backend.x86.test.test_rx86 import CodeBuilder32, CodeBuilder64, assert_encodes_as
 from rpython.jit.backend.x86.assembler import heap
@@ -15,36 +16,49 @@
 cb32 = LocationCodeBuilder32
 cb64 = LocationCodeBuilder64
 
+def test_mov_8():
+    assert_encodes_as(cb32, "MOV8_ri", (R.cl, 25), '\xB1\x19')
+
 def test_mov_16():
+    # only 'MOV16_*r' and 'MOV16_*i' are supported
     # 32-bit
     assert_encodes_as(cb32, "MOV16", (ecx, ebx), '\x66\x89\xD9')
-    assert_encodes_as(cb32, "MOV16", (ecx, ImmedLoc(12345)), '\x66\xB9\x39\x30')
-
+    assert_encodes_as(cb32, "MOV16",
+                      (AddressLoc(ecx, ImmedLoc(16), 0, 0), ebx),
+                      '\x66\x89\x59\x10')
     # 64-bit
     assert_encodes_as(cb64, "MOV16", (r8, ebx), '\x66\x41\x89\xD8')  # 11 011 000
     assert_encodes_as(cb64, "MOV16", (ebx, r8), '\x66\x44\x89\xC3')  # 11 000 011
-    assert_encodes_as(cb64, "MOV16", (ecx, ebx), '\x66\x40\x89\xD9')
-    assert_encodes_as(cb64, "MOV16", (ecx, ImmedLoc(12345)), '\x66\xB9\x39\x30')
+    assert_encodes_as(cb64, "MOV16", (ecx, ebx), '\x66\x89\xD9')
     # for the next case we don't pick the most efficient encoding, but well
-    expected = '\x66\x40\xC7\xC1\xC7\xCF'  # could be '\x66\xB9\xC7\xCF'
+    expected = '\x66\xC7\xC1\x39\x30'      # could be '\x66\xB9\x39\x30'
+    assert_encodes_as(cb64, "MOV16", (ecx, ImmedLoc(12345)), expected)
+    # for the next case we don't pick the most efficient encoding, but well
+    expected = '\x66\xC7\xC1\xC7\xCF'      # could be '\x66\xB9\xC7\xCF'
     assert_encodes_as(cb64, "MOV16", (ecx, ImmedLoc(-12345)), expected)
-    assert_encodes_as(cb64, "MOV16", (r9, ImmedLoc(12345)), '\x66\x41\xB9\x39\x30')
+    # for the next case we don't pick the most efficient encoding, but well
+    expected = '\x66\x41\xC7\xC1\x39\x30'  # could be '\x66\x41\xB9\x39\x30'
+    assert_encodes_as(cb64, "MOV16", (r9, ImmedLoc(12345)), expected)
     # for the next case we don't pick the most efficient encoding, but well
     expected = '\x66\x41\xC7\xC1\xC7\xCF'  # could be '\x66\x41\xB9\xC7\xCF'
     assert_encodes_as(cb64, "MOV16", (r9, ImmedLoc(-12345)), expected)
-    assert_encodes_as(cb64, "MOV16", (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)), '\x66\x41\xC7\x45\x00\x39\x30')
+    assert_encodes_as(cb64, "MOV16",
+                      (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)),
+                      '\x66\x41\xC7\x45\x00\x39\x30')
 
 def test_cmp_16():
+    # only 'CMP16_mi' is supported
     # 32-bit
-    assert_encodes_as(cb32, "CMP16", (ecx, ebx), '\x66\x39\xD9')
-    assert_encodes_as(cb32, "CMP16", (ecx, ImmedLoc(12345)), '\x66\x81\xF9\x39\x30')
-
+    assert_encodes_as(cb32, "CMP16",
+                      (AddressLoc(ecx, ImmedLoc(0), 0, 0), ImmedLoc(21324)),
+                      '\x66\x81\x39\x4c\x53')
+    assert_encodes_as(cb32, "CMP16",
+                      (AddressLoc(esi, ImmedLoc(2), 0, 0), ImmedLoc(-12345)),
+                      '\x66\x81\x7e\x02\xc7\xcf')
     # 64-bit
-    assert_encodes_as(cb64, "CMP16", (r8, ebx), '\x66\x41\x39\xD8')  # 11 011 000
-    assert_encodes_as(cb64, "CMP16", (ebx, r8), '\x66\x44\x39\xC3')  # 11 000 011
-    assert_encodes_as(cb64, "CMP16", (ecx, ebx), '\x66\x40\x39\xD9')
-    assert_encodes_as(cb64, "CMP16", (ecx, ImmedLoc(12345)), '\x66\x40\x81\xF9\x39\x30')
-    assert_encodes_as(cb64, "CMP16", (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)), '\x66\x41\x81\x7D\x00\x39\x30')
+    assert_encodes_as(cb64, "CMP16",
+                      (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)),
+                      '\x66\x41\x81\x7D\x00\x39\x30')
 
 def test_relocation():
     from rpython.rtyper.lltypesystem import lltype, rffi
diff --git a/rpython/memory/gc/env.py b/rpython/memory/gc/env.py
--- a/rpython/memory/gc/env.py
+++ b/rpython/memory/gc/env.py
@@ -149,6 +149,10 @@
     else:
         data = ''.join(data)
         linepos = 0
+        # Currently on ARM-linux we won't find any information about caches in
+        # cpuinfo
+        if _detect_arm_cpu(data):
+            return -1
         while True:
             start = _findend(data, '\ncache size', linepos)
             if start < 0:
@@ -201,6 +205,11 @@
         pos += 1
     return pos
 
+def _detect_arm_cpu(data):
+    # check for the presence of a 'Processor' entry
+    p = _findend(data, 'Processor', 0)
+    return p >= 0 and _findend(data, 'ARMv', p) > 0
+
 # ---------- Darwin ----------
 
 sysctlbyname = rffi.llexternal('sysctlbyname',
@@ -270,7 +279,7 @@
 def best_nursery_size_for_L2cache(L2cache):
     # Heuristically, the best nursery size to choose is about half
     # of the L2 cache.
-    if L2cache > 1024 * 1024: # we don't want to have nursery estimated
+    if L2cache > 2 * 1024 * 1024: # we don't want to have nursery estimated
         # on L2 when L3 is present
         return L2cache // 2
     else:
diff --git a/rpython/memory/gc/minimark.py b/rpython/memory/gc/minimark.py
--- a/rpython/memory/gc/minimark.py
+++ b/rpython/memory/gc/minimark.py
@@ -130,6 +130,7 @@
 FORWARDSTUB = lltype.GcStruct('forwarding_stub',
                               ('forw', llmemory.Address))
 FORWARDSTUBPTR = lltype.Ptr(FORWARDSTUB)
+NURSARRAY = lltype.Array(llmemory.Address)
 
 # ____________________________________________________________
 
@@ -263,7 +264,7 @@
         self.nursery_top  = NULL
         self.nursery_real_top = NULL
         self.debug_tiny_nursery = -1
-        self.debug_rotating_nurseries = None
+        self.debug_rotating_nurseries = lltype.nullptr(NURSARRAY)
         self.extra_threshold = 0
         #
         # The ArenaCollection() handles the nonmovable objects allocation.
@@ -350,8 +351,6 @@
             # hacking at the current nursery position in collect_and_reserve().
             if newsize <= 0:
                 newsize = env.estimate_best_nursery_size()
-                #         4*1024*1024   # fixed to 4MB by default
-                #        (it was env.estimate_best_nursery_size())
                 if newsize <= 0:
                     newsize = defaultsize
             if newsize < minsize:
@@ -471,23 +470,32 @@
             # and use them alternatively, while mprotect()ing the unused
             # ones to detect invalid access.
             debug_start("gc-debug")
-            self.debug_rotating_nurseries = []
-            for i in range(22):
+            self.debug_rotating_nurseries = lltype.malloc(
+                NURSARRAY, 22, flavor='raw', track_allocation=False)
+            i = 0
+            while i < 22:
                 nurs = self._alloc_nursery()
                 llarena.arena_protect(nurs, self._nursery_memory_size(), True)
-                self.debug_rotating_nurseries.append(nurs)
+                self.debug_rotating_nurseries[i] = nurs
+                i += 1
             debug_print("allocated", len(self.debug_rotating_nurseries),
                         "extra nurseries")
             debug_stop("gc-debug")
 
     def debug_rotate_nursery(self):
-        if self.debug_rotating_nurseries is not None:
+        if self.debug_rotating_nurseries:
             debug_start("gc-debug")
             oldnurs = self.nursery
             llarena.arena_protect(oldnurs, self._nursery_memory_size(), True)
-            self.debug_rotating_nurseries.append(oldnurs)
             #
-            newnurs = self.debug_rotating_nurseries.pop(0)
+            newnurs = self.debug_rotating_nurseries[0]
+            i = 0
+            while i < len(self.debug_rotating_nurseries) - 1:
+                self.debug_rotating_nurseries[i] = (
+                    self.debug_rotating_nurseries[i + 1])
+                i += 1
+            self.debug_rotating_nurseries[i] = oldnurs
+            #
             llarena.arena_protect(newnurs, self._nursery_memory_size(), False)
             self.nursery = newnurs
             self.nursery_top = self.nursery + self.initial_cleanup
diff --git a/rpython/memory/gc/test/test_env.py b/rpython/memory/gc/test/test_env.py
--- a/rpython/memory/gc/test/test_env.py
+++ b/rpython/memory/gc/test/test_env.py
@@ -161,3 +161,23 @@
 """)
     result = env.get_L2cache_linux2(str(filepath))
     assert result == 3072 * 1024
+
+def test_estimate_best_nursery_size_linux2_arm():
+    filepath = udir.join('estimate_best_nursery_size_linux2')
+    filepath.write("""\
+Processor       : ARMv6-compatible processor rev 7 (v6l)
+# this is not actually from cpuinfo, but here for the test
+cache size      : 3072 KB
+...
+""")
+    result = env.get_L2cache_linux2(str(filepath))
+    assert result == -1
+
+def test__detect_arm():
+    assert env._detect_arm_cpu("Processor       : ARMv6-compatible processor rev 7 (v6l)")
+    assert not env._detect_arm_cpu("""\
+processor   : 0
+vendor_id   : GenuineIntel
+cpu family  : 6
+model       : 37
+""")
diff --git a/rpython/memory/gctransform/framework.py b/rpython/memory/gctransform/framework.py
--- a/rpython/memory/gctransform/framework.py
+++ b/rpython/memory/gctransform/framework.py
@@ -619,6 +619,12 @@
         func = getattr(graph, 'func', None)
         if func and getattr(func, '_gc_no_collect_', False):
             if self.collect_analyzer.analyze_direct_call(graph):
+                print '!'*79
+                ca = CollectAnalyzer(self.translator)
+                ca.verbose = True
+                ca.analyze_direct_call(graph)
+                # ^^^ for the dump of which operation in which graph actually
+                # causes it to return True
                 raise Exception("'no_collect' function can trigger collection:"
                                 " %s" % func)
 
diff --git a/rpython/rlib/rwin32.py b/rpython/rlib/rwin32.py
--- a/rpython/rlib/rwin32.py
+++ b/rpython/rlib/rwin32.py
@@ -5,6 +5,7 @@
 import os
 import errno
 
+from rpython.rtyper.module.ll_os_environ import make_env_impls
 from rpython.rtyper.tool import rffi_platform
 from rpython.tool.udir import udir
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
@@ -390,3 +391,5 @@
                 raise lastWindowsError('os_kill failed to terminate process')
         finally:
             CloseHandle(handle)
+
+    _wenviron_items, _wgetenv, _wputenv = make_env_impls(win32=True)
diff --git a/rpython/rlib/test/test_rwin32.py b/rpython/rlib/test/test_rwin32.py
--- a/rpython/rlib/test/test_rwin32.py
+++ b/rpython/rlib/test/test_rwin32.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 import os, py
 if os.name != 'nt':
     py.test.skip('tests for win32 only')
@@ -47,3 +48,13 @@
     rwin32.CloseHandle(handle)
     assert proc.wait() == signal.SIGTERM
  
+ at py.test.mark.dont_track_allocations('putenv intentionally keeps strings alive')
+def test_wenviron():
+    name, value = u'PYPY_TEST_日本', u'foobar日本'
+    rwin32._wputenv(name, value)
+    assert rwin32._wgetenv(name) == value
+    env = dict(rwin32._wenviron_items())
+    assert env[name] == value
+    for key, value in env.iteritems():
+        assert type(key) is unicode
+        assert type(value) is unicode
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1059,6 +1059,7 @@
             self.buf = str2charp(value)
         else:
             self.buf = lltype.nullptr(CCHARP.TO)
+    __init__._annenforceargs_ = [None, annmodel.SomeString(can_be_None=True)]
     def __enter__(self):
         return self.buf
     def __exit__(self, *args):
@@ -1072,6 +1073,8 @@
             self.buf = unicode2wcharp(value)
         else:
             self.buf = lltype.nullptr(CWCHARP.TO)
+    __init__._annenforceargs_ = [None,
+                                 annmodel.SomeUnicodeString(can_be_None=True)]
     def __enter__(self):
         return self.buf
     def __exit__(self, *args):
diff --git a/rpython/rtyper/module/ll_os.py b/rpython/rtyper/module/ll_os.py
--- a/rpython/rtyper/module/ll_os.py
+++ b/rpython/rtyper/module/ll_os.py
@@ -7,14 +7,15 @@
 
 import os, sys, errno
 import py
-from rpython.rtyper.module.support import OOSupport
+from rpython.rtyper.module.support import (
+    _WIN32, OOSupport, StringTraits, UnicodeTraits, underscore_on_windows)
 from rpython.tool.sourcetools import func_renamer
 from rpython.rlib.rarithmetic import r_longlong
 from rpython.rtyper.extfunc import (
     BaseLazyRegistering, register_external)
 from rpython.rtyper.extfunc import registering, registering_if, extdef
 from rpython.annotator.model import (
-    SomeInteger, SomeString, SomeTuple, SomeFloat, SomeUnicodeString)
+    SomeInteger, SomeString, SomeTuple, SomeFloat, s_Str0, s_Unicode0)
 from rpython.annotator.model import s_ImpossibleValue, s_None, s_Bool
 from rpython.rtyper.lltypesystem import rffi
 from rpython.rtyper.lltypesystem import lltype
@@ -25,8 +26,8 @@
 from rpython.rtyper.lltypesystem.rstr import STR
 from rpython.rlib.objectmodel import specialize
 
-str0 = SomeString(no_nul=True)
-unicode0 = SomeUnicodeString(no_nul=True)
+str0 = s_Str0
+unicode0 = s_Unicode0
 
 def monkeypatch_rposix(posixfunc, unicodefunc, signature):
     func_name = posixfunc.__name__
@@ -66,42 +67,6 @@
     # Monkeypatch the function in rpython.rlib.rposix
     setattr(rposix, func_name, new_func)
 
-class StringTraits:
-    str = str
-    str0 = str0
-    CHAR = rffi.CHAR
-    CCHARP = rffi.CCHARP
-    charp2str = staticmethod(rffi.charp2str)
-    str2charp = staticmethod(rffi.str2charp)
-    free_charp = staticmethod(rffi.free_charp)
-    scoped_alloc_buffer = staticmethod(rffi.scoped_alloc_buffer)
-
-    @staticmethod
-    def posix_function_name(name):
-        return underscore_on_windows + name
-
-    @staticmethod
-    def ll_os_name(name):
-        return 'll_os.ll_os_' + name
-
-class UnicodeTraits:
-    str = unicode
-    str0 = unicode0
-    CHAR = rffi.WCHAR_T
-    CCHARP = rffi.CWCHARP
-    charp2str = staticmethod(rffi.wcharp2unicode)
-    str2charp = staticmethod(rffi.unicode2wcharp)
-    free_charp = staticmethod(rffi.free_wcharp)
-    scoped_alloc_buffer = staticmethod(rffi.scoped_alloc_unicodebuffer)
-
-    @staticmethod
-    def posix_function_name(name):
-        return underscore_on_windows + 'w' + name
-
-    @staticmethod
-    def ll_os_name(name):
-        return 'll_os.ll_os_w' + name
-
 def registering_str_unicode(posixfunc, condition=True):
     if not condition or posixfunc is None:
         return registering(None, condition=False)
@@ -129,16 +94,6 @@
 
 posix = __import__(os.name)
 
-if sys.platform.startswith('win'):
-    _WIN32 = True
-else:
-    _WIN32 = False
-
-if _WIN32:
-    underscore_on_windows = '_'
-else:
-    underscore_on_windows = ''
-
 includes = []
 if not _WIN32:
     # XXX many of these includes are not portable at all
diff --git a/rpython/rtyper/module/ll_os_environ.py b/rpython/rtyper/module/ll_os_environ.py
--- a/rpython/rtyper/module/ll_os_environ.py
+++ b/rpython/rtyper/module/ll_os_environ.py
@@ -1,36 +1,42 @@
-import os, sys
+import os
+import sys
 from rpython.annotator import model as annmodel
 from rpython.rtyper.controllerentry import Controller
 from rpython.rtyper.extfunc import register_external
 from rpython.rtyper.lltypesystem import rffi, lltype
-from rpython.rtyper.module import ll_os
-from rpython.rlib import rposix
+from rpython.rtyper.module.support import _WIN32, StringTraits, UnicodeTraits
+from rpython.translator.tool.cbuild import ExternalCompilationInfo
 
-str0 = ll_os.str0
+str0 = annmodel.s_Str0
 
 # ____________________________________________________________
 #
-# Annotation support to control access to 'os.environ' in the RPython program
+# Annotation support to control access to 'os.environ' in the RPython
+# program
 
 class OsEnvironController(Controller):
     knowntype = os.environ.__class__
 
     def convert(self, obj):
-        return None     # 'None' is good enough, there is only one os.environ
+        # 'None' is good enough, there is only one os.environ
+        return None
 
     def getitem(self, obj, key):
-        # in the RPython program reads of 'os.environ[key]' are redirected here
+        # in the RPython program reads of 'os.environ[key]' are
+        # redirected here
         result = r_getenv(key)
         if result is None:
             raise KeyError
         return result
 
     def setitem(self, obj, key, value):
-        # in the RPython program, 'os.environ[key] = value' is redirected here
+        # in the RPython program, 'os.environ[key] = value' is
+        # redirected here
         r_putenv(key, value)
 
     def delitem(self, obj, key):
-        # in the RPython program, 'del os.environ[key]' is redirected here
+        # in the RPython program, 'del os.environ[key]' is redirected
+        # here
         absent = r_getenv(key) is None
         # Always call unsetenv(), to get eventual OSErrors
         r_unsetenv(key)
@@ -38,102 +44,23 @@
             raise KeyError
 
     def get_keys(self, obj):
-        # 'os.environ.keys' is redirected here - note that it's the getattr
-        # that arrives here, not the actual method call!
+        # 'os.environ.keys' is redirected here - note that it's the
+        # getattr that arrives here, not the actual method call!
         return r_envkeys
 
     def get_items(self, obj):
-        # 'os.environ.items' is redirected here (not the actual method call!)
+        # 'os.environ.items' is redirected here (not the actual method
+        # call!)
         return r_envitems
 
     def get_get(self, obj):
-        # 'os.environ.get' is redirected here (not the actual method call!)
+        # 'os.environ.get' is redirected here (not the actual method
+        # call!)
         return r_getenv
 
 # ____________________________________________________________
-#
-# Lower-level interface: dummy placeholders and external registations
-
-def r_getenv(name):
-    just_a_placeholder     # should return None if name not found
-
-os_getenv = rffi.llexternal('getenv', [rffi.CCHARP], rffi.CCHARP, threadsafe=False)
-
-def getenv_llimpl(name):
-    l_name = rffi.str2charp(name)
-    l_result = os_getenv(l_name)
-    if l_result:
-        result = rffi.charp2str(l_result)
-    else:
-        result = None
-    rffi.free_charp(l_name)
-    return result
-
-register_external(r_getenv, [str0], annmodel.SomeString(can_be_None=True, no_nul=True),
-                  export_name='ll_os.ll_os_getenv',
-                  llimpl=getenv_llimpl)
-
-# ____________________________________________________________
-
-def r_putenv(name, value):
-    just_a_placeholder
-
-class EnvKeepalive:
-    pass
-envkeepalive = EnvKeepalive()
-envkeepalive.byname = {}
-
-os_putenv = rffi.llexternal('putenv', [rffi.CCHARP], rffi.INT)
-
-def putenv_llimpl(name, value):
-    l_string = rffi.str2charp('%s=%s' % (name, value))
-    error = rffi.cast(lltype.Signed, os_putenv(l_string))
-    if error:
-        rffi.free_charp(l_string)
-        raise OSError(rposix.get_errno(), "os_putenv failed")
-    # keep 'l_string' alive - we know that the C library needs it
-    # until the next call to putenv() with the same 'name'.
-    l_oldstring = envkeepalive.byname.get(name, lltype.nullptr(rffi.CCHARP.TO))
-    envkeepalive.byname[name] = l_string
-    if l_oldstring:
-        rffi.free_charp(l_oldstring)
-
-register_external(r_putenv, [str0, str0], annmodel.s_None,
-                  export_name='ll_os.ll_os_putenv',
-                  llimpl=putenv_llimpl)
-
-# ____________________________________________________________
-
-def r_unsetenv(name):
-    # default implementation for platforms without a real unsetenv()
-    r_putenv(name, '')
-
-if hasattr(__import__(os.name), 'unsetenv'):
-    os_unsetenv = rffi.llexternal('unsetenv', [rffi.CCHARP], rffi.INT)
-
-    def unsetenv_llimpl(name):
-        l_name = rffi.str2charp(name)
-        error = rffi.cast(lltype.Signed, os_unsetenv(l_name))
-        rffi.free_charp(l_name)
-        if error:
-            raise OSError(rposix.get_errno(), "os_unsetenv failed")
-        try:
-            l_oldstring = envkeepalive.byname[name]
-        except KeyError:
-            pass
-        else:
-            del envkeepalive.byname[name]
-            rffi.free_charp(l_oldstring)
-
-    register_external(r_unsetenv, [str0], annmodel.s_None,
-                      export_name='ll_os.ll_os_unsetenv',
-                      llimpl=unsetenv_llimpl)
-
-# ____________________________________________________________
 # Access to the 'environ' external variable
 
-from rpython.translator.tool.cbuild import ExternalCompilationInfo
-
 if sys.platform.startswith('darwin'):
     CCHARPPP = rffi.CArrayPtr(rffi.CCHARPP)
     _os_NSGetEnviron = rffi.llexternal(
@@ -142,17 +69,21 @@
         )
     def os_get_environ():
         return _os_NSGetEnviron()[0]
-elif sys.platform.startswith('win'):
+elif _WIN32:
+    eci = ExternalCompilationInfo(includes=['stdlib.h'])
+    CWCHARPP = lltype.Ptr(lltype.Array(rffi.CWCHARP, hints={'nolength': True}))
+
     os_get_environ, _os_set_environ = rffi.CExternVariable(
-        rffi.CCHARPP,
-        '_environ',
-        ExternalCompilationInfo(includes=['stdlib.h']))
+        rffi.CCHARPP, '_environ', eci)
+    get__wenviron, _set__wenviron = rffi.CExternVariable(
+        CWCHARPP, '_wenviron', eci, c_type='wchar_t **')
 else:
-    os_get_environ, _os_set_environ = rffi.CExternVariable(rffi.CCHARPP,
-                                                           'environ',
-                                                           ExternalCompilationInfo())
+    os_get_environ, _os_set_environ = rffi.CExternVariable(
+        rffi.CCHARPP, 'environ', ExternalCompilationInfo())
 
 # ____________________________________________________________
+#
+# Lower-level interface: dummy placeholders and external registations
 
 def r_envkeys():
     just_a_placeholder
@@ -178,18 +109,109 @@
 def r_envitems():
     just_a_placeholder
 
-def envitems_llimpl():
-    environ = os_get_environ()
-    result = []
-    i = 0
-    while environ[i]:
-        name_value = rffi.charp2str(environ[i])
-        p = name_value.find('=')
-        if p >= 0:
-            result.append((name_value[:p], name_value[p+1:]))
-        i += 1
-    return result
+def r_getenv(name):
+    just_a_placeholder     # should return None if name not found
+
+def r_putenv(name, value):
+    just_a_placeholder
+
+os_getenv = rffi.llexternal('getenv', [rffi.CCHARP], rffi.CCHARP,
+                            threadsafe=False)
+os_putenv = rffi.llexternal('putenv', [rffi.CCHARP], rffi.INT)
+if _WIN32:
+    _wgetenv = rffi.llexternal('_wgetenv', [rffi.CWCHARP], rffi.CWCHARP,
+                               compilation_info=eci, threadsafe=False)
+    _wputenv = rffi.llexternal('_wputenv', [rffi.CWCHARP], rffi.INT,
+                               compilation_info=eci)
+
+class EnvKeepalive:
+    pass
+envkeepalive = EnvKeepalive()
+envkeepalive.byname = {}