[pypy-commit] pypy utf8-unicode2: Fix array

Tue Jul 29 16:17:00 CEST 2014

Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72606:60fb6d7660b0
Date: 2014-07-19 10:21 -0500
http://bitbucket.org/pypy/pypy/changeset/60fb6d7660b0/

Log:	Fix array

diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -7,15 +7,18 @@
 
 wchar_rint = rffi.r_uint
 WCHAR_INTP = rffi.UINTP
+WCHAR_INT = rffi.UINT
 if rffi.sizeof(rffi.WCHAR_T) == 2:
     wchar_rint = rffi.r_ushort
     WCHAR_INTP = rffi.USHORTP
+    WCHAR_INT = rffi.USHORT
 
 
-def utf8chr(value):
+def utf8chr(value, allow_large_codepoints=False):
     # Like unichr, but returns a Utf8Str object
+    # TODO: Do this without the builder so it's faster
     b = Utf8Builder()
-    b.append(value)
+    b.append(value, allow_large_codepoints=allow_large_codepoints)
     return b.build()
 
 def utf8ord_bytes(bytes, start):
@@ -550,7 +553,7 @@
 
 
     @specialize.argtype(1)
-    def append(self, c):
+    def append(self, c, allow_large_codepoints=False):
         if isinstance(c, int) or isinstance(c, r_uint):
             if c < 0x80:
                 self._builder.append(chr(c))
@@ -563,7 +566,7 @@
                 self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
                 self._builder.append(chr(0x80 | (c & 0x3F)))
                 self._is_ascii = False
-            elif c <= 0x10FFFF:
+            elif c <= 0x10FFFF or allow_large_codepoints:
                 self._builder.append(chr(0xF0 | (c >> 18)))
                 self._builder.append(chr(0x80 | (c >> 12 & 0x3F)))
                 self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -3,7 +3,7 @@
 from rpython.rlib import jit
 from rpython.rlib.buffer import Buffer
 from rpython.rlib.objectmodel import keepalive_until_here
-from rpython.rlib.rarithmetic import ovfcheck, widen
+from rpython.rlib.rarithmetic import ovfcheck, widen, intmask
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.rtyper.annlowlevel import llstr
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -15,6 +15,8 @@
     interp2app, interpindirect2app, unwrap_spec)
 from pypy.interpreter.typedef import (
     GetSetProperty, TypeDef, make_weakref_descr)
+from pypy.interpreter import utf8
+from pypy.interpreter.utf8 import Utf8Str, utf8ord, utf8chr
 from pypy.module._file.interp_file import W_File
 from pypy.objspace.std.floatobject import W_FloatObject
 
@@ -314,7 +316,7 @@
         """
         if self.typecode == 'u':
             buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned())
-            return space.wrap(rffi.wcharpsize2unicode(buf, self.len))
+            return space.wrap(Utf8Str.from_wcharpsize(buf, self.len))
         else:
             msg = "tounicode() may only be called on type 'u' arrays"
             raise OperationError(space.w_ValueError, space.wrap(msg))
@@ -570,7 +572,7 @@
 
 types = {
     'c': TypeCode(lltype.Char,        'str_w', method=''),
-    'u': TypeCode(lltype.UniChar,     'unicode_w', method=''),
+    'u': TypeCode(utf8.WCHAR_INT,     'unicode_w', method=''),
     'b': TypeCode(rffi.SIGNEDCHAR,    'int_w', True, True),
     'B': TypeCode(rffi.UCHAR,         'int_w', True),
     'h': TypeCode(rffi.SHORT,         'int_w', True, True),
@@ -670,7 +672,10 @@
                 if len(item) != 1:
                     msg = 'array item must be char'
                     raise OperationError(space.w_TypeError, space.wrap(msg))
-                item = item[0]
+                if mytype.unwrap == 'str_w':
+                    item = item[0]
+                else:
+                    item = utf8ord(item)
                 return rffi.cast(mytype.itemtype, item)
             #
             # "regular" case: it fits in an rpython integer (lltype.Signed)
@@ -791,6 +796,9 @@
                 item = rffi.cast(lltype.Signed, item)
             elif mytype.typecode == 'f':
                 item = float(item)
+            elif mytype.typecode == 'u':
+                # TODO: Does this need special handling for 16-bit wchar_t?
+                item = utf8chr(intmask(item), allow_large_codepoints=True)
             return space.wrap(item)
 
         # interface