[pypy-svn] pypy default: Fix unicodedata on narrow unicode builds (sizeof(wchar_t) == 2):

Mon Jan 24 17:52:30 CET 2011

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r41262:8e6e05173f0a
Date: 2011-01-24 17:45 +0100
http://bitbucket.org/pypy/pypy/changeset/8e6e05173f0a/

Log:	Fix unicodedata on narrow unicode builds (sizeof(wchar_t) == 2): the
	unicode database accepts characters from the entire Unicode range.

	For characters outside the BMP, a surrogate pair is used and the
	resulting unicode string has a length of two.

diff --git a/pypy/module/unicodedata/test/test_unicodedata.py b/pypy/module/unicodedata/test/test_unicodedata.py
--- a/pypy/module/unicodedata/test/test_unicodedata.py
+++ b/pypy/module/unicodedata/test/test_unicodedata.py
@@ -1,4 +1,5 @@
 from py.test import raises, skip
+from pypy.rpython.test.tool import BaseRtypingTest, LLRtypeMixin
 from pypy.conftest import gettestobjspace
 
 from pypy.module.unicodedata import unicodedb_3_2_0, unicodedb_5_2_0
@@ -49,8 +50,6 @@
 
     def test_cjk(self):
         import sys
-        if sys.maxunicode < 0x10ffff:
-            skip("requires a 'wide' python build.")
         import unicodedata
         cases = ((0x3400, 0x4DB5),
                  (0x4E00, 0x9FA5))
@@ -62,21 +61,22 @@
             # Test at and inside the boundary
             for i in (first, first + 1, last - 1, last):
                 charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
-                assert unicodedata.name(unichr(i)) == charname
-                assert unicodedata.lookup(charname) == unichr(i)
+                char = ('\\U%08X' % i).decode('unicode-escape')
+                assert unicodedata.name(char) == charname
+                assert unicodedata.lookup(charname) == char
             # Test outside the boundary
             for i in first - 1, last + 1:
                 charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
+                char = ('\\U%08X' % i).decode('unicode-escape')
                 try:
-                    unicodedata.name(unichr(i))
-                except ValueError:
-                    pass
+                    unicodedata.name(char)
+                except ValueError, e:
+                    assert e.message == 'no such name'
                 raises(KeyError, unicodedata.lookup, charname)
 
     def test_bug_1704793(self): # from CPython
-        import sys, unicodedata
-        if sys.maxunicode == 65535:
-            raises(KeyError, unicodedata.lookup, "GOTHIC LETTER FAIHU")
+        import unicodedata
+        assert unicodedata.lookup("GOTHIC LETTER FAIHU") == u'\U00010346'
 
     def test_normalize(self):
         import unicodedata
@@ -172,5 +172,23 @@
         raises(KeyError, unicodedb_3_2_0.lookup, 'BENZENE RING WITH CIRCLE')
         raises(KeyError, unicodedb_3_2_0.name, 9187)
 
+class TestTranslated(BaseRtypingTest, LLRtypeMixin):
 
+    def test_translated(self):
+        def f(n):
+            if n == 0:
+                return -1
+            else:
+                u = unicodedb_5_2_0.lookup("GOTHIC LETTER FAIHU")
+                return u
+        res = self.interpret(f, [1])
+        print hex(res)
+        assert res == f(1)
 
+    def test_code_to_unichr(self):
+        from pypy.module.unicodedata.interp_ucd import code_to_unichr
+        res = self.ll_to_unicode(self.interpret(code_to_unichr, [0x10346]))
+        assert res == u'\U00010346'
+
+
+

diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -7,6 +7,9 @@
 from pypy.interpreter.error import OperationError
 from pypy.interpreter.typedef import TypeDef, interp_attrproperty
 from pypy.rlib.rarithmetic import r_longlong
+from pypy.rlib.objectmodel import we_are_translated
+from pypy.rlib.runicode import MAXUNICODE
+import sys
 
 from pypy.module.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
 
@@ -21,12 +24,84 @@
 NCount = (VCount*TCount)
 SCount = (LCount*NCount)
 
-def unichr_to_code_w(space, w_unichr):
-    if not space.is_true(space.isinstance(w_unichr, space.w_unicode)):
-        raise OperationError(space.w_TypeError, space.wrap('argument 1 must be unicode'))
-    if not space.int_w(space.len(w_unichr)) == 1:
-        raise OperationError(space.w_TypeError, space.wrap('need a single Unicode character as parameter'))
-    return space.int_w(space.ord(w_unichr))
+# Since Python2.7, the unicodedata module gives a preview of Python3 character
+# handling: on narrow unicode builds, a surrogate pair is considered as one
+# unicode code point.
+
+# The functions below are subtly different from the ones in runicode.py.
+# When PyPy implements Python 3 they should be merged.
+
+def UNICHR(c):
+    if c <= sys.maxunicode and c <= MAXUNICODE:
+        return unichr(c)
+    else:
+        c -= 0x10000
+        return (unichr(0xD800 + (c >> 10)) +
+                unichr(0xDC00 + (c & 0x03FF)))
+
+def ORD(u):
+    assert isinstance(u, unicode)
+    if len(u) == 1:
+        return ord(u[0])
+    elif len(u) == 2:
+        ch1 = ord(u[0])
+        ch2 = ord(u[1])
+        if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+            return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
+    raise ValueError
+
+if MAXUNICODE > 0xFFFF:
+    # Target is wide build
+    def unichr_to_code_w(space, w_unichr):
+        if not space.is_true(space.isinstance(w_unichr, space.w_unicode)):
+            raise OperationError(space.w_TypeError, space.wrap(
+                'argument 1 must be unicode'))
+
+        if not we_are_translated() and sys.maxunicode == 0xFFFF:
+            # Host CPython is narrow build, accept surrogates
+            try:
+                return ORD(space.unicode_w(w_unichr))
+            except ValueError:
+                raise OperationError(space.w_TypeError, space.wrap(
+                    'need a single Unicode character as parameter'))
+        else:
+            if not space.int_w(space.len(w_unichr)) == 1:
+                raise OperationError(space.w_TypeError, space.wrap(
+                    'need a single Unicode character as parameter'))
+            return space.int_w(space.ord(w_unichr))
+
+    def code_to_unichr(code):
+        if not we_are_translated() and sys.maxunicode == 0xFFFF:
+            # Host CPython is narrow build, generate surrogates
+            return UNICHR(code)
+        else:
+            return unichr(code)
+else:
+    # Target is narrow build
+    def unichr_to_code_w(space, w_unichr):
+        if not space.is_true(space.isinstance(w_unichr, space.w_unicode)):
+            raise OperationError(space.w_TypeError, space.wrap(
+                'argument 1 must be unicode'))
+
+        if not we_are_translated() and sys.maxunicode > 0xFFFF:
+            # Host CPython is wide build, forbid surrogates
+            if not space.int_w(space.len(w_unichr)) == 1:
+                raise OperationError(space.w_TypeError, space.wrap(
+                    'need a single Unicode character as parameter'))
+            return space.int_w(space.ord(w_unichr))
+
+        else:
+            # Accept surrogates
+            try:
+                return ORD(space.unicode_w(w_unichr))
+            except ValueError:
+                raise OperationError(space.w_TypeError, space.wrap(
+                    'need a single Unicode character as parameter'))
+
+    def code_to_unichr(code):
+        # generate surrogates for large codes
+        return UNICHR(code)
+
 
 class UCD(Wrappable):
     def __init__(self, unicodedb):
@@ -57,15 +132,12 @@
     _get_code.unwrap_spec = ['self', ObjSpace, str]
     
     def lookup(self, space, name):
-        w_code = self._get_code(space, name)
         try:
-            return space.call_function(space.builtin.get('unichr'), w_code)
-        except OperationError, ex:
-            if not ex.match(space, space.w_ValueError):
-                raise
-            msg = space.mod(space.wrap("result %d larger than sys.maxunicode"), w_code)
+            code = self._lookup(name.upper())
+        except KeyError:
+            msg = space.mod(space.wrap("undefined character name '%s'"), space.wrap(name))
             raise OperationError(space.w_KeyError, msg)
-
+        return space.wrap(code_to_unichr(code))
     lookup.unwrap_spec = ['self', ObjSpace, str]
 
     def name(self, space, w_unichr, w_default=NoneNotWrapped):