[pypy-svn] pypy default: Fix unicodedata on narrow unicode builds (sizeof wchar_t == 2):
amauryfa
commits-noreply at bitbucket.org
Mon Jan 24 17:52:30 CET 2011
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch:
Changeset: r41262:8e6e05173f0a
Date: 2011-01-24 17:45 +0100
http://bitbucket.org/pypy/pypy/changeset/8e6e05173f0a/
Log: Fix unicodedata on narrow unicode builds (sizeof wchar_t == 2): the
unicode database accepts chars from the entire Unicode range.
For characters outside the BMP, a surrogate pair is used and the
resulting unicode string has a length of two.
diff --git a/pypy/module/unicodedata/test/test_unicodedata.py b/pypy/module/unicodedata/test/test_unicodedata.py
--- a/pypy/module/unicodedata/test/test_unicodedata.py
+++ b/pypy/module/unicodedata/test/test_unicodedata.py
@@ -1,4 +1,5 @@
from py.test import raises, skip
+from pypy.rpython.test.tool import BaseRtypingTest, LLRtypeMixin
from pypy.conftest import gettestobjspace
from pypy.module.unicodedata import unicodedb_3_2_0, unicodedb_5_2_0
@@ -49,8 +50,6 @@
def test_cjk(self):
import sys
- if sys.maxunicode < 0x10ffff:
- skip("requires a 'wide' python build.")
import unicodedata
cases = ((0x3400, 0x4DB5),
(0x4E00, 0x9FA5))
@@ -62,21 +61,22 @@
# Test at and inside the boundary
for i in (first, first + 1, last - 1, last):
charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
- assert unicodedata.name(unichr(i)) == charname
- assert unicodedata.lookup(charname) == unichr(i)
+ char = ('\\U%08X' % i).decode('unicode-escape')
+ assert unicodedata.name(char) == charname
+ assert unicodedata.lookup(charname) == char
# Test outside the boundary
for i in first - 1, last + 1:
charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
+ char = ('\\U%08X' % i).decode('unicode-escape')
try:
- unicodedata.name(unichr(i))
- except ValueError:
- pass
+ unicodedata.name(char)
+ except ValueError, e:
+ assert e.message == 'no such name'
raises(KeyError, unicodedata.lookup, charname)
def test_bug_1704793(self): # from CPython
- import sys, unicodedata
- if sys.maxunicode == 65535:
- raises(KeyError, unicodedata.lookup, "GOTHIC LETTER FAIHU")
+ import unicodedata
+ assert unicodedata.lookup("GOTHIC LETTER FAIHU") == u'\U00010346'
def test_normalize(self):
import unicodedata
@@ -172,5 +172,23 @@
raises(KeyError, unicodedb_3_2_0.lookup, 'BENZENE RING WITH CIRCLE')
raises(KeyError, unicodedb_3_2_0.name, 9187)
+class TestTranslated(BaseRtypingTest, LLRtypeMixin):
+ def test_translated(self):
+ def f(n):
+ if n == 0:
+ return -1
+ else:
+ u = unicodedb_5_2_0.lookup("GOTHIC LETTER FAIHU")
+ return u
+ res = self.interpret(f, [1])
+ print hex(res)
+ assert res == f(1)
+ def test_code_to_unichr(self):
+ from pypy.module.unicodedata.interp_ucd import code_to_unichr
+ res = self.ll_to_unicode(self.interpret(code_to_unichr, [0x10346]))
+ assert res == u'\U00010346'
+
+
+
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -7,6 +7,9 @@
from pypy.interpreter.error import OperationError
from pypy.interpreter.typedef import TypeDef, interp_attrproperty
from pypy.rlib.rarithmetic import r_longlong
+from pypy.rlib.objectmodel import we_are_translated
+from pypy.rlib.runicode import MAXUNICODE
+import sys
from pypy.module.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
@@ -21,12 +24,84 @@
NCount = (VCount*TCount)
SCount = (LCount*NCount)
-def unichr_to_code_w(space, w_unichr):
- if not space.is_true(space.isinstance(w_unichr, space.w_unicode)):
- raise OperationError(space.w_TypeError, space.wrap('argument 1 must be unicode'))
- if not space.int_w(space.len(w_unichr)) == 1:
- raise OperationError(space.w_TypeError, space.wrap('need a single Unicode character as parameter'))
- return space.int_w(space.ord(w_unichr))
+# Since Python 2.7, the unicodedata module gives a preview of Python 3 character
+# handling: on narrow unicode builds, a surrogate pair is considered as one
+# unicode code point.
+
+# The functions below are subtly different from the ones in runicode.py.
+# When PyPy implements Python 3 they should be merged.
+
+def UNICHR(c):
+ if c <= sys.maxunicode and c <= MAXUNICODE:
+ return unichr(c)
+ else:
+ c -= 0x10000
+ return (unichr(0xD800 + (c >> 10)) +
+ unichr(0xDC00 + (c & 0x03FF)))
+
+def ORD(u):
+ assert isinstance(u, unicode)
+ if len(u) == 1:
+ return ord(u[0])
+ elif len(u) == 2:
+ ch1 = ord(u[0])
+ ch2 = ord(u[1])
+ if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+ return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
+ raise ValueError
+
+if MAXUNICODE > 0xFFFF:
+ # Target is wide build
+ def unichr_to_code_w(space, w_unichr):
+ if not space.is_true(space.isinstance(w_unichr, space.w_unicode)):
+ raise OperationError(space.w_TypeError, space.wrap(
+ 'argument 1 must be unicode'))
+
+ if not we_are_translated() and sys.maxunicode == 0xFFFF:
+ # Host CPython is narrow build, accept surrogates
+ try:
+ return ORD(space.unicode_w(w_unichr))
+ except ValueError:
+ raise OperationError(space.w_TypeError, space.wrap(
+ 'need a single Unicode character as parameter'))
+ else:
+ if not space.int_w(space.len(w_unichr)) == 1:
+ raise OperationError(space.w_TypeError, space.wrap(
+ 'need a single Unicode character as parameter'))
+ return space.int_w(space.ord(w_unichr))
+
+ def code_to_unichr(code):
+ if not we_are_translated() and sys.maxunicode == 0xFFFF:
+ # Host CPython is narrow build, generate surrogates
+ return UNICHR(code)
+ else:
+ return unichr(code)
+else:
+ # Target is narrow build
+ def unichr_to_code_w(space, w_unichr):
+ if not space.is_true(space.isinstance(w_unichr, space.w_unicode)):
+ raise OperationError(space.w_TypeError, space.wrap(
+ 'argument 1 must be unicode'))
+
+ if not we_are_translated() and sys.maxunicode > 0xFFFF:
+ # Host CPython is wide build, forbid surrogates
+ if not space.int_w(space.len(w_unichr)) == 1:
+ raise OperationError(space.w_TypeError, space.wrap(
+ 'need a single Unicode character as parameter'))
+ return space.int_w(space.ord(w_unichr))
+
+ else:
+ # Accept surrogates
+ try:
+ return ORD(space.unicode_w(w_unichr))
+ except ValueError:
+ raise OperationError(space.w_TypeError, space.wrap(
+ 'need a single Unicode character as parameter'))
+
+ def code_to_unichr(code):
+ # generate surrogates for large codes
+ return UNICHR(code)
+
class UCD(Wrappable):
def __init__(self, unicodedb):
@@ -57,15 +132,12 @@
_get_code.unwrap_spec = ['self', ObjSpace, str]
def lookup(self, space, name):
- w_code = self._get_code(space, name)
try:
- return space.call_function(space.builtin.get('unichr'), w_code)
- except OperationError, ex:
- if not ex.match(space, space.w_ValueError):
- raise
- msg = space.mod(space.wrap("result %d larger than sys.maxunicode"), w_code)
+ code = self._lookup(name.upper())
+ except KeyError:
+ msg = space.mod(space.wrap("undefined character name '%s'"), space.wrap(name))
raise OperationError(space.w_KeyError, msg)
-
+ return space.wrap(code_to_unichr(code))
lookup.unwrap_spec = ['self', ObjSpace, str]
def name(self, space, w_unichr, w_default=NoneNotWrapped):
More information about the Pypy-commit
mailing list