[pypy-svn] r48601 - in pypy/branch/more-unicode-improvements/pypy/rlib: . test
cfbolz at codespeak.net
cfbolz at codespeak.net
Mon Nov 12 16:21:22 CET 2007
Author: cfbolz
Date: Mon Nov 12 16:21:22 2007
New Revision: 48601
Modified:
pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
Log:
Add UTF-16 decoding (native, big-endian and little-endian variants) to runicode, plus tests.
Modified: pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py Mon Nov 12 16:21:22 2007
@@ -45,7 +45,7 @@
p += unichr(ordch1)
pos += 1
continue
-
+
n = utf8_code_length[ordch1]
if (pos + n > size):
if not final:
@@ -67,7 +67,7 @@
ordch2 = ord(s[pos+1])
z, two = splitter[6, 2](ordch2)
y, six = splitter[5, 3](ordch1)
- assert six == 6
+ assert six == 6
if (two != 2):
r, pos = errorhandler(errors, "utf8", "invalid data",
s, pos, pos + 2)
@@ -152,6 +152,118 @@
return u"".join(p), pos
+def str_decode_utf16(s, size, errors, final=True,
+                     errorhandler=raise_unicode_exception):
+    """Decode UTF-16 data from ``s`` in "native" byte-order mode.
+
+    Thin wrapper around str_decode_utf16_helper: forwards every argument,
+    selects byteorder "native" and drops the byte-order indicator the
+    helper reports, returning only its first two results.
+    """
+    text, position, _order = str_decode_utf16_helper(
+        s, size, errors, final=final, errorhandler=errorhandler,
+        byteorder="native")
+    return text, position
+
+def str_decode_utf16be(s, size, errors, final=True,
+                       errorhandler=raise_unicode_exception):
+    """Decode UTF-16 data from ``s``, forcing big-endian byte order.
+
+    Thin wrapper around str_decode_utf16_helper: forwards every argument,
+    selects byteorder "big" and drops the byte-order indicator the helper
+    reports, returning only its first two results.
+    """
+    text, position, _order = str_decode_utf16_helper(
+        s, size, errors, final=final, errorhandler=errorhandler,
+        byteorder="big")
+    return text, position
+
+def str_decode_utf16le(s, size, errors, final=True,
+                       errorhandler=raise_unicode_exception):
+    """Decode UTF-16 data from ``s``, forcing little-endian byte order.
+
+    Thin wrapper around str_decode_utf16_helper: forwards every argument,
+    selects byteorder "little" and drops the byte-order indicator the
+    helper reports, returning only its first two results.
+    """
+    text, position, _order = str_decode_utf16_helper(
+        s, size, errors, final=final, errorhandler=errorhandler,
+        byteorder="little")
+    return text, position
+
+def str_decode_utf16_helper(s, size, errors, final=True,
+                            errorhandler=raise_unicode_exception,
+                            byteorder="native"):
+    """Decode the first ``size`` bytes of the UTF-16 byte string ``s``.
+
+    byteorder is "native", "little" or "big"; any value other than
+    "native" or "little" is treated as big endian.  In "native" mode a
+    leading BOM selects the byte order and is skipped; without a BOM the
+    byte order of the running machine (sys.byteorder) is used.
+
+    errorhandler is called as
+        errorhandler(errors, 'utf-16', message, s, startpos, endpos)
+    and must either raise or return a (replacement, newpos) pair.  This is
+    the same six-argument convention the utf8 decoder in runicode.py uses.
+    The replacement is added to the output and decoding resumes at newpos.
+
+    When final is false, incomplete trailing input (a single odd byte, or
+    a high surrogate whose partner has not arrived yet) is not an error:
+    decoding stops in front of it so the caller can retry with more data.
+
+    Returns a triple (decoded unicode, number of bytes consumed, bo) where
+    bo is -1 for little endian, 1 for big endian and 0 when native mode
+    found no BOM.
+    """
+    bo = 0
+
+    # Index of the high and low byte inside each 16-bit unit; starts out
+    # as the native layout and may be overridden below.
+    if sys.byteorder == 'little':
+        ihi = 1
+        ilo = 0
+    else:
+        ihi = 0
+        ilo = 1
+
+    # Check for BOM marks (U+FEFF) in the input and adjust current
+    # byte order setting accordingly. In native mode, the leading BOM
+    # mark is skipped, in all other modes, it is copied to the output
+    # stream as-is (giving a ZWNBSP character).
+    q = 0
+    p = []
+    if byteorder == 'native':
+        if size >= 2:
+            bom = (ord(s[ihi]) << 8) | ord(s[ilo])
+            if sys.byteorder == 'little':
+                if bom == 0xFEFF:
+                    q += 2
+                    bo = -1
+                elif bom == 0xFFFE:
+                    q += 2
+                    bo = 1
+            else:
+                if bom == 0xFEFF:
+                    q += 2
+                    bo = 1
+                elif bom == 0xFFFE:
+                    q += 2
+                    bo = -1
+    elif byteorder == 'little':
+        bo = -1
+    else:
+        bo = 1
+    if size == 0:
+        return u'', 0, bo
+    if bo == -1:
+        # force little endian
+        ihi = 1
+        ilo = 0
+    elif bo == 1:
+        # force big endian
+        ihi = 0
+        ilo = 1
+
+    while q < size:
+        # remaining bytes at the end? (size should be even)
+        if size - q < 2:
+            if not final:
+                break
+            r, q = errorhandler(errors, 'utf-16', "truncated data",
+                                s, q, size)
+            p.append(r)
+            if size - q < 2:
+                # Nothing decodable is left; stop instead of indexing
+                # past the end of the input.
+                break
+        ch = (ord(s[q + ihi]) << 8) | ord(s[q + ilo])
+        q += 2
+        if ch < 0xD800 or ch > 0xDFFF:
+            p.append(unichr(ch))
+            continue
+        # UTF-16 code pair:
+        if size - q < 2:
+            if not final:
+                # Un-consume the surrogate so it can be decoded together
+                # with its partner once more input is available.
+                q -= 2
+                break
+            r, q = errorhandler(errors, 'utf-16', "unexpected end of data",
+                                s, q - 2, size)
+            p.append(r)
+        elif 0xD800 <= ch and ch <= 0xDBFF:
+            ch2 = (ord(s[q + ihi]) << 8) | ord(s[q + ilo])
+            q += 2
+            if 0xDC00 <= ch2 and ch2 <= 0xDFFF:
+                if MAXUNICODE < 65536:
+                    # Narrow build: keep the pair as two code units.
+                    p.append(unichr(ch))
+                    p.append(unichr(ch2))
+                else:
+                    p.append(unichr((((ch & 0x3FF) << 10) |
+                                     (ch2 & 0x3FF)) + 0x10000))
+            else:
+                # Only the high surrogate is reported; the handler's
+                # newpos decides whether ch2 is decoded again.
+                r, q = errorhandler(errors, 'utf-16',
+                                    "illegal UTF-16 surrogate",
+                                    s, q - 4, q - 2)
+                p.append(r)
+        else:
+            # Low surrogate without a preceding high surrogate.
+            r, q = errorhandler(errors, 'utf-16', "illegal encoding",
+                                s, q - 2, q)
+            p.append(r)
+    return u"".join(p), q, bo
+
def str_decode_latin1(s, size, errors, final=False,
errorhandler=raise_unicode_exception):
# latin1 is equivalent to the first 256 ordinals in Unicode.
Modified: pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py Mon Nov 12 16:21:22 2007
@@ -1,3 +1,4 @@
+import sys, random
from pypy.rlib import runicode
class UnicodeTests(object):
@@ -6,7 +7,7 @@
assert type(x) is type(y)
def checkdecode(self, s, encoding):
- decoder = getattr(runicode, "str_decode_%s" % encoding)
+ decoder = getattr(runicode, "str_decode_%s" % encoding.replace("-", ""))
if isinstance(s, str):
trueresult = s.decode(encoding)
else:
@@ -26,7 +27,13 @@
def test_all_first_256(self):
for i in range(256):
- for encoding in "utf8 latin1".split():
+ for encoding in "utf8 latin1 utf16 utf-16-be utf-16-le".split():
+ self.checkdecode(unichr(i), encoding)
+
+    def test_random(self):
+        """Check decoding of 10000 randomly chosen code points per codec."""
+        for i in range(10000):
+            v = random.randrange(sys.maxunicode)
+            if 0xd800 <= v <= 0xdfff:
+                # A lone surrogate is not a complete character in UTF-16,
+                # so it cannot be expected to round-trip; skip it.
+                continue
+            uni = unichr(v)
+            for encoding in "utf8 utf16 utf-16-be utf-16-le".split():
+                # Decode the random character, not unichr(i).
+                self.checkdecode(uni, encoding)
def test_single_chars_utf8(self):
More information about the Pypy-commit
mailing list