[pypy-svn] r75737 - in pypy/trunk/pypy: interpreter interpreter/pyparser module/_codecs module/_codecs/test objspace/std rlib
afa at codespeak.net
afa at codespeak.net
Thu Jul 1 22:24:33 CEST 2010
Author: afa
Date: Thu Jul 1 22:24:32 2010
New Revision: 75737
Removed:
pypy/trunk/pypy/module/_codecs/app_codecs.py
Modified:
pypy/trunk/pypy/interpreter/pyparser/parsestring.py
pypy/trunk/pypy/interpreter/unicodehelper.py
pypy/trunk/pypy/module/_codecs/__init__.py
pypy/trunk/pypy/module/_codecs/interp_codecs.py
pypy/trunk/pypy/module/_codecs/test/test_codecs.py
pypy/trunk/pypy/objspace/std/marshal_impl.py
pypy/trunk/pypy/objspace/std/unicodeobject.py
pypy/trunk/pypy/rlib/rstring.py
pypy/trunk/pypy/rlib/runicode.py
Log:
Merge branch/interplevel-codecs:
Rewrite all the codecs previously implemented at applevel, and
move most of them to rlib.runicode, where they may be useful for RPython programs.
- This fixes translation with -O0: this option disables geninterp,
and uses the pypy compiler instead to compile applevel code.
But unicode literals need to be decoded with the codec module...
- This also removes some huge geninterp'd code: app_codecs.py used to contain 3
or 4 functions each rendered with 20000 lines of C code!
- Also use StringBuilder everywhere.
Modified: pypy/trunk/pypy/interpreter/pyparser/parsestring.py
==============================================================================
--- pypy/trunk/pypy/interpreter/pyparser/parsestring.py (original)
+++ pypy/trunk/pypy/interpreter/pyparser/parsestring.py Thu Jul 1 22:24:32 2010
@@ -72,12 +72,12 @@
bufp = 0
bufq = len(buf)
assert 0 <= bufp <= bufq
- w_substr = space.wrap(buf[bufp : bufq])
+ substr = buf[bufp:bufq]
if rawmode:
- w_v = unicodehelper.PyUnicode_DecodeRawUnicodeEscape(space, w_substr)
+ v = unicodehelper.PyUnicode_DecodeRawUnicodeEscape(space, substr)
else:
- w_v = unicodehelper.PyUnicode_DecodeUnicodeEscape(space, w_substr)
- return w_v
+ v = unicodehelper.PyUnicode_DecodeUnicodeEscape(space, substr)
+ return space.wrap(v)
need_encoding = (encoding is not None and
encoding != "utf-8" and encoding != "iso-8859-1")
@@ -86,7 +86,7 @@
substr = s[ps : q]
if rawmode or '\\' not in s[ps:]:
if need_encoding:
- w_u = unicodehelper.PyUnicode_DecodeUTF8(space, space.wrap(substr))
+ w_u = space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, substr))
#w_v = space.wrap(space.unwrap(w_u).encode(encoding)) this works
w_v = unicodehelper.PyUnicode_AsEncodedString(space, w_u, space.wrap(encoding))
return w_v
@@ -96,7 +96,7 @@
enc = None
if need_encoding:
enc = encoding
- v = PyString_DecodeEscape(space, substr, unicode, enc)
+ v = PyString_DecodeEscape(space, substr, enc)
return space.wrap(v)
def hexbyte(val):
@@ -105,10 +105,9 @@
result = "0" + result
return result
-def PyString_DecodeEscape(space, s, unicode, recode_encoding):
+def PyString_DecodeEscape(space, s, recode_encoding):
"""
- Unescape a backslash-escaped string. If unicode is non-zero,
- the string is a u-literal. If recode_encoding is non-zero,
+ Unescape a backslash-escaped string. If recode_encoding is non-zero,
the string is UTF-8 encoded and should be re-encoded in the
specified encoding.
"""
@@ -171,9 +170,6 @@
raise_app_valueerror(space, 'invalid \\x escape')
# ignored replace and ignore for now
- elif unicode and (ch == 'u' or ch == 'U' or ch == 'N'):
- raise_app_valueerror(space, 'Unicode escapes not legal '
- 'when Unicode disabled')
else:
# this was not an escape, so the backslash
# has to be added, and we start over in
@@ -200,7 +196,7 @@
# while (s < end && *s != '\\') s++; */ /* inefficient for u".."
while ps < end and ord(s[ps]) & 0x80:
ps += 1
- w_u = unicodehelper.PyUnicode_DecodeUTF8(space, space.wrap(s[pt : ps]))
+ w_u = space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, s[pt:ps]))
w_v = unicodehelper.PyUnicode_AsEncodedString(space, w_u, space.wrap(encoding))
v = space.str_w(w_v)
return v, ps
Modified: pypy/trunk/pypy/interpreter/unicodehelper.py
==============================================================================
--- pypy/trunk/pypy/interpreter/unicodehelper.py (original)
+++ pypy/trunk/pypy/interpreter/unicodehelper.py Thu Jul 1 22:24:32 2010
@@ -1,30 +1,10 @@
-from pypy.interpreter import gateway
+from pypy.module._codecs import interp_codecs
-app = gateway.applevel(r'''
- def PyUnicode_DecodeUnicodeEscape(data):
- import _codecs
- return _codecs.unicode_escape_decode(data)[0]
+def PyUnicode_AsEncodedString(space, w_data, w_encoding):
+ return interp_codecs.encode(space, w_data, w_encoding)
- def PyUnicode_DecodeRawUnicodeEscape(data):
- import _codecs
- return _codecs.raw_unicode_escape_decode(data)[0]
-
- def PyUnicode_DecodeUTF8(data):
- import _codecs
- return _codecs.utf_8_decode(data)[0]
-
- def PyUnicode_AsEncodedString(data, encoding):
- import _codecs
- return _codecs.encode(data, encoding)
-
- def PyUnicode_EncodeUTF8(data):
- import _codecs
- return _codecs.utf_8_encode(data)[0]
-
-''')
-
-PyUnicode_DecodeUnicodeEscape = app.interphook('PyUnicode_DecodeUnicodeEscape')
-PyUnicode_DecodeRawUnicodeEscape = app.interphook('PyUnicode_DecodeRawUnicodeEscape')
-PyUnicode_DecodeUTF8 = app.interphook('PyUnicode_DecodeUTF8')
-PyUnicode_AsEncodedString = app.interphook('PyUnicode_AsEncodedString')
-PyUnicode_EncodeUTF8 = app.interphook('PyUnicode_EncodeUTF8')
+# These functions take and return unwrapped rpython strings and unicodes
+PyUnicode_DecodeUnicodeEscape = interp_codecs.make_raw_decoder('unicode_escape')
+PyUnicode_DecodeRawUnicodeEscape = interp_codecs.make_raw_decoder('raw_unicode_escape')
+PyUnicode_DecodeUTF8 = interp_codecs.make_raw_decoder('utf_8')
+PyUnicode_EncodeUTF8 = interp_codecs.make_raw_encoder('utf_8')
Modified: pypy/trunk/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/__init__.py (original)
+++ pypy/trunk/pypy/module/_codecs/__init__.py Thu Jul 1 22:24:32 2010
@@ -3,22 +3,43 @@
from pypy.module._codecs import interp_codecs
class Module(MixedModule):
- appleveldefs = {
- '__doc__' : 'app_codecs.__doc__',
- '__name__' : 'app_codecs.__name__',
- 'charmap_encode' : 'app_codecs.charmap_encode',
- 'escape_decode' : 'app_codecs.escape_decode',
- 'escape_encode' : 'app_codecs.escape_encode',
- 'raw_unicode_escape_decode' : 'app_codecs.raw_unicode_escape_decode',
- 'raw_unicode_escape_encode' : 'app_codecs.raw_unicode_escape_encode',
- 'unicode_escape_decode' : 'app_codecs.unicode_escape_decode',
- 'unicode_escape_encode' : 'app_codecs.unicode_escape_encode',
- 'unicode_internal_decode' : 'app_codecs.unicode_internal_decode',
- 'unicode_internal_encode' : 'app_codecs.unicode_internal_encode',
- 'utf_7_decode' : 'app_codecs.utf_7_decode',
- 'utf_7_encode' : 'app_codecs.utf_7_encode',
- 'charmap_build' : 'app_codecs.charmap_build'
- }
+ """
+ _codecs -- Provides access to the codec registry and the builtin
+ codecs.
+
+ This module should never be imported directly. The standard library
+ module "codecs" wraps this builtin module for use within Python.
+
+ The codec registry is accessible via:
+
+ register(search_function) -> None
+
+ lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
+
+ The builtin Unicode codecs use the following interface:
+
+ <encoding>_encode(Unicode_object[,errors='strict']) ->
+ (string object, bytes consumed)
+
+ <encoding>_decode(char_buffer_obj[,errors='strict']) ->
+ (Unicode object, bytes consumed)
+
+ <encoding>_encode() interfaces also accept non-Unicode object as
+ input. The objects are then converted to Unicode using
+ PyUnicode_FromObject() prior to applying the conversion.
+
+ These <encoding>s are available: utf_8, unicode_escape,
+ raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
+ mbcs (on win32).
+
+
+Written by Marc-Andre Lemburg (mal at lemburg.com).
+
+Copyright (c) Corporation for National Research Initiatives.
+"""
+
+ appleveldefs = {}
+
interpleveldefs = {
'encode': 'interp_codecs.encode',
'decode': 'interp_codecs.decode',
@@ -26,12 +47,15 @@
'lookup_error': 'interp_codecs.lookup_error',
'register': 'interp_codecs.register_codec',
'register_error': 'interp_codecs.register_error',
+ 'charmap_build' : 'interp_codecs.charmap_build',
# encoders and decoders
'ascii_decode' : 'interp_codecs.ascii_decode',
'ascii_encode' : 'interp_codecs.ascii_encode',
'latin_1_decode' : 'interp_codecs.latin_1_decode',
'latin_1_encode' : 'interp_codecs.latin_1_encode',
+ 'utf_7_decode' : 'interp_codecs.utf_7_decode',
+ 'utf_7_encode' : 'interp_codecs.utf_7_encode',
'utf_8_decode' : 'interp_codecs.utf_8_decode',
'utf_8_encode' : 'interp_codecs.utf_8_encode',
'utf_16_be_decode' : 'interp_codecs.utf_16_be_decode',
@@ -44,6 +68,15 @@
'charbuffer_encode': 'interp_codecs.buffer_encode',
'readbuffer_encode': 'interp_codecs.buffer_encode',
'charmap_decode' : 'interp_codecs.charmap_decode',
+ 'charmap_encode' : 'interp_codecs.charmap_encode',
+ 'escape_encode' : 'interp_codecs.escape_encode',
+ 'escape_decode' : 'interp_codecs.escape_decode',
+ 'unicode_escape_decode' : 'interp_codecs.unicode_escape_decode',
+ 'unicode_escape_encode' : 'interp_codecs.unicode_escape_encode',
+ 'raw_unicode_escape_decode' : 'interp_codecs.raw_unicode_escape_decode',
+ 'raw_unicode_escape_encode' : 'interp_codecs.raw_unicode_escape_encode',
+ 'unicode_internal_decode' : 'interp_codecs.unicode_internal_decode',
+ 'unicode_internal_encode' : 'interp_codecs.unicode_internal_encode',
}
def __init__(self, space, *args):
Modified: pypy/trunk/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/interp_codecs.py (original)
+++ pypy/trunk/pypy/module/_codecs/interp_codecs.py Thu Jul 1 22:24:32 2010
@@ -1,5 +1,6 @@
from pypy.interpreter.error import OperationError, operationerrfmt
-from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped, applevel
+from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped, interp2app
+from pypy.interpreter.gateway import unwrap_spec
from pypy.interpreter.baseobjspace import W_Root
from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
from pypy.rlib.objectmodel import we_are_translated
@@ -13,6 +14,8 @@
self.decode_error_handler = self.make_errorhandler(space, True)
self.encode_error_handler = self.make_errorhandler(space, False)
+ self.unicodedata_handler = None
+
def make_errorhandler(self, space, decode):
def unicode_call_errorhandler(errors, encoding, reason, input,
startpos, endpos):
@@ -53,6 +56,21 @@
return replace, newpos
return unicode_call_errorhandler
+ def get_unicodedata_handler(self, space):
+ if self.unicodedata_handler:
+ return self.unicodedata_handler
+ try:
+ w_builtin = space.getbuiltinmodule('__builtin__')
+ w_import = space.getattr(w_builtin, space.wrap("__import__"))
+ w_unicodedata = space.call_function(w_import,
+ space.wrap("unicodedata"))
+ w_getcode = space.getattr(w_unicodedata, space.wrap("_get_code"))
+ except OperationError:
+ return None
+ else:
+ self.unicodedata_handler = UnicodeData_Handler(space, w_getcode)
+ return self.unicodedata_handler
+
def _freeze_(self):
assert not self.codec_search_path
return False
@@ -114,78 +132,125 @@
"unknown encoding: %s", encoding)
lookup_codec.unwrap_spec = [ObjSpace, str]
-app_errors = applevel("""
-def check_exception(exc):
+# ____________________________________________________________
+# Register standard error handlers
+
+def check_exception(space, w_exc):
try:
- delta = exc.end - exc.start
- if delta < 0 or not isinstance(exc.object, (unicode, str)):
- raise TypeError("wrong exception")
- except AttributeError:
- raise TypeError("wrong exception")
-
-def strict_errors(exc):
- if isinstance(exc, Exception):
- raise exc
- else:
- raise TypeError("codec must pass exception instance")
-
-def ignore_errors(exc):
- check_exception(exc)
- if isinstance(exc, UnicodeEncodeError):
- return u'', exc.end
- elif isinstance(exc, (UnicodeDecodeError, UnicodeTranslateError)):
- return u'', exc.end
- else:
- raise TypeError("don't know how to handle %.400s in error callback"%exc)
-
-Py_UNICODE_REPLACEMENT_CHARACTER = u"\ufffd"
-
-def replace_errors(exc):
- check_exception(exc)
- if isinstance(exc, UnicodeEncodeError):
- return u'?'*(exc.end-exc.start), exc.end
- elif isinstance(exc, (UnicodeTranslateError, UnicodeDecodeError)):
- return Py_UNICODE_REPLACEMENT_CHARACTER*(exc.end-exc.start), exc.end
- else:
- raise TypeError("don't know how to handle %.400s in error callback"%exc)
-
-def xmlcharrefreplace_errors(exc):
- if isinstance(exc, UnicodeEncodeError):
- res = []
- for ch in exc.object[exc.start:exc.end]:
- res += '&#'
- res += str(ord(ch))
- res += ';'
- return u''.join(res), exc.end
- else:
- raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
-
-def backslashreplace_errors(exc):
- if isinstance(exc, UnicodeEncodeError):
- p = []
- for c in exc.object[exc.start:exc.end]:
- p += '\\\\'
- oc = ord(c)
- if (oc >= 0x00010000):
- p += 'U'
- p += "%.8x" % ord(c)
+ w_start = space.getattr(w_exc, space.wrap('start'))
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ w_obj = space.getattr(w_exc, space.wrap('object'))
+ except OperationError, e:
+ if not e.match(space, space.w_AttributeError):
+ raise
+ raise OperationError(space.w_TypeError, space.wrap(
+ "wrong exception"))
+
+ delta = space.int_w(w_end) - space.int_w(w_start)
+ if delta < 0 or not (space.isinstance_w(w_obj, space.w_str) or
+ space.isinstance_w(w_obj, space.w_unicode)):
+ raise OperationError(space.w_TypeError, space.wrap(
+ "wrong exception"))
+
+def strict_errors(space, w_exc):
+ check_exception(space, w_exc)
+ if space.isinstance_w(w_exc, space.w_BaseException):
+ raise OperationError(space.type(w_exc), w_exc)
+ else:
+ raise OperationError(space.w_TypeError, space.wrap(
+ "codec must pass exception instance"))
+
+def ignore_errors(space, w_exc):
+ check_exception(space, w_exc)
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+ return space.newtuple([space.wrap(''), w_end])
+ elif (space.isinstance_w(w_exc, space.w_UnicodeDecodeError) or
+ space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+ return space.newtuple([space.wrap(u''), w_end])
+ else:
+ typename = space.type(w_exc).getname(space, '?')
+ raise operationerrfmt(space.w_TypeError,
+ "don't know how to handle %s in error callback", typename)
+
+def replace_errors(space, w_exc):
+ check_exception(space, w_exc)
+ w_start = space.getattr(w_exc, space.wrap('start'))
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ size = space.int_w(w_end) - space.int_w(w_start)
+ if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+ text = '?' * size
+ return space.newtuple([space.wrap(text), w_end])
+ elif (space.isinstance_w(w_exc, space.w_UnicodeDecodeError) or
+ space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+ text = u'\ufffd' * size
+ return space.newtuple([space.wrap(text), w_end])
+ else:
+ typename = space.type(w_exc).getname(space, '?')
+ raise operationerrfmt(space.w_TypeError,
+ "don't know how to handle %s in error callback", typename)
+
+def xmlcharrefreplace_errors(space, w_exc):
+ check_exception(space, w_exc)
+ if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+ obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
+ start = space.int_w(space.getattr(w_exc, space.wrap('start')))
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ end = space.int_w(w_end)
+ builder = UnicodeBuilder()
+ pos = start
+ while pos < end:
+ ch = obj[pos]
+ builder.append(u"&#")
+ builder.append(unicode(str(ord(ch))))
+ builder.append(u";")
+ pos += 1
+ return space.newtuple([space.wrap(builder.build()), w_end])
+ else:
+ typename = space.type(w_exc).getname(space, '?')
+ raise operationerrfmt(space.w_TypeError,
+ "don't know how to handle %s in error callback", typename)
+
+def backslashreplace_errors(space, w_exc):
+ check_exception(space, w_exc)
+ if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+ obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
+ start = space.int_w(space.getattr(w_exc, space.wrap('start')))
+ w_end = space.getattr(w_exc, space.wrap('end'))
+ end = space.int_w(w_end)
+ builder = UnicodeBuilder()
+ pos = start
+ while pos < end:
+ oc = ord(obj[pos])
+ num = hex(oc)
+ if (oc >= 0x10000):
+ builder.append(u"\\U")
+ zeros = 8
elif (oc >= 0x100):
- p += 'u'
- p += "%.4x" % ord(c)
+ builder.append(u"\\u")
+ zeros = 4
else:
- p += 'x'
- p += "%.2x" % ord(c)
- return u''.join(p), exc.end
- else:
- raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
-""")
+ builder.append(u"\\x")
+ zeros = 2
+ lnum = len(num)
+ nb = zeros + 2 - lnum # num starts with '0x'
+ if nb > 0:
+ builder.append_multiple_char(u'0', nb)
+ builder.append_slice(unicode(num), 2, lnum)
+ pos += 1
+ return space.newtuple([space.wrap(builder.build()), w_end])
+ else:
+ typename = space.type(w_exc).getname(space, '?')
+ raise operationerrfmt(space.w_TypeError,
+ "don't know how to handle %s in error callback", typename)
def register_builtin_error_handlers(space):
+ "NOT_RPYTHON"
state = space.fromcache(CodecState)
for error in ("strict", "ignore", "replace", "xmlcharrefreplace",
"backslashreplace"):
name = error + "_errors"
- state.codec_error_registry[error] = app_errors.wget(space, name)
+ state.codec_error_registry[error] = space.wrap(interp2app(globals()[name]))
def lookup_error(space, errors):
@@ -279,6 +344,38 @@
from pypy.rlib import runicode
+def make_raw_encoder(name):
+ rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
+ assert hasattr(runicode, rname)
+ def raw_encoder(space, uni):
+ state = space.fromcache(CodecState)
+ func = getattr(runicode, rname)
+ errors = "strict"
+ return func(uni, len(uni), errors, state.encode_error_handler)
+ raw_encoder.func_name = rname
+ return raw_encoder
+
+def make_raw_decoder(name):
+ rname = "str_decode_%s" % (name.replace("_decode", ""), )
+ assert hasattr(runicode, rname)
+ def raw_decoder(space, string):
+ final = True
+ errors = "strict"
+ state = space.fromcache(CodecState)
+ func = getattr(runicode, rname)
+ kwargs = {}
+ if name == 'unicode_escape':
+ unicodedata_handler = state.get_unicodedata_handler(space)
+ result, consumed = func(string, len(string), errors,
+ final, state.decode_error_handler,
+ unicodedata_handler=unicodedata_handler)
+ else:
+ result, consumed = func(string, len(string), errors,
+ final, state.decode_error_handler)
+ return result
+ raw_decoder.func_name = rname
+ return raw_decoder
+
def make_encoder_wrapper(name):
rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
assert hasattr(runicode, rname)
@@ -308,20 +405,26 @@
for encoders in [
"ascii_encode",
"latin_1_encode",
+ "utf_7_encode",
"utf_8_encode",
"utf_16_encode",
"utf_16_be_encode",
"utf_16_le_encode",
+ "unicode_escape_encode",
+ "raw_unicode_escape_encode",
+ "unicode_internal_encode",
]:
make_encoder_wrapper(encoders)
for decoders in [
"ascii_decode",
"latin_1_decode",
+ "utf_7_decode",
"utf_8_decode",
"utf_16_decode",
"utf_16_be_decode",
"utf_16_le_decode",
+ "raw_unicode_escape_decode",
]:
make_decoder_wrapper(decoders)
@@ -330,8 +433,6 @@
make_decoder_wrapper('mbcs_decode')
def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=False):
- """None
- """
final = space.is_true(w_final)
state = space.fromcache(CodecState)
if byteorder == 0:
@@ -349,77 +450,213 @@
space.wrap(byteorder)])
utf_16_ex_decode.unwrap_spec = [ObjSpace, str, str, int, W_Root]
-def _extract_from_mapping(space, mapping_w, w_mapping, ch):
- if mapping_w is not None:
+# ____________________________________________________________
+# Charmap
+
+class Charmap_Decode:
+ def __init__(self, space, w_mapping):
+ self.space = space
+ self.w_mapping = w_mapping
+
+ # fast path for all the stuff in the encodings module
+ if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
+ self.mapping_w = space.fixedview(w_mapping)
+ else:
+ self.mapping_w = None
+
+ def get(self, ch, errorchar):
+ space = self.space
+
+ # get the character from the mapping
+ if self.mapping_w is not None:
+ w_ch = self.mapping_w[ord(ch)]
+ else:
+ try:
+ w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+ except OperationError, e:
+ if not e.match(space, space.w_LookupError):
+ raise
+ return errorchar
+
+ # Charmap may return a unicode string
try:
- return mapping_w[ord(ch)]
- except IndexError:
- pass
- else:
+ x = space.unicode_w(w_ch)
+ except OperationError, e:
+ if not e.match(space, space.w_TypeError):
+ raise
+ else:
+ return x
+
+ # Charmap may return a number
+ try:
+ x = space.int_w(w_ch)
+ except OperationError:
+ if not e.match(space, space.w_TypeError):
+ raise
+ else:
+ if 0 <= x < 65536: # Even on wide unicode builds...
+ return unichr(x)
+ else:
+ raise OperationError(space.w_TypeError, space.wrap(
+ "character mapping must be in range(65536)"))
+
+ # Charmap may return None
+ if space.is_w(w_ch, space.w_None):
+ return errorchar
+
+ raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
+
+class Charmap_Encode:
+ def __init__(self, space, w_mapping):
+ self.space = space
+ self.w_mapping = w_mapping
+
+ def get(self, ch, errorchar):
+ space = self.space
+
+ # get the character from the mapping
try:
- return space.getitem(w_mapping, space.newint(ord(ch)))
+ w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
except OperationError, e:
- if (not e.match(space, space.w_KeyError) and
- not e.match(space, space.w_IndexError)):
+ if not e.match(space, space.w_LookupError):
raise
- pass
+ return errorchar
-def _append_unicode(space, builder, w_x):
- try:
- x = space.unicode_w(w_x)
- except OperationError, e:
- if not e.match(space, space.w_TypeError):
- raise
- else:
- if x != u"\ufffe":
- builder.append(x)
- return True
- return False
- try:
- x = space.int_w(w_x)
- except OperationError:
- if not e.match(space, space.w_TypeError):
- raise
- else:
- if x < 65536:
- builder.append(unichr(x))
+ # Charmap may return a string
+ try:
+ x = space.realstr_w(w_ch)
+ except OperationError, e:
+ if not e.match(space, space.w_TypeError):
+ raise
else:
- raise OperationError(space.w_TypeError, space.wrap("character mapping must be in range(65536)"))
- return True
- if not space.is_true(w_x):
- return False
+ return x
+
+ # Charmap may return a number
+ try:
+ x = space.int_w(w_ch)
+ except OperationError:
+ if not e.match(space, space.w_TypeError):
+ raise
+ else:
+ if 0 <= x < 256:
+ return chr(x)
+ else:
+ raise OperationError(space.w_TypeError, space.wrap(
+ "character mapping must be in range(256)"))
+
+ # Charmap may return None
+ if space.is_w(w_ch, space.w_None):
+ return errorchar
+
+ raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
+
+
+ at unwrap_spec(ObjSpace, str, str, W_Root)
+def charmap_decode(space, string, errors="strict", w_mapping=None):
+ if len(string) == 0:
+ return space.newtuple([space.wrap(u''), space.wrap(0)])
+
+ if space.is_w(w_mapping, space.w_None):
+ mapping = None
else:
- raise OperationError(space.w_TypeError, space.w_None)
+ mapping = Charmap_Decode(space, w_mapping)
+
+ final = True
+ state = space.fromcache(CodecState)
+ result, consumed = runicode.str_decode_charmap(
+ string, len(string), errors,
+ final, state.decode_error_handler, mapping)
+ return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+ at unwrap_spec(ObjSpace, unicode, str, W_Root)
+def charmap_encode(space, uni, errors="strict", w_mapping=None):
+ if space.is_w(w_mapping, space.w_None):
+ mapping = None
+ else:
+ mapping = Charmap_Encode(space, w_mapping)
+
+ state = space.fromcache(CodecState)
+ result = runicode.unicode_encode_charmap(
+ uni, len(uni), errors,
+ state.encode_error_handler, mapping)
+ return space.newtuple([space.wrap(result), space.wrap(len(uni))])
-def charmap_decode(space, s, errors="strict", w_mapping=None):
- size = len(s)
- # Default to Latin-1
- if space.is_true(space.is_(w_mapping, space.w_None)):
- return latin_1_decode(space, s, errors, space.w_False)
+ at unwrap_spec(ObjSpace, unicode)
+def charmap_build(space, chars):
+ # XXX CPython sometimes uses a three-level trie
+ w_charmap = space.newdict()
+ for num in range(len(chars)):
+ elem = chars[num]
+ space.setitem(w_charmap, space.newint(ord(elem)), space.newint(num))
+ return w_charmap
- if (size == 0):
+# ____________________________________________________________
+# Unicode escape
+
+class UnicodeData_Handler:
+ def __init__(self, space, w_getcode):
+ self.space = space
+ self.w_getcode = w_getcode
+
+ def call(self, name):
+ space = self.space
+ try:
+ w_code = space.call_function(self.w_getcode, space.wrap(name))
+ except OperationError, e:
+ if not e.match(space, space.w_KeyError):
+ raise
+ return -1
+ return space.int_w(w_code)
+
+ at unwrap_spec(ObjSpace, 'bufferstr', str, W_Root)
+def unicode_escape_decode(space, string, errors="strict", w_final=False):
+ final = space.is_true(w_final)
+ state = space.fromcache(CodecState)
+ errorhandler=state.decode_error_handler
+
+ unicode_name_handler = state.get_unicodedata_handler(space)
+
+ result, consumed = runicode.str_decode_unicode_escape(
+ string, len(string), errors,
+ final, state.decode_error_handler,
+ unicode_name_handler)
+
+ return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+# ____________________________________________________________
+# Unicode-internal
+
+ at unwrap_spec(ObjSpace, W_Root, str)
+def unicode_internal_decode(space, w_string, errors="strict"):
+ # special case for this codec: unicodes are returned as is
+ if space.isinstance_w(w_string, space.w_unicode):
+ return space.newtuple([w_string, space.len(w_string)])
+
+ string = space.str_w(w_string)
+
+ if len(string) == 0:
return space.newtuple([space.wrap(u''), space.wrap(0)])
-
- # fast path for all the stuff in the encodings module
- if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
- mapping_w = space.fixedview(w_mapping)
- else:
- mapping_w = None
-
- builder = UnicodeBuilder(size)
- inpos = 0
- while (inpos < len(s)):
- #/* Get mapping_w (char ordinal -> integer, Unicode char or None) */
- ch = s[inpos]
- w_x = _extract_from_mapping(space, mapping_w, w_mapping, ch)
- if w_x is not None and _append_unicode(space, builder, w_x):
- inpos += 1
- continue
- state = space.fromcache(CodecState)
- next, inpos = state.decode_error_handler(errors, "charmap",
- "character maps to <undefined>", s, inpos, inpos+1)
- builder.append(next)
- res = builder.build()
- return space.newtuple([space.wrap(res), space.wrap(size)])
-charmap_decode.unwrap_spec = [ObjSpace, str, str, W_Root]
+
+ final = True
+ state = space.fromcache(CodecState)
+ result, consumed = runicode.str_decode_unicode_internal(
+ string, len(string), errors,
+ final, state.decode_error_handler)
+ return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+# ____________________________________________________________
+# support for the "string escape" codec
+# This is a bytes-to bytes transformation
+
+ at unwrap_spec(ObjSpace, W_Root, str)
+def escape_encode(space, w_string, errors='strict'):
+ w_repr = space.repr(w_string)
+ w_result = space.getslice(w_repr, space.wrap(1), space.wrap(-1))
+ return space.newtuple([w_result, space.len(w_string)])
+
+ at unwrap_spec(ObjSpace, str, str)
+def escape_decode(space, data, errors='strict'):
+ from pypy.interpreter.pyparser.parsestring import PyString_DecodeEscape
+ result = PyString_DecodeEscape(space, data, None)
+ return space.newtuple([space.wrap(result), space.wrap(len(data))])
Modified: pypy/trunk/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/test/test_codecs.py (original)
+++ pypy/trunk/pypy/module/_codecs/test/test_codecs.py Thu Jul 1 22:24:32 2010
@@ -1,7 +1,5 @@
import autopath
from pypy.conftest import gettestobjspace
-from pypy.module._codecs.app_codecs import unicode_escape_encode,\
- charmap_encode, unicode_escape_decode
class AppTestCodecs:
@@ -14,26 +12,16 @@
raises(TypeError, _codecs.register, 1)
def test_bigU_codecs(self):
- import sys
- oldmaxunicode = sys.maxunicode
- if sys.maxunicode <= 0xffff:
- return # this test cannot run on UCS2 builds
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
'raw_unicode_escape',
'unicode_escape', 'unicode_internal'):
assert unicode(u.encode(encoding),encoding) == u
- sys.maxunicode = oldmaxunicode
def test_ucs4(self):
- import sys
- oldmaxunicode = sys.maxunicode
- if sys.maxunicode <= 0xffff:
- sys.maxunicode = 0xffffffff
x = u'\U00100000'
y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
assert x == y
- sys.maxunicode = oldmaxunicode
def test_named_unicode(self):
assert unicode('\\N{SPACE}','unicode-escape') == u" "
@@ -118,12 +106,20 @@
def test_charmap_decode(self):
from _codecs import charmap_decode
+ import sys
assert charmap_decode('', 'strict', 'blablabla') == ('', 0)
assert charmap_decode('xxx') == ('xxx', 3)
assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
map = tuple([unichr(i) for i in range(256)])
assert charmap_decode('xxx\xff', 'strict', map) == (u'xxx\xff', 4)
+ raises(TypeError, charmap_decode, '\xff', "replace", {0xff: 0x10001})
+
+ def test_unicode_escape(self):
+ from _codecs import unicode_escape_encode, unicode_escape_decode
+ assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
+ assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
+ assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
class AppTestPartialEvaluation:
@@ -377,6 +373,9 @@
def test_charmap_decode_1(self):
import codecs
+ assert codecs.charmap_encode(u'xxx') == ('xxx', 3)
+ assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)
+
res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
assert res == (u"ab\ufffd", 3)
res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
@@ -464,6 +463,9 @@
assert '\xff'.decode('utf-7', 'ignore') == ''
assert '\x00'.decode('unicode-internal', 'ignore') == ''
+ def test_backslahreplace(self):
+ assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+
def test_badhandler(self):
import codecs
results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
@@ -527,9 +529,26 @@
def test_charmap_encode(self):
assert 'xxx'.encode('charmap') == 'xxx'
+ import codecs
+ raises(TypeError, codecs.charmap_encode, u'\xff', "replace", {0xff: 300})
+ raises(UnicodeError, codecs.charmap_encode, u"\xff", "replace", {0xff: None})
+
+ def test_charmap_encode_replace(self):
+ charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
+ charmap[ord("?")] = "XYZ"
+ import codecs
+ sin = u"abcDEF"
+ sout = codecs.charmap_encode(sin, "replace", charmap)[0]
+ assert sout == "AABBCCXYZXYZXYZ"
+
def test_charmap_decode_2(self):
assert 'foo'.decode('charmap') == 'foo'
+ def test_charmap_build(self):
+ import codecs
+ assert codecs.charmap_build(u'123456') == {49: 0, 50: 1, 51: 2,
+ 52: 3, 53: 4, 54: 5}
+
def test_utf7_start_end_in_exception(self):
try:
'+IC'.decode('utf-7')
@@ -537,6 +556,9 @@
assert exc.start == 0
assert exc.end == 3
+ def test_utf7_surrogate(self):
+ raises(UnicodeDecodeError, '+3ADYAA-'.decode, 'utf-7')
+
def test_utf_16_encode_decode(self):
import codecs
x = u'123abc'
@@ -546,6 +568,8 @@
def test_unicode_escape(self):
assert u'\\'.encode('unicode-escape') == '\\\\'
assert '\\\\'.decode('unicode-escape') == u'\\'
+ assert u'\ud801'.encode('unicode-escape') == '\\ud801'
+ assert u'\u0013'.encode('unicode-escape') == '\\x13'
def test_mbcs(self):
import sys
@@ -555,14 +579,3 @@
assert u'caf\xe9'.encode('mbcs') == 'caf\xe9'
assert u'\u040a'.encode('mbcs') == '?' # some cyrillic letter
assert 'cafx\e9'.decode('mbcs') == u'cafx\e9'
-
-
-class TestDirect:
- def test_charmap_encode(self):
- assert charmap_encode(u'xxx') == ('xxx', 3)
- assert charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 6)
-
- def test_unicode_escape(self):
- assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
- assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
- assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
Modified: pypy/trunk/pypy/objspace/std/marshal_impl.py
==============================================================================
--- pypy/trunk/pypy/objspace/std/marshal_impl.py (original)
+++ pypy/trunk/pypy/objspace/std/marshal_impl.py Thu Jul 1 22:24:32 2010
@@ -447,11 +447,11 @@
register(TYPE_CODE, unmarshal_pycode)
def marshal_w__Unicode(space, w_unicode, m):
- s = space.str_w(unicodehelper.PyUnicode_EncodeUTF8(space, w_unicode))
+ s = unicodehelper.PyUnicode_EncodeUTF8(space, space.unicode_w(w_unicode))
m.atom_str(TYPE_UNICODE, s)
def unmarshal_Unicode(space, u, tc):
- return unicodehelper.PyUnicode_DecodeUTF8(space, space.wrap(u.get_str()))
+ return space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, u.get_str()))
register(TYPE_UNICODE, unmarshal_Unicode)
app = gateway.applevel(r'''
Modified: pypy/trunk/pypy/objspace/std/unicodeobject.py
==============================================================================
--- pypy/trunk/pypy/objspace/std/unicodeobject.py (original)
+++ pypy/trunk/pypy/objspace/std/unicodeobject.py Thu Jul 1 22:24:32 2010
@@ -12,6 +12,7 @@
from pypy.rlib.rarithmetic import intmask, ovfcheck
from pypy.rlib.objectmodel import compute_hash
from pypy.rlib.rstring import string_repeat
+from pypy.rlib.runicode import unicode_encode_unicode_escape
from pypy.module.unicodedata import unicodedb_4_1_0 as unicodedb
from pypy.tool.sourcetools import func_with_new_name
@@ -892,101 +893,11 @@
space.wrap("character mapping must return integer, None or unicode"))
return W_UnicodeObject(u''.join(result))
-# Move this into the _codecs module as 'unicodeescape_string (Remember to cater for quotes)'
def repr__Unicode(space, w_unicode):
- hexdigits = "0123456789abcdef"
chars = w_unicode._value
size = len(chars)
-
- singlequote = doublequote = False
- for c in chars:
- if c == u'\'':
- singlequote = True
- elif c == u'"':
- doublequote = True
- if singlequote and not doublequote:
- quote = '"'
- else:
- quote = '\''
- result = ['u', quote]
- j = 0
- while j<len(chars):
- ch = chars[j]
- code = ord(ch)
- if code >= 0x10000:
- # Resize if needed
- result.extend(['\\', "U",
- hexdigits[(code >> 28) & 0xf],
- hexdigits[(code >> 24) & 0xf],
- hexdigits[(code >> 20) & 0xf],
- hexdigits[(code >> 16) & 0xf],
- hexdigits[(code >> 12) & 0xf],
- hexdigits[(code >> 8) & 0xf],
- hexdigits[(code >> 4) & 0xf],
- hexdigits[(code >> 0) & 0xf],
- ])
- j += 1
- continue
- if code >= 0xD800 and code < 0xDC00:
- if j < size - 1:
- ch2 = chars[j+1]
- code2 = ord(ch2)
- if code2 >= 0xDC00 and code2 <= 0xDFFF:
- code = (((code & 0x03FF) << 10) | (code2 & 0x03FF)) + 0x00010000
- result.extend(['\\', "U",
- hexdigits[(code >> 28) & 0xf],
- hexdigits[(code >> 24) & 0xf],
- hexdigits[(code >> 20) & 0xf],
- hexdigits[(code >> 16) & 0xf],
- hexdigits[(code >> 12) & 0xf],
- hexdigits[(code >> 8) & 0xf],
- hexdigits[(code >> 4) & 0xf],
- hexdigits[(code >> 0) & 0xf],
- ])
- j += 2
- continue
-
- if code >= 0x100:
- result.extend(['\\', "u",
- hexdigits[(code >> 12) & 0xf],
- hexdigits[(code >> 8) & 0xf],
- hexdigits[(code >> 4) & 0xf],
- hexdigits[(code >> 0) & 0xf],
- ])
- j += 1
- continue
- if code == ord('\\') or code == ord(quote):
- result.append('\\')
- result.append(chr(code))
- j += 1
- continue
- if code == ord('\t'):
- result.append('\\')
- result.append('t')
- j += 1
- continue
- if code == ord('\r'):
- result.append('\\')
- result.append('r')
- j += 1
- continue
- if code == ord('\n'):
- result.append('\\')
- result.append('n')
- j += 1
- continue
- if code < ord(' ') or code >= 0x7f:
- result.extend(['\\', "x",
- hexdigits[(code >> 4) & 0xf],
- hexdigits[(code >> 0) & 0xf],
- ])
- j += 1
- continue
- result.append(chr(code))
- j += 1
- result.append(quote)
- return space.wrap(''.join(result))
-
+ s = unicode_encode_unicode_escape(chars, size, "strict", quotes=True)
+ return space.wrap(s)
def mod__Unicode_ANY(space, w_format, w_values):
return mod_format(space, w_format, w_values, do_unicode=True)
Modified: pypy/trunk/pypy/rlib/rstring.py
==============================================================================
--- pypy/trunk/pypy/rlib/rstring.py (original)
+++ pypy/trunk/pypy/rlib/rstring.py Thu Jul 1 22:24:32 2010
@@ -56,6 +56,7 @@
self.l.append(s)
def append_slice(self, s, start, end):
+ assert 0 <= start <= end <= len(s)
self.l.append(s[start:end])
def append_multiple_char(self, c, times):
Modified: pypy/trunk/pypy/rlib/runicode.py
==============================================================================
--- pypy/trunk/pypy/rlib/runicode.py (original)
+++ pypy/trunk/pypy/rlib/runicode.py Thu Jul 1 22:24:32 2010
@@ -1,7 +1,9 @@
import sys
from pypy.rlib.bitmanipulation import splitter
from pypy.rpython.lltypesystem import lltype, rffi
-from pypy.rlib.objectmodel import we_are_translated
+from pypy.rlib.objectmodel import we_are_translated, specialize
+from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
+from pypy.rlib.rarithmetic import r_uint
if rffi.sizeof(lltype.UniChar) == 4:
MAXUNICODE = 0x10ffff
@@ -42,8 +44,6 @@
UNICHR = unichr
ORD = ord
-# XXX review the functions below and think about using stringbuilders for them
-
def raise_unicode_exception_decode(errors, encoding, msg, s,
startingpos, endingpos):
@@ -55,8 +55,8 @@
assert isinstance(u, unicode)
raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
-# ____________________________________________________________
-# unicode decoding
+# ____________________________________________________________
+# utf-8
utf8_code_length = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -81,9 +81,10 @@
errorhandler=None):
if errorhandler is None:
errorhandler = raise_unicode_exception_decode
- if (size == 0):
+ if size == 0:
return u'', 0
- result = []
+
+ result = UnicodeBuilder(size)
pos = 0
while pos < size:
ch = s[pos]
@@ -94,14 +95,14 @@
continue
n = utf8_code_length[ordch1]
- if (pos + n > size):
+ if pos + n > size:
if not final:
break
else:
r, pos = errorhandler(errors, "utf-8",
"unexpected end of data", s, pos, size)
result.append(r)
- if (pos + n > size):
+ if pos + n > size:
break
if n == 0:
r, pos = errorhandler(errors, "utf-8", "unexpected code byte",
@@ -116,7 +117,7 @@
z, two = splitter[6, 2](ordch2)
y, six = splitter[5, 3](ordch1)
assert six == 6
- if (two != 2):
+ if two != 2:
r, pos = errorhandler(errors, "utf-8", "invalid data",
s, pos, pos + 2)
result.append(r)
@@ -137,7 +138,7 @@
y, two2 = splitter[6, 2](ordch2)
x, fourteen = splitter[4, 4](ordch1)
assert fourteen == 14
- if (two1 != 2 or two2 != 2):
+ if two1 != 2 or two2 != 2:
r, pos = errorhandler(errors, "utf-8", "invalid data",
s, pos, pos + 3)
result.append(r)
@@ -166,7 +167,7 @@
x, two3 = splitter[6, 2](ordch2)
w, thirty = splitter[3, 5](ordch1)
assert thirty == 30
- if (two1 != 2 or two2 != 2 or two3 != 2):
+ if two1 != 2 or two2 != 2 or two3 != 2:
r, pos = errorhandler(errors, "utf-8", "invalid data",
s, pos, pos + 4)
result.append(r)
@@ -174,7 +175,7 @@
c = (w << 18) + (x << 12) + (y << 6) + z
# minimum value allowed for 4 byte encoding
# maximum value allowed for UTF-16
- if ((c < 0x10000) or (c > 0x10ffff)):
+ if c < 0x10000 or c > 0x10ffff:
r, pos = errorhandler(errors, "utf-8", "illegal encoding",
s, pos, pos + 4)
result.append(r)
@@ -197,8 +198,53 @@
s, pos, pos + n)
result.append(r)
- return u"".join(result), pos
+ return result.build(), pos
+def _encodeUCS4(result, ch):
+ # Encode UCS4 Unicode ordinals
+ result.append((chr((0xf0 | (ch >> 18)))))
+ result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
+ result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+ result.append((chr((0x80 | (ch & 0x3f)))))
+
+def unicode_encode_utf_8(s, size, errors, errorhandler=None):
+ assert(size >= 0)
+ result = StringBuilder(size)
+ i = 0
+ while i < size:
+ ch = ord(s[i])
+ i += 1
+ if ch < 0x80:
+ # Encode ASCII
+ result.append(chr(ch))
+ elif ch < 0x0800:
+ # Encode Latin-1
+ result.append(chr((0xc0 | (ch >> 6))))
+ result.append(chr((0x80 | (ch & 0x3f))))
+ else:
+ # Encode UCS2 Unicode ordinals
+ if ch < 0x10000:
+ # Special case: check for high surrogate
+ if 0xD800 <= ch <= 0xDBFF and i != size:
+ ch2 = ord(s[i])
+ # Check for low surrogate and combine the two to
+ # form a UCS4 value
+ if 0xDC00 <= ch2 <= 0xDFFF:
+ ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+ i += 1
+ _encodeUCS4(result, ch3)
+ continue
+ # Fall through: handles isolated high surrogates
+ result.append((chr((0xe0 | (ch >> 12)))))
+ result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+ result.append((chr((0x80 | (ch & 0x3f)))))
+ continue
+ else:
+ _encodeUCS4(result, ch)
+ return result.build()
+
+# ____________________________________________________________
+# utf-16
def str_decode_utf_16(s, size, errors, final=True,
errorhandler=None):
@@ -238,12 +284,11 @@
# mark is skipped, in all other modes, it is copied to the output
# stream as-is (giving a ZWNBSP character).
pos = 0
- result = []
if byteorder == 'native':
- if (size >= 2):
+ if size >= 2:
bom = (ord(s[ihi]) << 8) | ord(s[ilo])
if BYTEORDER == 'little':
- if (bom == 0xFEFF):
+ if bom == 0xFEFF:
pos += 2
bo = -1
elif bom == 0xFFFE:
@@ -260,20 +305,22 @@
bo = -1
else:
bo = 1
- if (size == 0):
+ if size == 0:
return u'', 0, bo
- if (bo == -1):
+ if bo == -1:
# force little endian
ihi = 1
ilo = 0
- elif (bo == 1):
+ elif bo == 1:
# force big endian
ihi = 0
ilo = 1
+ result = UnicodeBuilder(size // 2)
+
#XXX I think the errors are not correctly handled here
- while (pos < len(s)):
+ while pos < size:
# remaining bytes at the end? (size should be even)
if len(s) - pos < 2:
if not final:
@@ -285,7 +332,7 @@
break
ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
pos += 2
- if (ch < 0xD800 or ch > 0xDFFF):
+ if ch < 0xD800 or ch > 0xDFFF:
result.append(unichr(ch))
continue
# UTF-16 code pair:
@@ -297,10 +344,10 @@
result.append(r)
if len(s) - pos < 2:
break
- elif (0xD800 <= ch and ch <= 0xDBFF):
+ elif 0xD800 <= ch <= 0xDBFF:
ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
pos += 2
- if (0xDC00 <= ch2 and ch2 <= 0xDFFF):
+ if 0xDC00 <= ch2 <= 0xDFFF:
if MAXUNICODE < 65536:
result.append(unichr(ch))
result.append(unichr(ch2))
@@ -318,17 +365,305 @@
"illegal encoding",
s, pos - 2, pos)
result.append(r)
- return u"".join(result), pos, bo
+ return result.build(), pos, bo
+
+def _STORECHAR(result, CH, byteorder):
+ hi = chr(((CH) >> 8) & 0xff)
+ lo = chr((CH) & 0xff)
+ if byteorder == 'little':
+ result.append(lo)
+ result.append(hi)
+ else:
+ result.append(hi)
+ result.append(lo)
+
+def unicode_encode_utf_16_helper(s, size, errors,
+ errorhandler=None,
+ byteorder='little'):
+ if size == 0:
+ return ""
+
+ result = StringBuilder(size * 2 + 2)
+ if byteorder == 'native':
+ _STORECHAR(result, 0xFEFF, BYTEORDER)
+ byteorder = BYTEORDER
+
+ i = 0
+ while i < size:
+ ch = ord(s[i])
+ i += 1
+ ch2 = 0
+ if ch >= 0x10000:
+ ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
+ ch = 0xD800 | ((ch-0x10000) >> 10)
+
+ _STORECHAR(result, ch, byteorder)
+ if ch2:
+ _STORECHAR(result, ch2, byteorder)
+
+ return result.build()
+
+def unicode_encode_utf_16(s, size, errors,
+ errorhandler=None):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native")
+
+
+def unicode_encode_utf_16_be(s, size, errors,
+ errorhandler=None):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big")
+
+
+def unicode_encode_utf_16_le(s, size, errors,
+ errorhandler=None):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little")
+
+
+# ____________________________________________________________
+# utf-7
+
+## indicate whether a UTF-7 character is special i.e. cannot be directly
+## encoded:
+## 0 - not special
+## 1 - special
+## 2 - whitespace (optional)
+## 3 - RFC2152 Set O (optional)
+
+_utf7_special = [
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
+ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
+]
+
+def _utf7_SPECIAL(oc, encodeO=False, encodeWS=False):
+ return (oc > 127 or _utf7_special[oc] == 1 or
+ (encodeWS and _utf7_special[oc] == 2) or
+ (encodeO and _utf7_special[oc] == 3))
+
+def _utf7_B64CHAR(oc):
+ if oc > 127:
+ return False
+ c = chr(oc)
+ return c.isalnum() or c == '+' or c == '/'
+def _utf7_TO_BASE64(n):
+ "Returns the base-64 character of the bottom 6 bits of n"
+ return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
+def _utf7_FROM_BASE64(c):
+ "Returns the base-64 value of a base-64 character"
+ if c == '+':
+ return 62
+ elif c == '/':
+ return 63
+ elif c >= 'a':
+ return ord(c) - 71
+ elif c >= 'A':
+ return ord(c) - 65
+ else:
+ return ord(c) + 4
+
+def _utf7_ENCODE(result, ch, bits):
+ while bits >= 6:
+ result.append(_utf7_TO_BASE64(ch >> (bits - 6)))
+ bits -= 6
+ return bits
+
+def _utf7_DECODE(s, result, errorhandler, errors,
+ pos, charsleft, bitsleft, surrogate):
+ while bitsleft >= 16:
+ outCh = (charsleft >> (bitsleft-16)) & 0xffff
+ bitsleft -= 16
+
+ if surrogate:
+ ## We have already generated an error for the high
+ ## surrogate so let's not bother seeing if the low
+ ## surrogate is correct or not
+ surrogate = False
+ elif 0xDC00 <= outCh <= 0xDFFF:
+ ## This is a surrogate pair. Unfortunately we can't
+ ## represent it in a 16-bit character
+ surrogate = True
+ msg = "code pairs are not supported"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ bitsleft = 0
+ break
+ else:
+ result.append(unichr(outCh))
+ return pos, charsleft, bitsleft, surrogate
+
+
+def str_decode_utf_7(s, size, errors, final=False,
+ errorhandler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ if size == 0:
+ return u'', 0
+
+ inShift = False
+ bitsleft = 0
+ startinpos = 0
+ charsleft = 0
+ surrogate = False
+
+ result = UnicodeBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+ oc = ord(ch)
+
+ if inShift:
+ if ch == '-' or not _utf7_B64CHAR(oc):
+ inShift = 0
+ pos += 1
+
+ pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
+ s, result, errorhandler, errors,
+ pos, charsleft, bitsleft, surrogate)
+ if bitsleft >= 6:
+ ## The shift sequence has a partial character in it. If
+ ## bitsleft < 6 then we could just classify it as padding
+ ## but that is not the case here
+ msg = "partial character in shift sequence"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ ## According to RFC2152 the remaining bits should be
+ ## zero. We choose to signal an error/insert a replacement
+ ## character here so indicate the potential of a
+ ## misencoded character.
+ if ch == '-':
+ if pos < size and s[pos] == '-':
+ result.append(u'-')
+ inShift = True
+
+ elif _utf7_SPECIAL(oc):
+ msg = "unexpected special character"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ else:
+ result.append(unichr(ord(ch)))
+ else:
+ charsleft = (charsleft << 6) | _utf7_FROM_BASE64(ch)
+ bitsleft += 6
+ pos += 1
+
+ pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
+ s, result, errorhandler, errors,
+ pos, charsleft, bitsleft, surrogate)
+ elif ch == '+':
+ startinpos = pos
+ pos += 1
+ if pos < size and s[pos] == '-':
+ pos += 1
+ result.append(u'+')
+ else:
+ inShift = 1
+ bitsleft = 0
+
+ elif _utf7_SPECIAL(oc):
+ pos += 1
+ msg = "unexpected special character"
+ res, pos = errorhandler(errors, 'utf-7', msg, s, pos-1, pos)
+ result.append(res)
+ else:
+ result.append(unichr(oc))
+ pos += 1
+
+ if inShift:
+ endinpos = size
+ msg = "unterminated shift sequence"
+ res, pos = errorhandler(errors, 'utf-7', msg, s, startinpos, pos)
+ result.append(res)
+
+ return result.build(), pos
+
+def unicode_encode_utf_7(s, size, errors, errorhandler=None):
+ if size == 0:
+ return ''
+ result = StringBuilder(size)
+
+ encodeSetO = encodeWhiteSpace = False
+
+ inShift = False
+ bitsleft = 0
+ charsleft = 0
+
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+ oc = ord(ch)
+ if not inShift:
+ if ch == u'+':
+ result.append('+-')
+ elif _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
+ charsleft = oc
+ bitsleft = 16
+ result.append('+')
+ bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
+ inShift = bitsleft > 0
+ else:
+ result.append(chr(oc))
+ else:
+ if not _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
+ result.append(_utf7_TO_BASE64(charsleft << (6-bitsleft)))
+ charsleft = 0
+ bitsleft = 0
+ ## Characters not in the BASE64 set implicitly unshift the
+ ## sequence so no '-' is required, except if the character is
+ ## itself a '-'
+ if _utf7_B64CHAR(oc) or ch == u'-':
+ result.append('-')
+ inShift = False
+ result.append(chr(oc))
+ else:
+ bitsleft += 16
+ charsleft = (charsleft << 16) | oc
+ bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
+ ## If the next character is special then we don't need to
+ ## terminate the shift sequence. If the next character is not
+ ## a BASE64 character or '-' then the shift sequence will be
+ ## terminated implicitly and we don't have to insert a '-'.
+ if bitsleft == 0:
+ if pos + 1 < size:
+ ch2 = s[pos + 1]
+ oc2 = ord(ch2)
+
+ if _utf7_SPECIAL(oc2, encodeSetO, encodeWhiteSpace):
+ pass
+ elif _utf7_B64CHAR(oc2) or ch2 == u'-':
+ result.append('-')
+ inShift = False
+ else:
+ inShift = False
+ else:
+ result.append('-')
+ inShift = False
+ pos += 1
+
+ if bitsleft:
+ result.append(_utf7_TO_BASE64(charsleft << (6 - bitsleft)))
+ result.append('-')
+
+ return result.build()
+
+# ____________________________________________________________
+# ascii and latin-1
def str_decode_latin_1(s, size, errors, final=False,
errorhandler=None):
# latin1 is equivalent to the first 256 ordinals in Unicode.
pos = 0
- result = []
- while (pos < size):
+ result = UnicodeBuilder(size)
+ while pos < size:
result.append(unichr(ord(s[pos])))
pos += 1
- return u"".join(result), pos
+ return result.build(), pos
def str_decode_ascii(s, size, errors, final=False,
@@ -336,9 +671,9 @@
if errorhandler is None:
errorhandler = raise_unicode_exception_decode
# ASCII is equivalent to the first 128 ordinals in Unicode.
- result = []
+ result = UnicodeBuilder(size)
pos = 0
- while pos < len(s):
+ while pos < size:
c = s[pos]
if ord(c) < 128:
result.append(unichr(ord(c)))
@@ -347,55 +682,7 @@
r, pos = errorhandler(errors, "ascii", "ordinal not in range(128)",
s, pos, pos + 1)
result.append(r)
- return u"".join(result), pos
-
-
-# ____________________________________________________________
-# unicode encoding
-
-
-def unicode_encode_utf_8(s, size, errors, errorhandler=None):
- assert(size >= 0)
- result = []
- i = 0
- while i < size:
- ch = ord(s[i])
- i += 1
- if (ch < 0x80):
- # Encode ASCII
- result.append(chr(ch))
- elif (ch < 0x0800) :
- # Encode Latin-1
- result.append(chr((0xc0 | (ch >> 6))))
- result.append(chr((0x80 | (ch & 0x3f))))
- else:
- # Encode UCS2 Unicode ordinals
- if (ch < 0x10000):
- # Special case: check for high surrogate
- if (0xD800 <= ch and ch <= 0xDBFF and i != size) :
- ch2 = ord(s[i])
- # Check for low surrogate and combine the two to
- # form a UCS4 value
- if (0xDC00 <= ch2 and ch2 <= 0xDFFF) :
- ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
- i += 1
- _encodeUCS4(result, ch3)
- continue
- # Fall through: handles isolated high surrogates
- result.append((chr((0xe0 | (ch >> 12)))))
- result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
- result.append((chr((0x80 | (ch & 0x3f)))))
- continue
- else:
- _encodeUCS4(result, ch)
- return "".join(result)
-
-def _encodeUCS4(result, ch):
- # Encode UCS4 Unicode ordinals
- result.append((chr((0xf0 | (ch >> 18)))))
- result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
- result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
- result.append((chr((0x80 | (ch & 0x3f)))))
+ return result.build(), pos
def unicode_encode_ucs1_helper(p, size, errors,
@@ -408,12 +695,12 @@
else:
reason = "ordinal not in range(128)"
encoding = "ascii"
-
- if (size == 0):
+
+ if size == 0:
return ''
- result = []
+ result = StringBuilder(size)
pos = 0
- while pos < len(p):
+ while pos < size:
ch = p[pos]
if ord(ch) < limit:
@@ -427,9 +714,9 @@
collend += 1
r, pos = errorhandler(errors, encoding, reason, p,
collstart, collend)
- result += r # extend 'result' as a list of characters
+ result.append(r)
- return "".join(result)
+ return result.build()
def unicode_encode_latin_1(p, size, errors, errorhandler=None):
res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 256)
@@ -439,57 +726,479 @@
res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 128)
return res
+# ____________________________________________________________
+# Charmap
-def _STORECHAR(result, CH, byteorder):
- hi = chr(((CH) >> 8) & 0xff)
- lo = chr((CH) & 0xff)
- if byteorder == 'little':
- result.append(lo)
- result.append(hi)
+ERROR_CHAR = u'\ufffe'
+
+ at specialize.argtype(5)
+def str_decode_charmap(s, size, errors, final=False,
+ errorhandler=None, mapping=None):
+ "mapping can be a rpython dictionary, or a dict-like object."
+
+ # Default to Latin-1
+ if mapping is None:
+ return str_decode_latin_1(s, size, errors, final=final,
+ errorhandler=errorhandler)
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ if size == 0:
+ return u'', 0
+
+ pos = 0
+ result = UnicodeBuilder(size)
+ while pos < size:
+ ch = s[pos]
+
+ c = mapping.get(ch, ERROR_CHAR)
+ if c == ERROR_CHAR:
+ r, pos = errorhandler(errors, "charmap",
+ "character maps to <undefined>",
+ s, pos, pos + 1)
+ result.append(r)
+ continue
+ result.append(c)
+ pos += 1
+ return result.build(), pos
+
+def unicode_encode_charmap(s, size, errors, errorhandler=None,
+ mapping=None):
+ if mapping is None:
+ return unicode_encode_latin_1(s, size, errors,
+ errorhandler=errorhandler)
+
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_encode
+
+ if size == 0:
+ return ''
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ c = mapping.get(ch, '')
+ if len(c) == 0:
+ res, pos = errorhandler(errors, "charmap",
+ "character maps to <undefined>",
+ s, pos, pos + 1)
+ for ch2 in res:
+ c2 = mapping.get(unichr(ord(ch2)), '')
+ if len(c2) == 0:
+ errorhandler(
+ "strict", "charmap",
+ "character maps to <undefined>",
+ s, pos, pos + 1)
+ result.append(c2)
+ continue
+ result.append(c)
+ pos += 1
+ return result.build()
+
+# ____________________________________________________________
+# Unicode escape
+
+hexdigits = "0123456789ABCDEFabcdef"
+
+def hexescape(builder, s, pos, digits,
+ encoding, errorhandler, message, errors):
+ import sys
+ chr = 0
+ if pos + digits > len(s):
+ message = "end of string in escape sequence"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-2, len(s))
+ builder.append(res)
else:
- result.append(hi)
- result.append(lo)
+ try:
+ chr = r_uint(int(s[pos:pos+digits], 16))
+ except ValueError:
+ endinpos = pos
+ while s[endinpos] in hexdigits:
+ endinpos += 1
+ res, pos = errorhandler(errors, encoding,
+ message, s, pos-2, endinpos+1)
+ builder.append(res)
+ else:
+ # when we get here, chr is a 32-bit unicode character
+ if chr <= MAXUNICODE:
+ builder.append(unichr(chr))
+ pos += digits
+
+ elif chr <= 0x10ffff:
+ chr -= 0x10000L
+ builder.append(unichr(0xD800 + (chr >> 10)))
+ builder.append(unichr(0xDC00 + (chr & 0x03FF)))
+ pos += digits
+ else:
+ message = "illegal Unicode character"
+ res, pos = errorhandler(errors, encoding,
+ message, s, pos-2, pos+digits)
+ builder.append(res)
+ return pos
+
+def str_decode_unicode_escape(s, size, errors, final=False,
+ errorhandler=False,
+ unicodedata_handler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
-def unicode_encode_utf_16_helper(s, size, errors,
- errorhandler=None,
- byteorder='little'):
- result = []
- if (byteorder == 'native'):
- _STORECHAR(result, 0xFEFF, BYTEORDER)
- byteorder = BYTEORDER
-
if size == 0:
- return ""
+ return u'', 0
- i = 0
- while i < size:
- ch = ord(s[i])
- i += 1
- ch2 = 0
- if (ch >= 0x10000) :
- ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
- ch = 0xD800 | ((ch-0x10000) >> 10)
+ builder = UnicodeBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
- _STORECHAR(result, ch, byteorder)
- if ch2:
- _STORECHAR(result, ch2, byteorder)
+ # Non-escape characters are interpreted as Unicode ordinals
+ if ch != '\\':
+ builder.append(unichr(ord(ch)))
+ pos += 1
+ continue
- return "".join(result)
+ # - Escapes
+ pos += 1
+ if pos >= size:
+ message = "\\ at end of string"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, size)
+ builder.append(res)
+ continue
-def unicode_encode_utf_16(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native")
+ ch = s[pos]
+ pos += 1
+ # \x escapes
+ if ch == '\n': pass
+ elif ch == '\\': builder.append(u'\\')
+ elif ch == '\'': builder.append(u'\'')
+ elif ch == '\"': builder.append(u'\"')
+ elif ch == 'b' : builder.append(u'\b')
+ elif ch == 'f' : builder.append(u'\f')
+ elif ch == 't' : builder.append(u'\t')
+ elif ch == 'n' : builder.append(u'\n')
+ elif ch == 'r' : builder.append(u'\r')
+ elif ch == 'v' : builder.append(u'\v')
+ elif ch == 'a' : builder.append(u'\a')
+ elif '0' <= ch <= '7':
+ x = ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ builder.append(unichr(x))
+ # hex escapes
+ # \xXX
+ elif ch == 'x':
+ digits = 2
+ message = "truncated \\xXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \uXXXX
+ elif ch == 'u':
+ digits = 4
+ message = "truncated \\uXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \UXXXXXXXX
+ elif ch == 'U':
+ digits = 8
+ message = "truncated \\UXXXXXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \N{name}
+ elif ch == 'N':
+ message = "malformed \\N character escape"
+ look = pos
+ if unicodedata_handler is None:
+ message = ("\\N escapes not supported "
+ "(can't load unicodedata module)")
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, size)
+ builder.append(res)
+ continue
+
+ if look < size and s[look] == '{':
+ # look for the closing brace
+ while look < size and s[look] != '}':
+ look += 1
+ if look < size and s[look] == '}':
+ # found a name. look it up in the unicode database
+ message = "unknown Unicode character name"
+ name = s[pos+1:look]
+ code = unicodedata_handler.call(name)
+ if code < 0:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ continue
+ pos = look + 1
+ if code <= MAXUNICODE:
+ builder.append(unichr(code))
+ else:
+ code -= 0x10000L
+ builder.append(unichr(0xD800 + (code >> 10)))
+ builder.append(unichr(0xDC00 + (code & 0x03FF)))
+ else:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ builder.append(u'\\')
+ builder.append(unichr(ord(ch)))
+ return builder.build(), pos
-def unicode_encode_utf_16_be(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big")
+def unicode_encode_unicode_escape(s, size, errors, errorhandler=None, quotes=False):
+ # errorhandler is not used: this function cannot cause Unicode errors
+ result = StringBuilder(size)
+
+ if quotes:
+ if s.find(u'\'') != -1 and s.find(u'\"') == -1:
+ quote = ord('\"')
+ result.append('u"')
+ else:
+ quote = ord('\'')
+ result.append('u\'')
+ else:
+ quote = 0
+ if size == 0:
+ return ''
-def unicode_encode_utf_16_le(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little")
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+ oc = ord(ch)
+
+ # Escape quotes
+ if quotes and (oc == quote or ch == '\\'):
+ result.append('\\')
+ result.append(chr(oc))
+ pos += 1
+ continue
+
+ if 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+ # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
+ pos += 1
+ oc2 = ord(s[pos])
+
+ if 0xDC00 <= oc2 <= 0xDFFF:
+ ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
+ raw_unicode_escape_helper(result, ucs)
+ pos += 1
+ continue
+ # Fall through: isolated surrogates are copied as-is
+ pos -= 1
+
+ # Map special whitespace to '\t', \n', '\r'
+ if ch == '\t':
+ result.append('\\t')
+ elif ch == '\n':
+ result.append('\\n')
+ elif ch == '\r':
+ result.append('\\r')
+ elif ch == '\\':
+ result.append('\\\\')
+
+ # Map non-printable or non-ascii to '\xhh' or '\uhhhh'
+ elif oc < 32 or oc >= 0x7F:
+ raw_unicode_escape_helper(result, oc)
+
+ # Copy everything else as-is
+ else:
+ result.append(chr(oc))
+ pos += 1
+
+ if quotes:
+ result.append(chr(quote))
+ return result.build()
+
+# ____________________________________________________________
+# Raw unicode escape
+
+def str_decode_raw_unicode_escape(s, size, errors, final=False,
+ errorhandler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ if size == 0:
+ return u'', 0
+
+ result = UnicodeBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ # Non-escape characters are interpreted as Unicode ordinals
+ if ch != '\\':
+ result.append(unichr(ord(ch)))
+ pos += 1
+ continue
+
+ startinpos = pos
+ # \u-escapes are only interpreted iff the number of leading
+ # backslashes is odd
+ bs = pos
+ while pos < size:
+ pos += 1
+ if pos == size or s[pos] != '\\':
+ break
+ result.append(u'\\')
+
+ # we have a backslash at the end of the string, stop here
+ if pos >= size:
+ result.append(u'\\')
+ break
+
+ if ((pos - bs) & 1 == 0 or
+ pos >= size or
+ (s[pos] != 'u' and s[pos] != 'U')):
+ result.append(u'\\')
+ result.append(unichr(ord(s[pos])))
+ pos += 1
+ continue
+
+ if s[pos] == 'u':
+ digits = 4
+ message = "truncated \\uXXXX escape"
+ else:
+ digits = 8
+ message = "truncated \\UXXXXXXXX escape"
+ pos += 1
+ pos = hexescape(result, s, pos, digits,
+ "rawunicodeescape", errorhandler, message, errors)
+
+ return result.build(), pos
+
+def raw_unicode_escape_helper(result, char):
+ num = hex(char)
+ if char >= 0x10000:
+ result.append("\\U")
+ zeros = 8
+ elif char >= 0x100:
+ result.append("\\u")
+ zeros = 4
+ else:
+ result.append("\\x")
+ zeros = 2
+ lnum = len(num)
+ nb = zeros + 2 - lnum # num starts with '0x'
+ if nb > 0:
+ result.append_multiple_char('0', nb)
+ result.append_slice(num, 2, lnum)
+
+def unicode_encode_raw_unicode_escape(s, size, errors, errorhandler=None):
+ # errorhandler is not used: this function cannot cause Unicode errors
+ if size == 0:
+ return ''
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ oc = ord(s[pos])
+ if oc < 0x100:
+ result.append(chr(oc))
+ else:
+ raw_unicode_escape_helper(result, oc)
+ pos += 1
+
+ return result.build()
+
+# ____________________________________________________________
+# unicode-internal
+
+def str_decode_unicode_internal(s, size, errors, final=False,
+ errorhandler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ if size == 0:
+ return u'', 0
+
+ if MAXUNICODE < 65536:
+ unicode_bytes = 2
+ else:
+ unicode_bytes = 4
+ if BYTEORDER == "little":
+ start = 0
+ stop = unicode_bytes
+ step = 1
+ else:
+ start = unicode_bytes - 1
+ stop = -1
+ step = -1
+
+ result = UnicodeBuilder(size // unicode_bytes)
+ pos = 0
+ while pos < size:
+ if pos > size - unicode_bytes:
+ res, pos = errorhandler(errors, "unicode_internal",
+ "truncated input",
+ s, pos, size)
+ result.append(res)
+ if pos > size - unicode_bytes:
+ break
+ continue
+ t = r_uint(0)
+ h = 0
+ for j in range(start, stop, step):
+ t += r_uint(ord(s[pos + j])) << (h*8)
+ h += 1
+ if t > MAXUNICODE:
+ res, pos = errorhandler(errors, "unicode_internal",
+ "unichr(%d) not in range" % (t,),
+ s, pos, pos + unicode_bytes)
+ result.append(res)
+ continue
+ result.append(unichr(t))
+ pos += unicode_bytes
+ return result.build(), pos
+
+def unicode_encode_unicode_internal(s, size, errors, errorhandler=None):
+ if size == 0:
+ return ''
+
+ if MAXUNICODE < 65536:
+ unicode_bytes = 2
+ else:
+ unicode_bytes = 4
+
+ result = StringBuilder(size * unicode_bytes)
+ pos = 0
+ while pos < size:
+ oc = ord(s[pos])
+ if MAXUNICODE < 65536:
+ if BYTEORDER == "little":
+ result.append(chr(oc & 0xFF))
+ result.append(chr(oc >> 8 & 0xFF))
+ else:
+ result.append(chr(oc >> 8 & 0xFF))
+ result.append(chr(oc & 0xFF))
+ else:
+ if BYTEORDER == "little":
+ result.append(chr(oc & 0xFF))
+ result.append(chr(oc >> 8 & 0xFF))
+ result.append(chr(oc >> 16 & 0xFF))
+ result.append(chr(oc >> 24 & 0xFF))
+ else:
+ result.append(chr(oc >> 24 & 0xFF))
+ result.append(chr(oc >> 16 & 0xFF))
+ result.append(chr(oc >> 8 & 0xFF))
+ result.append(chr(oc & 0xFF))
+ pos += 1
+ return result.build()
# ____________________________________________________________
# MBCS codecs for Windows
More information about the Pypy-commit
mailing list