[pypy-svn] r75634 - in pypy/branch/interplevel-codecs/pypy: module/_codecs module/_codecs/test objspace/std rlib
afa at codespeak.net
afa at codespeak.net
Mon Jun 28 17:05:27 CEST 2010
Author: afa
Date: Mon Jun 28 17:05:25 2010
New Revision: 75634
Modified:
pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
pypy/branch/interplevel-codecs/pypy/objspace/std/unicodeobject.py
pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
Rewrite the unicode-escape codec at interplevel.
Use it for unicode.__repr__
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py Mon Jun 28 17:05:25 2010
@@ -9,8 +9,6 @@
'charmap_encode' : 'app_codecs.charmap_encode',
'escape_decode' : 'app_codecs.escape_decode',
'escape_encode' : 'app_codecs.escape_encode',
- 'unicode_escape_decode' : 'app_codecs.unicode_escape_decode',
- 'unicode_escape_encode' : 'app_codecs.unicode_escape_encode',
'unicode_internal_decode' : 'app_codecs.unicode_internal_decode',
'unicode_internal_encode' : 'app_codecs.unicode_internal_encode',
'utf_7_decode' : 'app_codecs.utf_7_decode',
@@ -42,6 +40,8 @@
'charbuffer_encode': 'interp_codecs.buffer_encode',
'readbuffer_encode': 'interp_codecs.buffer_encode',
'charmap_decode' : 'interp_codecs.charmap_decode',
+ 'unicode_escape_decode' : 'interp_codecs.unicode_escape_decode',
+ 'unicode_escape_encode' : 'interp_codecs.unicode_escape_encode',
'raw_unicode_escape_decode' : 'interp_codecs.raw_unicode_escape_decode',
'raw_unicode_escape_encode' : 'interp_codecs.raw_unicode_escape_encode',
}
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py Mon Jun 28 17:05:25 2010
@@ -53,21 +53,6 @@
res = u''.join(res)
return res, len(data)
-def unicode_escape_encode( obj, errors='strict'):
- """None
- """
- res = unicodeescape_string(obj, len(obj), 0)
- res = ''.join(res)
- return res, len(obj)
-
-def unicode_escape_decode( data, errors='strict'):
- """None
- """
- res = PyUnicode_DecodeUnicodeEscape(data, len(data), errors)
- res = u''.join(res)
- return res, len(data)
-
-
def charmap_encode(obj, errors='strict', mapping=None):
"""None
"""
@@ -422,84 +407,6 @@
unicode_empty = u''
-def unicodeescape_string(s, size, quotes):
-
- p = []
- if (quotes) :
- p += 'u'
- if (s.find('\'') != -1 and s.find('"') == -1):
- p += '"'
- else:
- p += '\''
- pos = 0
- while (pos < size):
- ch = s[pos]
- #/* Escape quotes */
- if (quotes and (ch == p[1] or ch == '\\')):
- p += '\\'
- p += chr(ord(ch))
- pos += 1
- continue
-
-#ifdef Py_UNICODE_WIDE
- #/* Map 21-bit characters to '\U00xxxxxx' */
- elif (ord(ch) >= 0x10000):
- p += '\\'
- p += 'U'
- p += '%08x' % ord(ch)
- pos += 1
- continue
-#endif
- #/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
- elif (ord(ch) >= 0xD800 and ord(ch) < 0xDC00):
- pos += 1
- ch2 = s[pos]
-
- if (ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF):
- ucs = (((ord(ch) & 0x03FF) << 10) | (ord(ch2) & 0x03FF)) + 0x00010000
- p += '\\'
- p += 'U'
- p += '%08x' % ucs
- pos += 1
- continue
-
- #/* Fall through: isolated surrogates are copied as-is */
- pos -= 1
-
- #/* Map 16-bit characters to '\uxxxx' */
- elif (ord(ch) >= 256):
- p += '\\'
- p += 'u'
- p += '%04x' % ord(ch)
-
- #/* Map special whitespace to '\t', \n', '\r' */
- elif (ch == '\t'):
- p += '\\'
- p += 't'
-
- elif (ch == '\n'):
- p += '\\'
- p += 'n'
-
- elif (ch == '\r'):
- p += '\\'
- p += 'r'
- elif ch == '\\':
- p += '\\\\'
-
- #/* Map non-printable US ASCII to '\xhh' */
- elif (ch < ' ' or ord(ch) >= 0x7F) :
- p += '\\'
- p += 'x'
- p += '%02x' % ord(ch)
- #/* Copy everything else as-is */
- else:
- p += chr(ord(ch))
- pos += 1
- if (quotes):
- p += p[1]
- return p
-
def unicode_call_errorhandler(errors, encoding,
reason, input, startinpos, endinpos, decode=True):
@@ -522,160 +429,6 @@
-hexdigits = [hex(i)[-1] for i in range(16)]+[hex(i)[-1].upper() for i in range(10, 16)]
-
-def hexescape(s, pos, digits, message, errors):
- import sys
- chr = 0
- p = []
- if (pos+digits>len(s)):
- message = "end of string in escape sequence"
- x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2, len(s))
- p += x[0]
- pos = x[1]
- else:
- try:
- chr = int(s[pos:pos+digits], 16)
- except ValueError:
- endinpos = pos
- while s[endinpos] in hexdigits:
- endinpos += 1
- x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2,
- endinpos+1)
- p += x[0]
- pos = x[1]
- #/* when we get here, chr is a 32-bit unicode character */
- else:
- if chr <= sys.maxunicode:
- p += unichr(chr)
- pos += digits
-
- elif (chr <= 0x10ffff):
- chr -= 0x10000L
- p += unichr(0xD800 + (chr >> 10))
- p += unichr(0xDC00 + (chr & 0x03FF))
- pos += digits
- else:
- message = "illegal Unicode character"
- x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2,
- pos+1)
- p += x[0]
- pos = x[1]
- res = p
- return res, pos
-
-def PyUnicode_DecodeUnicodeEscape(s, size, errors):
- import sys
-
- if (size == 0):
- return u''
-
- p = []
- pos = 0
- while (pos < size):
-## /* Non-escape characters are interpreted as Unicode ordinals */
- if (s[pos] != '\\') :
- p += unichr(ord(s[pos]))
- pos += 1
- continue
-## /* \ - Escapes */
- else:
- pos += 1
- if pos >= len(s):
- errmessage = "\\ at end of string"
- unicode_call_errorhandler(errors, "unicodeescape", errmessage, s, pos-1, size)
- ch = s[pos]
- pos += 1
- ## /* \x escapes */
- if ch == '\n': pass
- elif ch == '\\' : p += u'\\'
- elif ch == '\'': p += u'\''
- elif ch == '\"': p += u'\"'
- elif ch == 'b' : p += u'\b'
- elif ch == 'f' : p += u'\014' #/* FF */
- elif ch == 't' : p += u'\t'
- elif ch == 'n' : p += u'\n'
- elif ch == 'r' : p += u'\r'
- elif ch == 'v': p += u'\013' #break; /* VT */
- elif ch == 'a': p += u'\007' # break; /* BEL, not classic C */
- elif '0' <= ch <= '7':
- x = ord(ch) - ord('0')
- if pos < size:
- ch = s[pos]
- if '0' <= ch <= '7':
- pos += 1
- x = (x<<3) + ord(ch) - ord('0')
- if pos < size:
- ch = s[pos]
- if '0' <= ch <= '7':
- pos += 1
- x = (x<<3) + ord(ch) - ord('0')
- p += unichr(x)
- ## /* hex escapes */
- ## /* \xXX */
- elif ch == 'x':
- digits = 2
- message = "truncated \\xXX escape"
- x = hexescape(s, pos, digits, message, errors)
- p += x[0]
- pos = x[1]
-
- # /* \uXXXX */
- elif ch == 'u':
- digits = 4
- message = "truncated \\uXXXX escape"
- x = hexescape(s, pos, digits, message, errors)
- p += x[0]
- pos = x[1]
-
- # /* \UXXXXXXXX */
- elif ch == 'U':
- digits = 8
- message = "truncated \\UXXXXXXXX escape"
- x = hexescape(s, pos, digits, message, errors)
- p += x[0]
- pos = x[1]
-## /* \N{name} */
- elif ch == 'N':
- message = "malformed \\N character escape"
- #pos += 1
- look = pos
- try:
- import unicodedata
- except ImportError:
- message = "\\N escapes not supported (can't load unicodedata module)"
- unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, size)
- if look < size and s[look] == '{':
- #/* look for the closing brace */
- while (look < size and s[look] != '}'):
- look += 1
- if (look > pos+1 and look < size and s[look] == '}'):
- #/* found a name. look it up in the unicode database */
- message = "unknown Unicode character name"
- st = s[pos+1:look]
- try:
- ch = unicodedata._get_code("%s" % st)
- except KeyError, e:
- x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1)
- p += x[0]
- pos = x[1]
- else:
- pos = look + 1
- if ch <= sys.maxunicode:
- p += unichr(ch)
- else:
- ch -= 0x10000L
- p += unichr(0xD800 + (ch >> 10))
- p += unichr(0xDC00 + (ch & 0x03FF))
- else:
- x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1)
- else:
- x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1)
- else:
- p += '\\'
- p += ch
- return p
-
def charmapencode_output(c, mapping):
rep = mapping[c]
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py Mon Jun 28 17:05:25 2010
@@ -1,5 +1,6 @@
from pypy.interpreter.error import OperationError, operationerrfmt
from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped, interp2app
+from pypy.interpreter.gateway import unwrap_spec
from pypy.interpreter.baseobjspace import W_Root
from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
from pypy.rlib.objectmodel import we_are_translated
@@ -13,6 +14,8 @@
self.decode_error_handler = self.make_errorhandler(space, True)
self.encode_error_handler = self.make_errorhandler(space, False)
+ self.unicodedata_getcode = None
+
def make_errorhandler(self, space, decode):
def unicode_call_errorhandler(errors, encoding, reason, input,
startpos, endpos):
@@ -53,6 +56,17 @@
return replace, newpos
return unicode_call_errorhandler
+ def get_unicodedata_function(self, space):
+ if self.unicodedata_getcode:
+ return self.unicodedata_getcode
+ w_builtin = space.getbuiltinmodule('__builtin__')
+ w_import = space.getattr(w_builtin, space.wrap("__import__"))
+ w_unicodedata = space.call_function(w_import,
+ space.wrap("unicodedata"))
+ self.unicodedata_getcode = space.getattr(w_unicodedata,
+ space.wrap("_get_code"))
+ return self.unicodedata_getcode
+
def _freeze_(self):
assert not self.codec_search_path
return False
@@ -184,7 +198,7 @@
while pos < end:
ch = obj[pos]
builder.append(u"&#")
- builder.append(unicode(ord(ch)))
+ builder.append(unicode(str(ord(ch))))
builder.append(u";")
pos += 1
return space.newtuple([space.wrap(builder.build()), w_end])
@@ -358,6 +372,7 @@
"utf_16_encode",
"utf_16_be_encode",
"utf_16_le_encode",
+ "unicode_escape_encode",
"raw_unicode_escape_encode",
]:
make_encoder_wrapper(encoders)
@@ -471,3 +486,40 @@
res = builder.build()
return space.newtuple([space.wrap(res), space.wrap(size)])
charmap_decode.unwrap_spec = [ObjSpace, str, str, W_Root]
+
+# ____________________________________________________________
+# Unicode escape
+
+class UnicodeData_Handler:
+ def __init__(self, space):
+ self.space = space
+ state = space.fromcache(CodecState)
+ self.get_code = state.get_unicodedata_function(space)
+
+ def call(self, name):
+ space = self.space
+ try:
+ w_code = space.call_function(self.get_code, space.wrap(name))
+ except OperationError, e:
+ if not e.match(space, space.w_KeyError):
+ raise
+ return -1
+ return space.int_w(w_code)
+
+@unwrap_spec(ObjSpace, 'bufferstr', str, W_Root)
+def unicode_escape_decode(space, string, errors="strict", w_final=False):
+ final = space.is_true(w_final)
+ state = space.fromcache(CodecState)
+ errorhandler=state.decode_error_handler
+
+ try:
+ unicode_name_handler = UnicodeData_Handler(space)
+ except OperationError:
+ unicode_name_handler = None
+
+ result, consumed = runicode.str_decode_unicode_escape(
+ string, len(string), errors,
+ final, state.decode_error_handler,
+ unicode_name_handler)
+
+ return space.newtuple([space.wrap(result), space.wrap(consumed)])
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py Mon Jun 28 17:05:25 2010
@@ -1,7 +1,6 @@
import autopath
from pypy.conftest import gettestobjspace
-from pypy.module._codecs.app_codecs import unicode_escape_encode,\
- charmap_encode, unicode_escape_decode
+from pypy.module._codecs.app_codecs import charmap_encode
class AppTestCodecs:
@@ -120,6 +119,11 @@
map = tuple([unichr(i) for i in range(256)])
assert charmap_decode('xxx\xff', 'strict', map) == (u'xxx\xff', 4)
+ def test_unicode_escape(self):
+ from _codecs import unicode_escape_encode, unicode_escape_decode
+ assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
+ assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
+ assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
class AppTestPartialEvaluation:
@@ -542,6 +546,7 @@
def test_unicode_escape(self):
assert u'\\'.encode('unicode-escape') == '\\\\'
assert '\\\\'.decode('unicode-escape') == u'\\'
+ assert u'\ud801'.encode('unicode-escape') == '\\ud801'
def test_mbcs(self):
import sys
@@ -557,8 +562,3 @@
def test_charmap_encode(self):
assert charmap_encode(u'xxx') == ('xxx', 3)
assert charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 6)
-
- def test_unicode_escape(self):
- assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
- assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
- assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
Modified: pypy/branch/interplevel-codecs/pypy/objspace/std/unicodeobject.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/objspace/std/unicodeobject.py (original)
+++ pypy/branch/interplevel-codecs/pypy/objspace/std/unicodeobject.py Mon Jun 28 17:05:25 2010
@@ -12,6 +12,7 @@
from pypy.rlib.rarithmetic import intmask, ovfcheck
from pypy.rlib.objectmodel import compute_hash
from pypy.rlib.rstring import string_repeat
+from pypy.rlib.runicode import unicode_encode_unicode_escape
from pypy.module.unicodedata import unicodedb_4_1_0 as unicodedb
from pypy.tool.sourcetools import func_with_new_name
@@ -892,101 +893,11 @@
space.wrap("character mapping must return integer, None or unicode"))
return W_UnicodeObject(u''.join(result))
-# Move this into the _codecs module as 'unicodeescape_string (Remember to cater for quotes)'
def repr__Unicode(space, w_unicode):
- hexdigits = "0123456789abcdef"
chars = w_unicode._value
size = len(chars)
-
- singlequote = doublequote = False
- for c in chars:
- if c == u'\'':
- singlequote = True
- elif c == u'"':
- doublequote = True
- if singlequote and not doublequote:
- quote = '"'
- else:
- quote = '\''
- result = ['u', quote]
- j = 0
- while j<len(chars):
- ch = chars[j]
- code = ord(ch)
- if code >= 0x10000:
- # Resize if needed
- result.extend(['\\', "U",
- hexdigits[(code >> 28) & 0xf],
- hexdigits[(code >> 24) & 0xf],
- hexdigits[(code >> 20) & 0xf],
- hexdigits[(code >> 16) & 0xf],
- hexdigits[(code >> 12) & 0xf],
- hexdigits[(code >> 8) & 0xf],
- hexdigits[(code >> 4) & 0xf],
- hexdigits[(code >> 0) & 0xf],
- ])
- j += 1
- continue
- if code >= 0xD800 and code < 0xDC00:
- if j < size - 1:
- ch2 = chars[j+1]
- code2 = ord(ch2)
- if code2 >= 0xDC00 and code2 <= 0xDFFF:
- code = (((code & 0x03FF) << 10) | (code2 & 0x03FF)) + 0x00010000
- result.extend(['\\', "U",
- hexdigits[(code >> 28) & 0xf],
- hexdigits[(code >> 24) & 0xf],
- hexdigits[(code >> 20) & 0xf],
- hexdigits[(code >> 16) & 0xf],
- hexdigits[(code >> 12) & 0xf],
- hexdigits[(code >> 8) & 0xf],
- hexdigits[(code >> 4) & 0xf],
- hexdigits[(code >> 0) & 0xf],
- ])
- j += 2
- continue
-
- if code >= 0x100:
- result.extend(['\\', "u",
- hexdigits[(code >> 12) & 0xf],
- hexdigits[(code >> 8) & 0xf],
- hexdigits[(code >> 4) & 0xf],
- hexdigits[(code >> 0) & 0xf],
- ])
- j += 1
- continue
- if code == ord('\\') or code == ord(quote):
- result.append('\\')
- result.append(chr(code))
- j += 1
- continue
- if code == ord('\t'):
- result.append('\\')
- result.append('t')
- j += 1
- continue
- if code == ord('\r'):
- result.append('\\')
- result.append('r')
- j += 1
- continue
- if code == ord('\n'):
- result.append('\\')
- result.append('n')
- j += 1
- continue
- if code < ord(' ') or code >= 0x7f:
- result.extend(['\\', "x",
- hexdigits[(code >> 4) & 0xf],
- hexdigits[(code >> 0) & 0xf],
- ])
- j += 1
- continue
- result.append(chr(code))
- j += 1
- result.append(quote)
- return space.wrap(''.join(result))
-
+ s = unicode_encode_unicode_escape(chars, size, "strict", quotes=True)
+ return space.wrap(s)
def mod__Unicode_ANY(space, w_format, w_values):
return mod_format(space, w_format, w_values, do_unicode=True)
Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py (original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py Mon Jun 28 17:05:25 2010
@@ -319,7 +319,7 @@
result = UnicodeBuilder(size // 2)
#XXX I think the errors are not correctly handled here
- while (pos < len(s)):
+ while pos < size:
# remaining bytes at the end? (size should be even)
if len(s) - pos < 2:
if not final:
@@ -425,7 +425,7 @@
# latin1 is equivalent to the first 256 ordinals in Unicode.
pos = 0
result = UnicodeBuilder(size)
- while (pos < size):
+ while pos < size:
result.append(unichr(ord(s[pos])))
pos += 1
return result.build(), pos
@@ -438,7 +438,7 @@
# ASCII is equivalent to the first 128 ordinals in Unicode.
result = UnicodeBuilder(size)
pos = 0
- while pos < len(s):
+ while pos < size:
c = s[pos]
if ord(c) < 128:
result.append(unichr(ord(c)))
@@ -465,7 +465,7 @@
return ''
result = StringBuilder(size)
pos = 0
- while pos < len(p):
+ while pos < size:
ch = p[pos]
if ord(ch) < limit:
@@ -492,6 +492,248 @@
return res
# ____________________________________________________________
+# Unicode escape
+
+hexdigits = "0123456789ABCDEFabcdef"
+
+def hexescape(builder, s, pos, digits, errorhandler, message, errors):
+ import sys
+ chr = 0
+ if (pos+digits>len(s)):
+ message = "end of string in escape sequence"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-2, len(s))
+ builder.append(res)
+ else:
+ try:
+ chr = int(s[pos:pos+digits], 16)
+ except ValueError:
+ endinpos = pos
+ while s[endinpos] in hexdigits:
+ endinpos += 1
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-2, endinpos+1)
+ builder.append(res)
+ else:
+ # when we get here, chr is a 32-bit unicode character
+ if chr <= MAXUNICODE:
+ builder.append(unichr(chr))
+ pos += digits
+
+ elif (chr <= 0x10ffff):
+ chr -= 0x10000L
+ builder.append(unichr(0xD800 + (chr >> 10)))
+ builder.append(unichr(0xDC00 + (chr & 0x03FF)))
+ pos += digits
+ else:
+ message = "illegal Unicode character"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-2, pos+1)
+ builder.append(res)
+ return pos
+
+def str_decode_unicode_escape(s, size, errors, final=False,
+ errorhandler=False,
+ unicodedata_handler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+
+ if (size == 0):
+ return u'', 0
+
+ builder = UnicodeBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ # Non-escape characters are interpreted as Unicode ordinals
+ if (ch != '\\') :
+ builder.append(unichr(ord(ch)))
+ pos += 1
+ continue
+
+ # - Escapes
+ pos += 1
+ if pos >= size:
+ message = "\\ at end of string"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, size)
+ builder.append(res)
+ continue
+
+ ch = s[pos]
+ pos += 1
+ # \x escapes */
+ if ch == '\n': pass
+ elif ch == '\\': builder.append(u'\\')
+ elif ch == '\'': builder.append(u'\'')
+ elif ch == '\"': builder.append(u'\"')
+ elif ch == 'b' : builder.append(u'\b')
+ elif ch == 'f' : builder.append(u'\f')
+ elif ch == 't' : builder.append(u'\t')
+ elif ch == 'n' : builder.append(u'\n')
+ elif ch == 'r' : builder.append(u'\r')
+ elif ch == 'v' : builder.append(u'\v')
+ elif ch == 'a' : builder.append(u'\a')
+ elif '0' <= ch <= '7':
+ x = ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ builder.append(unichr(x))
+ # hex escapes
+ # \xXX
+ elif ch == 'x':
+ digits = 2
+ message = "truncated \\xXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ errorhandler, message, errors)
+
+ # \uXXXX
+ elif ch == 'u':
+ digits = 4
+ message = "truncated \\uXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ errorhandler, message, errors)
+
+ # \UXXXXXXXX
+ elif ch == 'U':
+ digits = 8
+ message = "truncated \\UXXXXXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ errorhandler, message, errors)
+
+ # \N{name}
+ elif ch == 'N':
+ message = "malformed \\N character escape"
+ #pos += 1
+ look = pos
+ if unicodedata_handler is None:
+ message = ("\\N escapes not supported "
+ "(can't load unicodedata module)")
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, size)
+ builder.append(res)
+ continue
+
+ if look < size and s[look] == '{':
+ # look for the closing brace
+ while (look < size and s[look] != '}'):
+ look += 1
+ if (look > pos+1 and look < size and s[look] == '}'):
+ # found a name. look it up in the unicode database
+ message = "unknown Unicode character name"
+ name = s[pos+1:look]
+ code = unicodedata_handler.call(name)
+ if code < 0:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ continue
+ pos = look + 1
+ if code <= MAXUNICODE:
+ builder.append(unichr(code))
+ else:
+ code -= 0x10000L
+ builder.append(unichr(0xD800 + (code >> 10)))
+ builder.append(unichr(0xDC00 + (code & 0x03FF)))
+ else:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ builder.append(u'\\')
+ builder.append(unichr(ord(ch)))
+
+ return builder.build(), pos
+
+def unicode_encode_unicode_escape(s, size, errors, errorhandler=None, quotes=False):
+ # errorhandler is not used: this function cannot cause Unicode errors
+ result = StringBuilder(size)
+
+ if quotes:
+ if s.find(u'\'') != -1 and s.find(u'\"') == -1:
+ quote = ord('\"')
+ result.append('u"')
+ else:
+ quote = ord('\'')
+ result.append('u\'')
+ else:
+ quote = 0
+
+ if size == 0:
+ return ''
+
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+ oc = ord(ch)
+
+ # Escape quotes
+ if quotes and (oc == quote or ch == '\\'):
+ result.append('\\')
+ result.append(chr(oc))
+ pos += 1
+ continue
+
+ if oc > 0x10000:
+ raw_unicode_escape_helper(result, oc)
+ pos += 1
+ continue
+
+ if 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+ # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
+ pos += 1
+ oc2 = ord(s[pos])
+
+ if 0xDC00 <= oc2 < 0xDFFF:
+ ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
+ raw_unicode_escape_helper(result, ucs)
+ pos += 1
+ continue
+ # Fall through: isolated surrogates are copied as-is
+ pos -= 1
+
+ # Map 16-bit characters to '\uxxxx'
+ if oc >= 0x100:
+ raw_unicode_escape_helper(result, oc)
+ pos += 1
+ continue
+
+ # Map special whitespace to '\t', \n', '\r'
+ if ch == '\t':
+ result.append('\\t')
+ elif ch == '\n':
+ result.append('\\n')
+ elif ch == '\r':
+ result.append('\\r')
+ elif ch == '\\':
+ result.append('\\\\')
+
+ # Map non-printable US ASCII to '\xhh'
+ elif (oc < 32 or oc >= 0x7F) :
+ raw_unicode_escape_helper(result, oc)
+ # Copy everything else as-is
+ else:
+ result.append(chr(oc))
+ pos += 1
+
+ if quotes:
+ result.append(chr(quote))
+ return result.build()
+
+# ____________________________________________________________
# Raw unicode escape
def str_decode_raw_unicode_escape(s, size, errors, final=False,
@@ -503,7 +745,7 @@
result = UnicodeBuilder(size)
pos = 0
- while pos < len(s):
+ while pos < size:
ch = s[pos]
# Non-escape characters are interpreted as Unicode ordinals
@@ -518,7 +760,7 @@
bs = pos
while pos < size:
pos += 1
- if (s[pos] != '\\'):
+ if pos == size or s[pos] != '\\':
break
result.append(u'\\')
@@ -564,6 +806,19 @@
return result.build(), pos
+def raw_unicode_escape_helper(result, char):
+ num = hex(char)
+ if char >= 0x10000:
+ result.append("\\U")
+ zeros = 8
+ else:
+ result.append("\\u")
+ zeros = 4
+ nb = zeros + 2 - len(num) # num starts with '0x'
+ if nb > 0:
+ result.append_multiple_char('0', nb)
+ result.append_slice(num, 2, 8)
+
def unicode_encode_raw_unicode_escape(s, size, errors, errorhandler=None):
# errorhandler is not used: this function cannot cause Unicode errors
if (size == 0):
@@ -575,17 +830,7 @@
if oc < 0x100:
result.append(chr(oc))
else:
- num = hex(oc)
- if (oc >= 0x10000):
- result.append("\\U")
- zeros = 8
- else:
- result.append("\\u")
- zeros = 4
- nb = zeros + 2 - len(num) # num starts with '0x'
- if nb > 0:
- result.append_multiple_char('0', nb)
- result.append_slice(num, 2, 8)
+ raw_unicode_escape_helper(result, oc)
pos += 1
return result.build()
More information about the Pypy-commit
mailing list