[pypy-commit] pypy unicode-utf8: Start fighting on a new branch with utf8 being the default storage
fijal
pypy.commits at gmail.com
Tue Feb 21 06:17:07 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90253:c9309766eac6
Date: 2017-02-21 12:17 +0100
http://bitbucket.org/pypy/pypy/changeset/c9309766eac6/
Log: Start fighting on a new branch with utf8 being the default storage
for W_Unicode object
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -57,6 +57,7 @@
assert 0 <= ps <= q
substr = s[ps:q]
else:
+ xxx
substr = decode_unicode_utf8(space, s, ps, q)
if rawmode:
v = unicodehelper.decode_raw_unicode_escape(space, substr)
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -38,14 +38,16 @@
def decode_unicode_escape(space, string):
state = space.fromcache(interp_codecs.CodecState)
unicodedata_handler = state.get_unicodedata_handler(space)
- result, consumed = runicode.str_decode_unicode_escape(
+ # XXX pick better length, maybe
+ result, consumed = runicode.str_decode_utf8_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space),
unicodedata_handler=unicodedata_handler)
return result
def decode_raw_unicode_escape(space, string):
- result, consumed = runicode.str_decode_raw_unicode_escape(
+ # XXX pick better length, maybe
+ result, consumed = runicode.str_decode_raw_utf8_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space))
return result
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -5,7 +5,7 @@
from pypy.interpreter import gateway
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import unwrap_spec, WrappedDefault
-from rpython.rlib.runicode import UNICHR
+from rpython.rlib.runicode import UNICHR, unichr_as_utf8
from rpython.rlib.rfloat import isnan, isinf, round_double
from rpython.rlib import rfloat
import __builtin__
@@ -25,12 +25,12 @@
@unwrap_spec(code=int)
def unichr(space, code):
"Return a Unicode string of one character with the given ordinal."
- # XXX range checking!
+ # XXX this assumes unichr would be happy to return you surrogates
try:
- c = UNICHR(code)
+ s = unichr_as_utf8(code)
except ValueError:
raise oefmt(space.w_ValueError, "unichr() arg out of range")
- return space.newunicode(c)
+ return space.newunicode(s)
def len(space, w_obj):
"len(object) -> integer\n\nReturn the number of items of a sequence or mapping."
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -1,11 +1,10 @@
"""The builtin str implementation"""
from rpython.rlib import jit
-from rpython.rlib.jit import we_are_jitted
from rpython.rlib.objectmodel import (
compute_hash, compute_unique_id, import_from_mixin)
from rpython.rlib.buffer import StringBuffer
-from rpython.rlib.rstring import StringBuilder, replace
+from rpython.rlib.rstring import StringBuilder
from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.error import OperationError, oefmt
@@ -18,7 +17,7 @@
from pypy.objspace.std.stringmethods import StringMethods
from pypy.objspace.std.unicodeobject import (
decode_object, unicode_from_encoded_object,
- unicode_from_string, getdefaultencoding)
+ getdefaultencoding)
from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -346,10 +346,10 @@
return self.w_None
return self.newtext(s)
- def newunicode(self, uni):
- assert uni is not None
- assert isinstance(uni, unicode)
- return W_UnicodeObject(uni)
+ def newunicode(self, utf8s):
+ assert utf8s is not None
+ assert isinstance(utf8s, str)
+ return W_UnicodeObject(utf8s)
def type(self, w_obj):
jit.promote(w_obj.__class__)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -7,7 +7,8 @@
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib.runicode import (
make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
- unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
+ unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
+ check_ascii, AsciiCheckError)
from pypy.interpreter import unicodehelper
from pypy.interpreter.baseobjspace import W_Root
@@ -23,21 +24,24 @@
__all__ = ['W_UnicodeObject', 'wrapunicode', 'plain_str2unicode',
'encode_object', 'decode_object', 'unicode_from_object',
- 'unicode_from_string', 'unicode_to_decimal_w']
+ 'utf8_from_string', 'unicode_to_decimal_w']
class W_UnicodeObject(W_Root):
import_from_mixin(StringMethods)
- _immutable_fields_ = ['_value']
+ _immutable_fields_ = ['_utf8']
- @enforceargs(uni=unicode)
- def __init__(self, unistr):
- assert isinstance(unistr, unicode)
- self._value = unistr
+ @enforceargs(utf8str=str)
+ def __init__(self, utf8str, ucs4str=None):
+ assert isinstance(utf8str, str)
+ if ucs4str is not None:
+ assert isinstance(ucs4str, unicode)
+ self._utf8 = utf8str
+ self._ucs4 = ucs4str
def __repr__(self):
"""representation for debugging purposes"""
- return "%s(%r)" % (self.__class__.__name__, self._value)
+ return "%s(%r)" % (self.__class__.__name__, self._utf8)
def unwrap(self, space):
# for testing
@@ -79,8 +83,8 @@
def str_w(self, space):
return space.text_w(space.str(self))
- def unicode_w(self, space):
- return self._value
+ def utf8_w(self, space):
+ return self._utf8
def readbuf_w(self, space):
from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
@@ -117,7 +121,7 @@
def _len(self):
return len(self._value)
- _val = unicode_w
+ _val = utf8_w
@staticmethod
def _use_rstr_ops(space, w_other):
@@ -128,9 +132,10 @@
@staticmethod
def _op_val(space, w_other, strict=None):
if isinstance(w_other, W_UnicodeObject):
- return w_other._value
+ return w_other._utf8
if space.isinstance_w(w_other, space.w_bytes):
- return unicode_from_string(space, w_other)._value
+ return utf8_from_string(space, w_other)._utf8
+ yyy
if strict:
raise oefmt(space.w_TypeError,
"%s arg must be None, unicode or str", strict)
@@ -235,7 +240,7 @@
return encode_object(space, self, None, None)
def descr_hash(self, space):
- x = compute_hash(self._value)
+ x = compute_hash(self._utf8)
return space.newint(x)
def descr_eq(self, space, w_other):
@@ -564,17 +569,19 @@
return unicode_from_encoded_object(space, w_res, None, "strict")
-def unicode_from_string(space, w_bytes):
+def utf8_from_string(space, w_bytes):
# this is a performance and bootstrapping hack
encoding = getdefaultencoding(space)
if encoding != 'ascii':
+ xxx
return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
s = space.bytes_w(w_bytes)
try:
- return W_UnicodeObject(s.decode("ascii"))
- except UnicodeDecodeError:
+ check_ascii(s)
+ except AsciiCheckError:
# raising UnicodeDecodeError is messy, "please crash for me"
return unicode_from_encoded_object(space, w_bytes, "ascii", "strict")
+ return W_UnicodeObject(s)
class UnicodeDocstrings:
@@ -1121,7 +1128,7 @@
return [s for s in value]
-W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
+W_UnicodeObject.EMPTY = W_UnicodeObject('')
# Helper for converting int/long
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -43,6 +43,36 @@
return ord(u[0])
raise TypeError
+def unichr_as_utf8(code):
+    """Encode the code point `code` (an integer) as a UTF-8 byte string.
+
+    Returns a str of one to four bytes.  Raises ValueError if `code` is
+    negative or greater than 0x10ffff.  Surrogate code points
+    (0xd800-0xdfff) are not rejected here; they are encoded as three
+    bytes like any other value below 0x10000.
+    """
+    if code < 0:
+        raise ValueError
+    if code < 0x80:
+        # one byte: plain ASCII
+        return chr(code)
+    if code < 0x0800:
+        # two bytes: 110xxxxx 10xxxxxx
+        return chr((0xc0 | (code >> 6))) + chr((0x80 | (code & 0x3f)))
+    if code < 0x10000:
+        # three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+        return (chr((0xe0 | (code >> 12))) +
+                chr((0x80 | ((code >> 6) & 0x3f))) +
+                chr((0x80 | (code & 0x3f))))
+    if code <= 0x10ffff:
+        # four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        # (inclusive bound: 0x10ffff is itself a valid code point)
+        return (chr((0xf0 | (code >> 18))) +
+                chr((0x80 | ((code >> 12) & 0x3f))) +
+                chr((0x80 | ((code >> 6) & 0x3f))) +
+                chr((0x80 | (code & 0x3f))))
+    raise ValueError
+
+class AsciiCheckError(Exception):
+    """Raised by check_ascii() when a character has its 0x80 bit set."""
+
+def check_ascii(s):
+    """Verify that no character of `s` has the 0x80 bit set.
+
+    Returns None on success and raises AsciiCheckError at the first
+    offending character.
+    """
+    for ch in s:
+        if ord(ch) & 0x80 != 0:
+            raise AsciiCheckError
+
if MAXUNICODE > sys.maxunicode:
# A version of unichr which allows codes outside the BMP
# even on narrow unicode builds.
@@ -1377,6 +1407,129 @@
return builder.build(), pos
+def str_decode_utf8_escape(s, size, errors, final=False,
+ errorhandler=None,
+ unicodedata_handler=None):
+ """Decode backslash escapes in s[0:size], building a byte string.
+
+ Characters other than a backslash are copied to the output unchanged.
+ Returns a tuple (decoded string, pos), where pos is the input position
+ reached when the loop ended.  An empty input returns ('', 0).
+
+ `errors` is passed through to `errorhandler`, which defaults to
+ default_unicode_error_decode.  `unicodedata_handler` enables \\N{name}
+ escapes when it is not None.  `final` is accepted but not used in this
+ body.
+
+ NOTE(review): the octal, \\x, \\u, \\U and \\N branches each start with
+ the bare name `xxx`, which is not defined anywhere in the visible
+ source -- presumably a deliberate work-in-progress marker that fails
+ as soon as such a branch is reached; confirm before relying on them.
+ """
+ if errorhandler is None:
+ errorhandler = default_unicode_error_decode
+
+ if size == 0:
+ return '', 0
+
+ builder = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ # Non-escape characters are appended to the output as they are
+ if ch != '\\':
+ builder.append(ch)
+ pos += 1
+ continue
+
+ # - Escapes
+ pos += 1
+ if pos >= size:
+ # a lone backslash is the last character: report it at pos-1
+ message = "\\ at end of string"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, size)
+ builder.append(res)
+ continue
+
+ ch = s[pos]
+ pos += 1
+ # simple one-character escapes; backslash-newline produces no output
+ if ch == '\n': pass
+ elif ch == '\\': builder.append('\\')
+ elif ch == '\'': builder.append('\'')
+ elif ch == '\"': builder.append('\"')
+ elif ch == 'b' : builder.append('\b')
+ elif ch == 'f' : builder.append('\f')
+ elif ch == 't' : builder.append('\t')
+ elif ch == 'n' : builder.append('\n')
+ elif ch == 'r' : builder.append('\r')
+ elif ch == 'v' : builder.append('\v')
+ elif ch == 'a' : builder.append('\a')
+ # octal escapes: one to three digits, accumulated into x
+ elif '0' <= ch <= '7':
+ xxx
+ x = ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ # NOTE(review): appends the result of unichr() to a StringBuilder;
+ # presumably this needs to become a utf8 encoding of x -- confirm
+ builder.append(unichr(x))
+ # hex escapes
+ # \xXX
+ elif ch == 'x':
+ xxx
+ digits = 2
+ message = "truncated \\xXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \uXXXX
+ elif ch == 'u':
+ xxx
+ digits = 4
+ message = "truncated \\uXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \UXXXXXXXX
+ elif ch == 'U':
+ xxx
+ digits = 8
+ message = "truncated \\UXXXXXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \N{name}
+ elif ch == 'N' and unicodedata_handler is not None:
+ xxx
+ message = "malformed \\N character escape"
+ look = pos
+
+ if look < size and s[look] == '{':
+ # look for the closing brace
+ while look < size and s[look] != '}':
+ look += 1
+ if look < size and s[look] == '}':
+ # found a name. look it up in the unicode database
+ message = "unknown Unicode character name"
+ name = s[pos+1:look]
+ code = unicodedata_handler.call(name)
+ if code < 0:
+ # a negative result is treated as "name not found"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ continue
+ pos = look + 1
+ if code <= MAXUNICODE:
+ builder.append(UNICHR(code))
+ else:
+ # code does not fit: emit it as a surrogate pair
+ code -= 0x10000L
+ builder.append(unichr(0xD800 + (code >> 10)))
+ builder.append(unichr(0xDC00 + (code & 0x03FF)))
+ else:
+ # opening brace without a closing one
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ # \N not followed by an opening brace
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ # unrecognised escape: keep the backslash and the character
+ builder.append('\\')
+ builder.append(ch)
+
+ return builder.build(), pos
+
def make_unicode_escape_function(pass_printable=False, unicode_output=False,
quotes=False, prefix=None):
# Python3 has two similar escape functions: One to implement
@@ -1497,6 +1650,54 @@
# ____________________________________________________________
# Raw unicode escape
+def str_decode_raw_utf8_escape(s, size, errors, final=False,
+ errorhandler=None):
+ """Decode raw-unicode-escape style input s[0:size] into a byte string.
+
+ Only \\uXXXX and \\UXXXXXXXX are treated as escapes, and only when the
+ run of backslashes in front of them has odd length; everything else,
+ backslashes included, is copied to the output.  Returns a tuple
+ (decoded string, pos); an empty input returns ('', 0).
+
+ `errors` is passed through to hexescape() together with
+ `errorhandler`, which defaults to default_unicode_error_decode.
+ `final` is accepted but not used in this body.
+
+ NOTE(review): the escape branch contains the bare name `xxx`, which
+ is not defined anywhere in the visible source -- presumably a
+ deliberate work-in-progress marker; confirm before relying on it.
+ """
+ if errorhandler is None:
+ errorhandler = default_unicode_error_decode
+ if size == 0:
+ return '', 0
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ # Non-escape characters are appended to the output as they are
+ if ch != '\\':
+ result.append(ch)
+ pos += 1
+ continue
+
+ # \u-escapes are only interpreted iff the number of leading
+ # backslashes is odd
+ # bs remembers where the run of backslashes starts; every backslash
+ # after the first is emitted here, the first one is held back
+ bs = pos
+ while pos < size:
+ pos += 1
+ if pos == size or s[pos] != '\\':
+ break
+ result.append('\\')
+
+ # we have a backslash at the end of the string, stop here
+ if pos >= size:
+ result.append('\\')
+ break
+
+ # pos < size is guaranteed here by the check just above.
+ # Even-length run or no u/U following: emit the held-back backslash
+ # and the character after the run unchanged.
+ if ((pos - bs) & 1 == 0 or
+ pos >= size or
+ (s[pos] != 'u' and s[pos] != 'U')):
+ result.append('\\')
+ result.append(s[pos])
+ pos += 1
+ continue
+
+ # 4 hex digits for \u, 8 for \U; the same message is used for both
+ digits = 4 if s[pos] == 'u' else 8
+ message = "truncated \\uXXXX"
+ pos += 1
+ xxx # change hexescape to deal with utf8
+ pos = hexescape(result, s, pos, digits,
+ "rawunicodeescape", errorhandler, message, errors)
+
+ return result.build(), pos
+
def str_decode_raw_unicode_escape(s, size, errors, final=False,
errorhandler=None):
if errorhandler is None:
More information about the pypy-commit
mailing list