[pypy-commit] pypy default: Try to make at least the utf-8 encoding/decoding functions elidable.
arigo
pypy.commits at gmail.com
Sat Feb 18 11:02:51 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r90178:bfe85246978d
Date: 2017-02-18 15:54 +0100
http://bitbucket.org/pypy/pypy/changeset/bfe85246978d/
Log: Try to make at least the utf-8 encoding/decoding functions elidable.
This is messy for several reasons. Now if you call the functions
with an errorhandler that can't be used inside an @elidable, then
you have to fix the calls to invoke another version.
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -442,7 +442,10 @@
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
- result = runicode.unicode_encode_utf_8(
+ # NB. can't call unicode_encode_utf_8() directly because that's
+ # an @elidable function nowadays. Instead, we need the _impl().
+ # (The problem is the errorhandler, which calls arbitrary Python.)
+ result = runicode.unicode_encode_utf_8_impl(
uni, len(uni), errors, state.encode_error_handler,
allow_surrogates=True)
return space.newtuple([space.newbytes(result), space.newint(len(uni))])
@@ -454,7 +457,10 @@
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- result, consumed = runicode.str_decode_utf_8(
+ # NB. can't call str_decode_utf_8() directly because that's
+ # an @elidable function nowadays. Instead, we need the _impl().
+ # (The problem is the errorhandler, which calls arbitrary Python.)
+ result, consumed = runicode.str_decode_utf_8_impl(
string, len(string), errors,
final, state.decode_error_handler,
allow_surrogates=True)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -3,8 +3,10 @@
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib.rarithmetic import r_uint, intmask, widen
from rpython.rlib.unicodedata import unicodedb
+from rpython.tool.sourcetools import func_with_new_name
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rlib import jit
+from rpython.rlib.nonconst import NonConstant
if rffi.sizeof(lltype.UniChar) == 4:
@@ -127,15 +129,24 @@
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 # F0-F4 - F5-FF
]
+# if you can't use the @elidable version, call str_decode_utf_8_impl()
+# directly
+@jit.elidable
def str_decode_utf_8(s, size, errors, final=False,
errorhandler=None, allow_surrogates=allow_surrogate_by_default):
if errorhandler is None:
errorhandler = default_unicode_error_decode
- result = UnicodeBuilder(size)
- pos = str_decode_utf_8_impl(s, size, errors, final, errorhandler,
- allow_surrogates=allow_surrogates,
- result=result)
- return result.build(), pos
+ # NB. a bit messy because rtyper/rstr.py also calls the same
+ # function. Make sure we annotate for the args it passes, too
+ if NonConstant(False):
+ s = '?????'
+ size = 12345
+ errors = 'strict'
+ final = True
+ errorhandler = default_unicode_error_decode
+ allow_surrogates = False
+ return str_decode_utf_8_elidable(s, size, errors, final, errorhandler,
+ allow_surrogates=allow_surrogates)
def _invalid_cont_byte(ordch):
return ordch>>6 != 0x2 # 0b10
@@ -157,14 +168,12 @@
(ordch1 == 0xf0 and ordch2 < 0x90) or
(ordch1 == 0xf4 and ordch2 > 0x8f))
-# note: this specialize() is here for rtyper/rstr.py, which calls this
-# function too but with its own fixed errorhandler
-@specialize.arg_or_var(4)
def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
- allow_surrogates, result):
+ allow_surrogates):
if size == 0:
- return 0
+ return u'', 0
+ result = UnicodeBuilder(size)
pos = 0
while pos < size:
ordch1 = ord(s[pos])
@@ -316,7 +325,9 @@
result.append(unichr(0xDC00 + (c & 0x03FF)))
pos += 4
- return pos
+ return result.build(), pos
+str_decode_utf_8_elidable = jit.elidable(
+ func_with_new_name(str_decode_utf_8_impl, "str_decode_utf_8_elidable"))
def _encodeUCS4(result, ch):
# Encode UCS4 Unicode ordinals
@@ -325,6 +336,9 @@
result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
result.append((chr((0x80 | (ch & 0x3f)))))
+# if you can't use the @elidable version, call unicode_encode_utf_8_impl()
+# directly
+@jit.elidable
def unicode_encode_utf_8(s, size, errors, errorhandler=None,
allow_surrogates=allow_surrogate_by_default):
# In this function, allow_surrogates can be:
@@ -339,12 +353,17 @@
#
if errorhandler is None:
errorhandler = default_unicode_error_encode
- return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
- allow_surrogates=allow_surrogates)
+ # NB. a bit messy because rtyper/rstr.py also calls the same
+ # function. Make sure we annotate for the args it passes, too
+ if NonConstant(False):
+ s = u'?????'
+ size = 12345
+ errors = 'strict'
+ errorhandler = default_unicode_error_encode
+ allow_surrogates = False
+ return unicode_encode_utf_8_elidable(s, size, errors, errorhandler,
+ allow_surrogates=allow_surrogates)
-# note: this specialize() is here for rtyper/rstr.py, which calls this
-# function too but with its own fixed errorhandler
-@specialize.arg_or_var(3)
def unicode_encode_utf_8_impl(s, size, errors, errorhandler,
allow_surrogates=False):
assert(size >= 0)
@@ -400,6 +419,9 @@
else:
_encodeUCS4(result, ch)
return result.build()
+unicode_encode_utf_8_elidable = jit.elidable(
+ func_with_new_name(unicode_encode_utf_8_impl,
+ "unicode_encode_utf_8_elidable"))
def unicode_encode_utf8sp(s, size):
# Surrogate-preserving utf-8 encoding. Any surrogate character
diff --git a/rpython/rtyper/rstr.py b/rpython/rtyper/rstr.py
--- a/rpython/rtyper/rstr.py
+++ b/rpython/rtyper/rstr.py
@@ -9,7 +9,6 @@
from rpython.rtyper.rfloat import FloatRepr
from rpython.tool.pairtype import pairtype, pair
from rpython.tool.sourcetools import func_with_new_name
-from rpython.rlib.rstring import UnicodeBuilder
class AbstractStringRepr(Repr):
@@ -18,31 +17,21 @@
Repr.__init__(self, *args)
self.rstr_decode_utf_8 = None
- def ensure_ll_decode_utf8(self):
- from rpython.rlib.runicode import str_decode_utf_8_impl
- self.rstr_decode_utf_8 = func_with_new_name(str_decode_utf_8_impl,
- 'rstr_decode_utf_8_impl')
-
@jit.elidable
def ll_decode_utf8(self, llvalue):
from rpython.rtyper.annlowlevel import hlstr
+ from rpython.rlib import runicode
value = hlstr(llvalue)
assert value is not None
- result = UnicodeBuilder(len(value))
- self.rstr_decode_utf_8(
- value, len(value), 'strict', final=True,
- errorhandler=self.ll_raise_unicode_exception_decode,
- allow_surrogates=False, result=result)
+ errorhandler = runicode.default_unicode_error_decode
+ u, pos = runicode.str_decode_utf_8_elidable(
+ value, len(value), 'strict', True, errorhandler, False)
# XXX should it really be 'allow_surrogates=False'? In RPython,
# unicode.decode('utf-8') happily accepts surrogates. This
# makes it hard to test untranslated (it's the cause of a
# failure in lib-python's test_warnings on PyPy3, for example)
- return self.ll.llunicode(result.build())
-
- @staticmethod
- def ll_raise_unicode_exception_decode(errors, encoding, msg, s,
- startingpos, endingpos):
- raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
+ # XXX maybe the whole ''.decode('utf-8') should be not RPython.
+ return self.ll.llunicode(u)
def _str_reprs(self, hop):
return hop.args_r[0].repr, hop.args_r[1].repr
@@ -351,7 +340,6 @@
elif encoding == 'latin-1':
return hop.gendirectcall(self.ll_decode_latin1, v_self)
elif encoding == 'utf-8':
- self.ensure_ll_decode_utf8()
return hop.gendirectcall(self.ll_decode_utf8, v_self)
else:
raise TyperError("encoding %s not implemented" % (encoding, ))
@@ -394,11 +382,6 @@
AbstractStringRepr.__init__(self, *args)
self.runicode_encode_utf_8 = None
- def ensure_ll_encode_utf8(self):
- from rpython.rlib.runicode import unicode_encode_utf_8_impl
- self.runicode_encode_utf_8 = func_with_new_name(
- unicode_encode_utf_8_impl, 'runicode_encode_utf_8')
-
def rtype_method_upper(self, hop):
raise TyperError("Cannot do toupper on unicode string")
@@ -408,19 +391,15 @@
@jit.elidable
def ll_encode_utf8(self, ll_s):
from rpython.rtyper.annlowlevel import hlunicode
+ from rpython.rlib import runicode
s = hlunicode(ll_s)
assert s is not None
- bytes = self.runicode_encode_utf_8(
+ errorhandler = runicode.default_unicode_error_encode
+ bytes = runicode.unicode_encode_utf_8_elidable(
s, len(s), 'strict',
- errorhandler=self.ll_raise_unicode_exception_encode,
- allow_surrogates=False)
+ errorhandler=errorhandler, allow_surrogates=False)
return self.ll.llstr(bytes)
- @staticmethod
- def ll_raise_unicode_exception_encode(errors, encoding, msg, u,
- startingpos, endingpos):
- raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
-
def rtype_method_encode(self, hop):
if not hop.args_s[1].is_constant():
raise TyperError("encoding must be constant")
@@ -436,7 +415,6 @@
elif encoding == "latin-1":
return hop.gendirectcall(self.ll_encode_latin1, v_self)
elif encoding == 'utf-8':
- self.ensure_ll_encode_utf8()
return hop.gendirectcall(self.ll_encode_utf8, v_self)
else:
raise TyperError("encoding %s not implemented" % (encoding, ))
diff --git a/rpython/rtyper/test/test_runicode.py b/rpython/rtyper/test/test_runicode.py
--- a/rpython/rtyper/test/test_runicode.py
+++ b/rpython/rtyper/test/test_runicode.py
@@ -188,7 +188,7 @@
else:
errors = 'foo'
# the annotation of y is SomeUnicodeString(can_be_None=False)
- y, _ = str_decode_utf_8(x, len(x), errors, errorhandler)
+ y, _ = str_decode_utf_8(x, len(x), errors, errorhandler=errorhandler)
return x.decode('utf-8') + y
assert self.ll_to_string(self.interpret(f, [1])) == f(1)
More information about the pypy-commit
mailing list