[pypy-commit] pypy default: Try to make at least the utf-8 encoding/decoding functions elidable.

Sat Feb 18 11:02:51 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r90178:bfe85246978d
Date: 2017-02-18 15:54 +0100
http://bitbucket.org/pypy/pypy/changeset/bfe85246978d/

Log:	Try to make at least the utf-8 encoding/decoding functions elidable.
	This is messy for several reasons. Now if you call the functions
	with an errorhandler that can't be used inside an @elidable, then
	you have to fix the calls to invoke another version.

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -442,7 +442,10 @@
     if errors is None:
         errors = 'strict'
     state = space.fromcache(CodecState)
-    result = runicode.unicode_encode_utf_8(
+    # NB. can't call unicode_encode_utf_8() directly because that's
+    # an @elidable function nowadays.  Instead, we need the _impl().
+    # (The problem is the errorhandler, which calls arbitrary Python.)
+    result = runicode.unicode_encode_utf_8_impl(
         uni, len(uni), errors, state.encode_error_handler,
         allow_surrogates=True)
     return space.newtuple([space.newbytes(result), space.newint(len(uni))])
@@ -454,7 +457,10 @@
         errors = 'strict'
     final = space.is_true(w_final)
     state = space.fromcache(CodecState)
-    result, consumed = runicode.str_decode_utf_8(
+    # NB. can't call str_decode_utf_8() directly because that's
+    # an @elidable function nowadays.  Instead, we need the _impl().
+    # (The problem is the errorhandler, which calls arbitrary Python.)
+    result, consumed = runicode.str_decode_utf_8_impl(
         string, len(string), errors,
         final, state.decode_error_handler,
         allow_surrogates=True)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -3,8 +3,10 @@
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib.rarithmetic import r_uint, intmask, widen
 from rpython.rlib.unicodedata import unicodedb
+from rpython.tool.sourcetools import func_with_new_name
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.rlib import jit
+from rpython.rlib.nonconst import NonConstant
 
 
 if rffi.sizeof(lltype.UniChar) == 4:
@@ -127,15 +129,24 @@
     4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  # F0-F4 - F5-FF
 ]
 
+# if you can't use the @elidable version, call str_decode_utf_8_impl()
+# directly
+@jit.elidable
 def str_decode_utf_8(s, size, errors, final=False,
                      errorhandler=None, allow_surrogates=allow_surrogate_by_default):
     if errorhandler is None:
         errorhandler = default_unicode_error_decode
-    result = UnicodeBuilder(size)
-    pos = str_decode_utf_8_impl(s, size, errors, final, errorhandler,
-                                 allow_surrogates=allow_surrogates,
-                                 result=result)
-    return result.build(), pos
+    # NB. a bit messy because rtyper/rstr.py also calls the same
+    # function.  Make sure we annotate for the args it passes, too
+    if NonConstant(False):
+        s = '?????'
+        size = 12345
+        errors = 'strict'
+        final = True
+        errorhandler = default_unicode_error_decode
+        allow_surrogates = False
+    return str_decode_utf_8_elidable(s, size, errors, final, errorhandler,
+                                     allow_surrogates=allow_surrogates)
 
 def _invalid_cont_byte(ordch):
     return ordch>>6 != 0x2    # 0b10
@@ -157,14 +168,12 @@
             (ordch1 == 0xf0 and ordch2 < 0x90) or
             (ordch1 == 0xf4 and ordch2 > 0x8f))
 
-# note: this specialize() is here for rtyper/rstr.py, which calls this
-# function too but with its own fixed errorhandler
-@specialize.arg_or_var(4)
 def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
-                          allow_surrogates, result):
+                          allow_surrogates):
     if size == 0:
-        return 0
+        return u'', 0
 
+    result = UnicodeBuilder(size)
     pos = 0
     while pos < size:
         ordch1 = ord(s[pos])
@@ -316,7 +325,9 @@
                 result.append(unichr(0xDC00 + (c & 0x03FF)))
             pos += 4
 
-    return pos
+    return result.build(), pos
+str_decode_utf_8_elidable = jit.elidable(
+    func_with_new_name(str_decode_utf_8_impl, "str_decode_utf_8_elidable"))
 
 def _encodeUCS4(result, ch):
     # Encode UCS4 Unicode ordinals
@@ -325,6 +336,9 @@
     result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
     result.append((chr((0x80 | (ch & 0x3f)))))
 
+# if you can't use the @elidable version, call unicode_encode_utf_8_impl()
+# directly
+@jit.elidable
 def unicode_encode_utf_8(s, size, errors, errorhandler=None,
                          allow_surrogates=allow_surrogate_by_default):
     # In this function, allow_surrogates can be:
@@ -339,12 +353,17 @@
     #
     if errorhandler is None:
         errorhandler = default_unicode_error_encode
-    return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
-                                     allow_surrogates=allow_surrogates)
+    # NB. a bit messy because rtyper/rstr.py also calls the same
+    # function.  Make sure we annotate for the args it passes, too
+    if NonConstant(False):
+        s = u'?????'
+        size = 12345
+        errors = 'strict'
+        errorhandler = default_unicode_error_encode
+        allow_surrogates = False
+    return unicode_encode_utf_8_elidable(s, size, errors, errorhandler,
+                                         allow_surrogates=allow_surrogates)
 
-# note: this specialize() is here for rtyper/rstr.py, which calls this
-# function too but with its own fixed errorhandler
-@specialize.arg_or_var(3)
 def unicode_encode_utf_8_impl(s, size, errors, errorhandler,
                               allow_surrogates=False):
     assert(size >= 0)
@@ -400,6 +419,9 @@
             else:
                 _encodeUCS4(result, ch)
     return result.build()
+unicode_encode_utf_8_elidable = jit.elidable(
+    func_with_new_name(unicode_encode_utf_8_impl,
+                       "unicode_encode_utf_8_elidable"))
 
 def unicode_encode_utf8sp(s, size):
     # Surrogate-preserving utf-8 encoding.  Any surrogate character
diff --git a/rpython/rtyper/rstr.py b/rpython/rtyper/rstr.py
--- a/rpython/rtyper/rstr.py
+++ b/rpython/rtyper/rstr.py
@@ -9,7 +9,6 @@
 from rpython.rtyper.rfloat import FloatRepr
 from rpython.tool.pairtype import pairtype, pair
 from rpython.tool.sourcetools import func_with_new_name
-from rpython.rlib.rstring import UnicodeBuilder
 
 
 class AbstractStringRepr(Repr):
@@ -18,31 +17,21 @@
         Repr.__init__(self, *args)
         self.rstr_decode_utf_8 = None
 
-    def ensure_ll_decode_utf8(self):
-        from rpython.rlib.runicode import str_decode_utf_8_impl
-        self.rstr_decode_utf_8 = func_with_new_name(str_decode_utf_8_impl,
-                                                    'rstr_decode_utf_8_impl')
-
     @jit.elidable
     def ll_decode_utf8(self, llvalue):
         from rpython.rtyper.annlowlevel import hlstr
+        from rpython.rlib import runicode
         value = hlstr(llvalue)
         assert value is not None
-        result = UnicodeBuilder(len(value))
-        self.rstr_decode_utf_8(
-            value, len(value), 'strict', final=True,
-            errorhandler=self.ll_raise_unicode_exception_decode,
-            allow_surrogates=False, result=result)
+        errorhandler = runicode.default_unicode_error_decode
+        u, pos = runicode.str_decode_utf_8_elidable(
+            value, len(value), 'strict', True, errorhandler, False)
         # XXX should it really be 'allow_surrogates=False'?  In RPython,
         # unicode.decode('utf-8') happily accepts surrogates.  This
         # makes it hard to test untranslated (it's the cause of a
         # failure in lib-python's test_warnings on PyPy3, for example)
-        return self.ll.llunicode(result.build())
-
-    @staticmethod
-    def ll_raise_unicode_exception_decode(errors, encoding, msg, s,
-                                       startingpos, endingpos):
-        raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
+        # XXX maybe the whole ''.decode('utf-8') should be not RPython.
+        return self.ll.llunicode(u)
 
     def _str_reprs(self, hop):
         return hop.args_r[0].repr, hop.args_r[1].repr
@@ -351,7 +340,6 @@
         elif encoding == 'latin-1':
             return hop.gendirectcall(self.ll_decode_latin1, v_self)
         elif encoding == 'utf-8':
-            self.ensure_ll_decode_utf8()
             return hop.gendirectcall(self.ll_decode_utf8, v_self)
         else:
             raise TyperError("encoding %s not implemented" % (encoding, ))
@@ -394,11 +382,6 @@
         AbstractStringRepr.__init__(self, *args)
         self.runicode_encode_utf_8 = None
 
-    def ensure_ll_encode_utf8(self):
-        from rpython.rlib.runicode import unicode_encode_utf_8_impl
-        self.runicode_encode_utf_8 = func_with_new_name(
-            unicode_encode_utf_8_impl, 'runicode_encode_utf_8')
-
     def rtype_method_upper(self, hop):
         raise TyperError("Cannot do toupper on unicode string")
 
@@ -408,19 +391,15 @@
     @jit.elidable
     def ll_encode_utf8(self, ll_s):
         from rpython.rtyper.annlowlevel import hlunicode
+        from rpython.rlib import runicode
         s = hlunicode(ll_s)
         assert s is not None
-        bytes = self.runicode_encode_utf_8(
+        errorhandler = runicode.default_unicode_error_encode
+        bytes = runicode.unicode_encode_utf_8_elidable(
             s, len(s), 'strict',
-            errorhandler=self.ll_raise_unicode_exception_encode,
-            allow_surrogates=False)
+            errorhandler=errorhandler, allow_surrogates=False)
         return self.ll.llstr(bytes)
 
-    @staticmethod
-    def ll_raise_unicode_exception_encode(errors, encoding, msg, u,
-                                          startingpos, endingpos):
-        raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
-
     def rtype_method_encode(self, hop):
         if not hop.args_s[1].is_constant():
             raise TyperError("encoding must be constant")
@@ -436,7 +415,6 @@
         elif encoding == "latin-1":
             return hop.gendirectcall(self.ll_encode_latin1, v_self)
         elif encoding == 'utf-8':
-            self.ensure_ll_encode_utf8()
             return hop.gendirectcall(self.ll_encode_utf8, v_self)
         else:
             raise TyperError("encoding %s not implemented" % (encoding, ))
diff --git a/rpython/rtyper/test/test_runicode.py b/rpython/rtyper/test/test_runicode.py
--- a/rpython/rtyper/test/test_runicode.py
+++ b/rpython/rtyper/test/test_runicode.py
@@ -188,7 +188,7 @@
             else:
                 errors = 'foo'
             # the annotation of y is SomeUnicodeString(can_be_None=False)
-            y, _ = str_decode_utf_8(x, len(x), errors, errorhandler)
+            y, _ = str_decode_utf_8(x, len(x), errors, errorhandler=errorhandler)
             return x.decode('utf-8') + y
 
         assert self.ll_to_string(self.interpret(f, [1])) == f(1)