[pypy-svn] pypy fast-forward: optimize unicode.join to use a UnicodeBuilder

Thu Jan 6 18:40:52 CET 2011

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: fast-forward
Changeset: r40420:3c25108a2274
Date: 2011-01-06 18:18 +0100
http://bitbucket.org/pypy/pypy/changeset/3c25108a2274/

Log:	Optimize unicode.join to use a UnicodeBuilder. Also optimize str.join
	when there is only one item in the list (this also fixes a test for
	identity in the CPython test suite).

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -11,7 +11,7 @@
 from pypy.objspace.std.tupleobject import W_TupleObject
 from pypy.rlib.rarithmetic import intmask, ovfcheck
 from pypy.rlib.objectmodel import compute_hash
-from pypy.rlib.rstring import string_repeat
+from pypy.rlib.rstring import UnicodeBuilder, string_repeat
 from pypy.rlib.runicode import unicode_encode_unicode_escape
 from pypy.module.unicodedata import unicodedb
 from pypy.tool.sourcetools import func_with_new_name
@@ -182,28 +182,35 @@
     return space.newbool(container.find(item) != -1)
 
 def unicode_join__Unicode_ANY(space, w_self, w_list):
-    l = space.unpackiterable(w_list)
-    delim = w_self._value
-    totlen = 0
-    if len(l) == 0:
+    list_w = space.unpackiterable(w_list)
+    size = len(list_w)
+
+    if size == 0:
         return W_UnicodeObject.EMPTY
-    if (len(l) == 1 and
-        space.is_w(space.type(l[0]), space.w_unicode)):
-        return l[0]
-    
-    values_list = [None] * len(l)
-    for i in range(len(l)):
-        item = l[i]
-        if isinstance(item, W_UnicodeObject):
-            # shortcut for performane
-            item = item._value
-        elif space.is_true(space.isinstance(item, space.w_str)):
-            item = space.unicode_w(item)
+
+    if size == 1:
+        w_s = list_w[0]
+        if space.is_w(space.type(w_s), space.w_unicode):
+            return w_s
+
+    self = w_self._value
+    sb = UnicodeBuilder()
+    for i in range(size):
+        if self and i != 0:
+            sb.append(self)
+        w_s = list_w[i]
+        if isinstance(w_s, W_UnicodeObject):
+            # shortcut for performance
+            sb.append(w_s._value)
         else:
-            raise operationerrfmt(space.w_TypeError,
-                "sequence item %d: expected string or Unicode", i)
-        values_list[i] = item
-    return W_UnicodeObject(w_self._value.join(values_list))
+            try:
+                sb.append(space.unicode_w(w_s))
+            except OperationError, e:
+                if not e.match(space, space.w_TypeError):
+                    raise
+                raise operationerrfmt(space.w_TypeError,
+                    "sequence item %d: expected string or Unicode", i)
+    return space.wrap(sb.build())
 
 def hash__Unicode(space, w_uni):
     s = w_uni._value

diff --git a/pypy/objspace/std/stringobject.py b/pypy/objspace/std/stringobject.py
--- a/pypy/objspace/std/stringobject.py
+++ b/pypy/objspace/std/stringobject.py
@@ -326,33 +326,42 @@
 
 def str_join__String_ANY(space, w_self, w_list):
     list_w = space.listview(w_list)
-    if list_w:
-        self = w_self._value
-        reslen = 0
-        for i in range(len(list_w)):
-            w_s = list_w[i]
-            if not space.is_true(space.isinstance(w_s, space.w_str)):
-                if space.is_true(space.isinstance(w_s, space.w_unicode)):
-                    # we need to rebuild w_list here, because the original
-                    # w_list might be an iterable which we already consumed
-                    w_list = space.newlist(list_w)
-                    w_u = space.call_function(space.w_unicode, w_self)
-                    return space.call_method(w_u, "join", w_list)
-                raise operationerrfmt(
-                    space.w_TypeError,
-                    "sequence item %d: expected string, %s "
-                    "found", i, space.type(w_s).getname(space))
-            reslen += len(space.str_w(w_s))
-        reslen += len(self) * (len(list_w) - 1)
-        sb = StringBuilder(reslen)
-        for i in range(len(list_w)):
-            if self and i != 0:
-                sb.append(self)
-            sb.append(space.str_w(list_w[i]))
-        return space.wrap(sb.build())
-    else:
+    size = len(list_w)
+
+    if size == 0:
         return W_StringObject.EMPTY
 
+    if size == 1:
+        w_s = list_w[0]
+        # only one item,  return it if it's not a subclass of str
+        if (space.is_w(space.type(w_s), space.w_str) or
+            space.is_w(space.type(w_s), space.w_unicode)):
+            return w_s
+
+    self = w_self._value
+    reslen = len(self) * (size - 1)
+    for i in range(size):
+        w_s = list_w[i]
+        if not space.is_true(space.isinstance(w_s, space.w_str)):
+            if space.is_true(space.isinstance(w_s, space.w_unicode)):
+                # we need to rebuild w_list here, because the original
+                # w_list might be an iterable which we already consumed
+                w_list = space.newlist(list_w)
+                w_u = space.call_function(space.w_unicode, w_self)
+                return space.call_method(w_u, "join", w_list)
+            raise operationerrfmt(
+                space.w_TypeError,
+                "sequence item %d: expected string, %s "
+                "found", i, space.type(w_s).getname(space))
+        reslen += len(space.str_w(w_s))
+
+    sb = StringBuilder(reslen)
+    for i in range(size):
+        if self and i != 0:
+            sb.append(self)
+        sb.append(space.str_w(list_w[i]))
+    return space.wrap(sb.build())
+
 def str_rjust__String_ANY_ANY(space, w_self, w_arg, w_fillchar):
     u_arg = space.int_w(w_arg)
     u_self = w_self._value

diff --git a/pypy/objspace/std/test/test_stringobject.py b/pypy/objspace/std/test/test_stringobject.py
--- a/pypy/objspace/std/test/test_stringobject.py
+++ b/pypy/objspace/std/test/test_stringobject.py
@@ -494,6 +494,8 @@
         assert ", ".join(['a', 'b', 'c']) == "a, b, c"
         assert "".join([]) == ""
         assert "-".join(['a', 'b']) == 'a-b'
+        text = 'text'
+        assert "".join([text]) is text
         raises(TypeError, ''.join, 1)
         raises(TypeError, ''.join, [1])
         raises(TypeError, ''.join, [[1]])