[pypy-svn] r48781 - in pypy/branch/ropes-unicode/pypy/objspace/std: . test

cfbolz at codespeak.net cfbolz at codespeak.net
Mon Nov 19 12:16:47 CET 2007


Author: cfbolz
Date: Mon Nov 19 12:16:46 2007
New Revision: 48781

Modified:
   pypy/branch/ropes-unicode/pypy/objspace/std/floattype.py
   pypy/branch/ropes-unicode/pypy/objspace/std/inttype.py
   pypy/branch/ropes-unicode/pypy/objspace/std/longtype.py
   pypy/branch/ropes-unicode/pypy/objspace/std/model.py
   pypy/branch/ropes-unicode/pypy/objspace/std/rope.py
   pypy/branch/ropes-unicode/pypy/objspace/std/ropeobject.py
   pypy/branch/ropes-unicode/pypy/objspace/std/ropeunicodeobject.py
   pypy/branch/ropes-unicode/pypy/objspace/std/stringobject.py
   pypy/branch/ropes-unicode/pypy/objspace/std/test/test_rope.py
   pypy/branch/ropes-unicode/pypy/objspace/std/test/test_unicodeobject.py
   pypy/branch/ropes-unicode/pypy/objspace/std/unicodetype.py
Log:
try to fix ropes that represent unicode chars


Modified: pypy/branch/ropes-unicode/pypy/objspace/std/floattype.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/floattype.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/floattype.py	Mon Nov 19 12:16:46 2007
@@ -19,7 +19,10 @@
             raise OperationError(space.w_ValueError,
                                  space.wrap(e.msg))
     elif space.is_true(space.isinstance(w_value, space.w_unicode)):
-        from unicodeobject import unicode_to_decimal_w
+        if space.config.objspace.std.withropeunicode:
+            from pypy.objspace.std.ropeunicodeobject import unicode_to_decimal_w
+        else:
+            from unicodeobject import unicode_to_decimal_w
         strvalue = unicode_to_decimal_w(space, w_value)
         try:
             if USE_NEW_S2F:

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/inttype.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/inttype.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/inttype.py	Mon Nov 19 12:16:46 2007
@@ -66,7 +66,10 @@
             except ParseStringOverflowError, e:
                  w_longval = retry_to_w_long(space, e.parser)                
         elif space.is_true(space.isinstance(w_value, space.w_unicode)):
-            from unicodeobject import unicode_to_decimal_w
+            if space.config.objspace.std.withropeunicode:
+                from pypy.objspace.std.ropeunicodeobject import unicode_to_decimal_w
+            else:
+                from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
             string = unicode_to_decimal_w(space, w_value)
             try:
                 value = string_to_int(string)
@@ -95,7 +98,10 @@
         base = space.int_w(w_base)
 
         if space.is_true(space.isinstance(w_value, space.w_unicode)):
-            from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
+            if space.config.objspace.std.withropeunicode:
+                from pypy.objspace.std.ropeunicodeobject import unicode_to_decimal_w
+            else:
+                from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
             s = unicode_to_decimal_w(space, w_value)
         else:
             try:

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/longtype.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/longtype.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/longtype.py	Mon Nov 19 12:16:46 2007
@@ -18,7 +18,10 @@
                                      space.wrap(e.msg))
         elif space.is_true(space.isinstance(w_value, space.w_unicode)):
             try:
-                from unicodeobject import unicode_to_decimal_w
+                if space.config.objspace.std.withropeunicode:
+                    from pypy.objspace.std.ropeunicodeobject import unicode_to_decimal_w
+                else:
+                    from pypy.objspace.std.unicodeobject import unicode_to_decimal_w
                 w_value = string_to_w_long(space, unicode_to_decimal_w(space, w_value))
             except ParseStringError, e:
                 raise OperationError(space.w_ValueError,

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/model.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/model.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/model.py	Mon Nov 19 12:16:46 2007
@@ -17,6 +17,8 @@
     "withmultilist"  : ["listmultiobject.W_ListMultiObject"],
     "withrope"       : ["ropeobject.W_RopeObject",
                         "ropeobject.W_RopeIterObject"],
+    "withropeunicode": ["ropeunicodeobject.W_RopeUnicodeObject",
+                        "ropeunicodeobject.W_RopeUnicodeIterObject"],
     "withrangelist"  : ["rangeobject.W_RangeListObject",
                         "rangeobject.W_RangeIterObject"],
     "withtproxy" : ["proxyobject.W_TransparentList",
@@ -68,6 +70,7 @@
         from pypy.objspace.std import listmultiobject
         from pypy.objspace.std import stringobject
         from pypy.objspace.std import ropeobject
+        from pypy.objspace.std import ropeunicodeobject
         from pypy.objspace.std import strsliceobject
         from pypy.objspace.std import strjoinobject
         from pypy.objspace.std import typeobject
@@ -186,9 +189,15 @@
              (unicodeobject.W_UnicodeObject, unicodeobject.delegate_String2Unicode),
                 ]
         else:
-            self.typeorder[ropeobject.W_RopeObject] += [
-             (unicodeobject.W_UnicodeObject, unicodeobject.delegate_String2Unicode),
-                ]
+            if config.objspace.std.withropeunicode:
+                self.typeorder[ropeobject.W_RopeObject] += [
+                 (ropeunicodeobject.W_RopeUnicodeObject,
+                     ropeunicodeobject.delegate_Rope2RopeUnicode),
+                 ]
+            else:
+                self.typeorder[ropeobject.W_RopeObject] += [
+                 (unicodeobject.W_UnicodeObject, unicodeobject.delegate_String2Unicode),
+                    ]
 
         if config.objspace.std.withstrslice:
             self.typeorder[strsliceobject.W_StringSliceObject] += [

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/rope.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/rope.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/rope.py	Mon Nov 19 12:16:46 2007
@@ -151,7 +151,7 @@
         return self.s[index]
 
     def getunichar(self, index):
-        return unicode(self.s[index])
+        return unichr(ord(self.s[index]))
 
     def getint(self, index):
         return ord(self.s[index])
@@ -404,12 +404,18 @@
         start, stop, node = find_straddling(node, start, stop)
         iter = SeekableItemIterator(node)
         iter.seekforward(start)
-        #XXX doesn't work for unicode
-        result = [iter.nextchar()]
-        for i in range(slicelength - 1):
-            iter.seekforward(step - 1)
-            result.append(iter.nextchar())
-        return rope_from_charlist(result)
+        if node.is_bytestring():
+            result = [iter.nextchar()]
+            for i in range(slicelength - 1):
+                iter.seekforward(step - 1)
+                result.append(iter.nextchar())
+            return rope_from_charlist(result)
+        else:
+            result = [iter.nextunichar()]
+            for i in range(slicelength - 1):
+                iter.seekforward(step - 1)
+                result.append(iter.nextunichar())
+            return rope_from_unicharlist(result)
     return getslice_one(node, start, stop)
 
 def getslice_one(node, start, stop):
@@ -589,32 +595,61 @@
 def rope_from_unicharlist(charlist):
     nodelist = []
     length = len(charlist)
-    if length:
+    if not length:
         return LiteralStringNode.EMPTY
     i = 0
     while i < length:
-        chunk = []
+        unichunk = []
         while i < length:
             c = ord(charlist[i])
             if c < 256:
                 break
-            chunk.append(unichr(c))
+            unichunk.append(unichr(c))
             i += 1
-        if chunk:
-            nodelist.append(LiteralUnicodeNode("".join(chunk)))
-        chunck = []
+        if unichunk:
+            nodelist.append(LiteralUnicodeNode("".join(unichunk)))
+        strchunk = []
         while i < length:
             c = ord(charlist[i])
             if c >= 256:
                 break
-            chunk.append(chr(c))
+            strchunk.append(chr(c))
             i += 1
-        if chunk:
-            nodelist.append(LiteralStringNode("".join(chunk)))
+        if strchunk:
+            nodelist.append(LiteralStringNode("".join(strchunk)))
     return rebalance(nodelist, length)
-rope_from_unicharlist._annspecialcase_ = "specialize:argtype(0)"
 
-rope_from_unicode = rope_from_unicharlist
+def rope_from_unicode(uni):
+    nodelist = []
+    length = len(uni)
+    if not length:
+        return LiteralStringNode.EMPTY
+    i = 0
+    while i < length:
+        start = i  # first: a run of code points >= 256 -> unicode node
+        while i < length:
+            c = ord(uni[i])
+            if c < 256:
+                break
+            i += 1
+        if i != start:
+            nodelist.append(LiteralUnicodeNode(uni[start:i]))
+        start = i
+        # then: a run of code points < 256, stored latin-1 encoded
+        while i < length:
+            c = ord(uni[i])
+            if c >= 256:
+                break
+            i += 1
+        if i != start:
+            nodelist.append(LiteralStringNode(uni[start:i].encode("latin-1")))
+    return rebalance(nodelist, length)
+
+def rope_from_unichar(unichar):
+    intval = ord(unichar)
+    if intval >= 256:  # same byte/unicode split as rope_from_unicode
+        return LiteralUnicodeNode(unichar)
+    return LiteralStringNode.PREBUILT[intval]
 
 # __________________________________________________________________________
 # searching
@@ -881,12 +916,27 @@
 
 
 class ItemIterator(object):
-    def __init__(self, node):
+    def __init__(self, node, start=0):
         self.iter = FringeIterator(node)
         self.node = None
         self.nodelength = 0
         self.index = 0
-
+        if start:
+            self._advance_to(start)
+    
+    def _advance_to(self, index):
+        assert index > 0
+        assert self.index == 0
+        while 1:
+            node = self.iter.next()
+            length = node.length()
+            if index < length:
+                self.index = index
+                self.node = node
+                self.nodelength = length
+                break
+            index -= length
+            assert index >= 0
 
     def getnode(self):
         node = self.node
@@ -1160,8 +1210,7 @@
     stop = start + prefix.length()
     if stop > end:
         return False
-    iter1 = SeekableItemIterator(self)
-    iter1.seekforward(start)
+    iter1 = ItemIterator(self, start)
     iter2 = ItemIterator(prefix)
     for i in range(prefix.length()):
         if iter1.nextint() != iter2.nextint():
@@ -1176,15 +1225,15 @@
     begin = end - suffix.length()
     if begin < start:
         return False
-    iter1 = SeekableItemIterator(self)
-    iter1.seekforward(begin)
+    iter1 = ItemIterator(self, begin)
     iter2 = ItemIterator(suffix)
     for i in range(suffix.length()):
         if iter1.nextint() != iter2.nextint():
             return False
     return True
 
-def strip(node, left=True, right=True, predicate=lambda i: chr(i).isspace()):
+def strip(node, left=True, right=True, predicate=lambda i: chr(i).isspace(),
+          *extraargs):
     length = node.length()
     
     lpos = 0
@@ -1192,12 +1241,12 @@
     
     if left:
         iter = ItemIterator(node)
-        while lpos < rpos and predicate(iter.nextint()):
+        while lpos < rpos and predicate(iter.nextint(), *extraargs):
            lpos += 1
        
     if right:
         iter = ReverseItemIterator(node)
-        while rpos > lpos and predicate(iter.nextint()):
+        while rpos > lpos and predicate(iter.nextint(), *extraargs):
            rpos -= 1
        
     assert rpos >= lpos
@@ -1219,6 +1268,76 @@
     substrings.append(getslice_one(node, startidx, node.length()))
     return substrings
 
+
+def split_chars(node, maxsplit=-1, predicate=lambda x: chr(x).isspace()):
+    result = []
+    length = node.length()
+    if not length:
+        return result
+    i = 0
+    iter = ItemIterator(node)
+    while True:
+        # find the beginning of the next word
+        while i < length:
+            if not predicate(iter.nextint()):
+                break   # found
+            i += 1
+        else:
+            break  # end of string, finished
+
+        # find the end of the word
+        if maxsplit == 0:
+            j = length   # take all the rest of the string
+        else:
+            j = i + 1
+            while j < length and not predicate(iter.nextint()):
+                j += 1
+            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
+
+        # the word is value[i:j]
+        result.append(getslice_one(node, i, j))
+
+        # continue to look from the character following the space after the word
+        i = j + 1
+    return result
+
+
+def rsplit_chars(node, maxsplit=-1, predicate=lambda x: chr(x).isspace()):
+    result = []
+    length = node.length()
+    i = length - 1
+    iter = ReverseItemIterator(node)
+    while True:
+        # starting from the end, find the end of the next word
+        while i >= 0:
+            if not predicate(iter.nextint()):
+                break   # found
+            i -= 1
+        else:
+            break  # end of string, finished
+
+        # find the start of the word
+        # (more precisely, 'j' will be the space character before the word)
+        if maxsplit == 0:
+            j = -1   # take all the rest of the string
+        else:
+            j = i - 1
+            while j >= 0 and not predicate(iter.nextint()):
+                j -= 1
+            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
+
+        # the word is value[j+1:i+1]
+        j1 = j + 1
+        assert j1 >= 0
+        result.append(getslice_one(node, j1, i + 1))
+
+        # continue to look from the character before the space before the word
+        i = j - 1
+
+    result.reverse()
+    return result
+
+
 def split_completely(node, maxsplit=-1):
     upper = node.length()
     if maxsplit > 0 and maxsplit < upper + 2:
@@ -1231,6 +1350,53 @@
     substrings.append(rope.getslice_one(node, upper, length))
 
 
+def splitlines(node, keepends=False):
+    length = node.length()
+    if length == 0:
+        return []
+
+    result = []
+    iter = ItemIterator(node)
+    i = j = 0
+    last = ord(" ")
+    char = iter.nextint()
+    while i < length:
+        # Find a line and append it
+        while char != ord('\n') and char != ord('\r'):
+            try:
+                i += 1
+                last = char
+                char = iter.nextint()
+            except StopIteration:
+                break
+        # Skip the line break reading CRLF as one line break
+        eol = i
+        i += 1
+        last = char
+        try:
+            char = iter.nextint()
+        except StopIteration:
+            pass
+        else:
+            if last == ord('\r') and char == ord('\n'):
+                i += 1
+                try:
+                    last = char
+                    char = iter.nextint()
+                except StopIteration:
+                    pass
+        if keepends:
+            eol = i
+        result.append(getslice_one(node, j, eol))
+        j = i
+
+    if j == 0:
+        result.append(node)
+    elif j < length:
+        result.append(getslice_one(node, j, length))
+
+    return result
+
 # __________________________________________________________________________
 # misc
 
@@ -1247,54 +1413,65 @@
 # ____________________________________________________________
 # to and from unicode conversion
 
-def str_decode(rope, encoding):
+def str_decode_ascii(rope):
     assert rope.is_bytestring()
-    if encoding == "ascii":
-        if rope.is_ascii():
-            return rope
-    elif encoding == "latin-1":
+    if rope.is_ascii():
+        return rope
+    return None
+
+def str_decode_latin1(rope):
+    assert rope.is_bytestring()
+    return rope
+
+def str_decode_utf8(rope):
+    from pypy.rlib.runicode import str_decode_utf_8
+    if rope.is_ascii():
         return rope
-    elif encoding == "utf-8":
-        from pypy.rlib.runicode import str_decode_utf_8
-        if rope.is_ascii():
-            return rope
-        elif isinstance(rope, BinaryConcatNode):
-            lresult = str_decode(rope.left, "utf-8")
-            if result is not None:
-                return BinaryConcatNode(lresult,
-                                        str_decode(rope.right, "utf-8"))
-        elif isinstance(rope, LiteralStringNode):
+    elif isinstance(rope, BinaryConcatNode):
+        lresult = str_decode_utf8(rope.left)
+        if lresult is not None:
+            return BinaryConcatNode(lresult,
+                                    str_decode_utf8(rope.right))
+    elif isinstance(rope, LiteralStringNode):
+        try:
             result, consumed = str_decode_utf_8(rope.s, len(rope.s), False,
                                                 "strict")
-            if consumed < len(rope.s):
-                return None
-            return rope_from_unicode(result)
-        s = rope.flatten_string()
-        return str_decode_utf_8(s, len(s), True)
-    else:
-        raise NotImplementedError("unknown encoding")
+        except UnicodeDecodeError:
+            return None
+        if consumed < len(rope.s):
+            return None
+        return rope_from_unicode(result)
+    s = rope.flatten_string()
+    try:
+        result, consumed = str_decode_utf_8(s, len(s), True)
+        return rope_from_unicode(result)
+    except UnicodeDecodeError:
+        pass
+
 
-def unicode_encode(rope, encoding):
-    if encoding == "ascii":
-        if rope.is_ascii():
-            return rope
-    elif encoding == "latin-1":
+def unicode_encode_ascii(rope):
+    if rope.is_ascii():
         return rope
-    elif encoding == "utf-8":
-        from pypy.rlib.runicode import unicode_encode_utf_8
-        if rope.is_ascii():
-            return rope
-        elif isinstance(rope, BinaryConcatNode):
-            return BinaryConcatNode(unicode_encode(rope.left, "utf-8"),
-                                    unicode_encode(rope.right, "utf-8"))
-        elif isinstance(rope, LiteralUnicodeNode):
-            return unicode_encode_utf_8(rope.u, len(rope.u), "strict")
-        elif isinstance(rope, LiteralStringNode):
-            return LiteralStringNode(_str_encode_utf_8(rope.s))
-        s = rope.flatten_string()
-        return str_decode_utf_8(s, len(s), True)
-    else:
-        raise NotImplementedError("unknown encoding")
+
+def unicode_encode_latin1(rope):
+    if rope.is_bytestring():
+        return rope
+
+def unicode_encode_utf8(rope):
+    from pypy.rlib.runicode import unicode_encode_utf_8
+    if rope.is_ascii():
+        return rope
+    elif isinstance(rope, BinaryConcatNode):
+        return BinaryConcatNode(unicode_encode_utf8(rope.left),
+                                unicode_encode_utf8(rope.right))
+    elif isinstance(rope, LiteralUnicodeNode):
+        try:
+            return LiteralStringNode(
+                unicode_encode_utf_8(rope.u, len(rope.u), "strict"))
+        except UnicodeEncodeError:  # encoding fails with an Encode error
+            return None
+    elif isinstance(rope, LiteralStringNode):
+        return LiteralStringNode(_str_encode_utf_8(rope.s))
 
 def _str_encode_utf_8(s):
     size = len(s)

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/ropeobject.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/ropeobject.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/ropeobject.py	Mon Nov 19 12:16:46 2007
@@ -219,41 +219,14 @@
     return W_RopeObject(rope.rope_from_charlist(buffer))
 
 def str_split__Rope_None_ANY(space, w_self, w_none, w_maxsplit=-1):
+    selfnode = w_self._node
     maxsplit = space.int_w(w_maxsplit)
-    res_w = []
-    node = w_self._node
-    length = node.length()
-    i = 0
-    iter = rope.ItemIterator(node)
-    while True:
-        # find the beginning of the next word
-        while i < length:
-            if not iter.nextchar().isspace():
-                break   # found
-            i += 1
-        else:
-            break  # end of string, finished
-
-        # find the end of the word
-        if maxsplit == 0:
-            j = length   # take all the rest of the string
-        else:
-            j = i + 1
-            while j < length and not iter.nextchar().isspace():
-                j += 1
-            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
-
-        # the word is value[i:j]
-        res_w.append(W_RopeObject(rope.getslice_one(node, i, j)))
-
-        # continue to look from the character following the space after the word
-        i = j + 1
-
+    res_w = [W_RopeObject(node)
+                for node in rope.split_chars(selfnode, maxsplit)]
     return space.newlist(res_w)
 
 def str_split__Rope_Rope_ANY(space, w_self, w_by, w_maxsplit=-1):
     maxsplit = space.int_w(w_maxsplit)
-    start = 0
     selfnode = w_self._node
     bynode = w_by._node
     bylen = bynode.length()
@@ -263,42 +236,15 @@
                 for node in rope.split(selfnode, bynode, maxsplit)]
     return space.newlist(res_w)
 
+
 def str_rsplit__Rope_None_ANY(space, w_self, w_none, w_maxsplit=-1):
-    # XXX works but flattens
+    selfnode = w_self._node
     maxsplit = space.int_w(w_maxsplit)
-    res_w = []
-    value = w_self._node.flatten_string()
-    i = len(value)-1
-    while True:
-        # starting from the end, find the end of the next word
-        while i >= 0:
-            if not value[i].isspace():
-                break   # found
-            i -= 1
-        else:
-            break  # end of string, finished
-
-        # find the start of the word
-        # (more precisely, 'j' will be the space character before the word)
-        if maxsplit == 0:
-            j = -1   # take all the rest of the string
-        else:
-            j = i - 1
-            while j >= 0 and not value[j].isspace():
-                j -= 1
-            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
-
-        # the word is value[j+1:i+1]
-        j1 = j + 1
-        assert j1 >= 0
-        res_w.append(space.wrap(value[j1:i+1]))
-
-        # continue to look from the character before the space before the word
-        i = j - 1
-
-    res_w.reverse()
+    res_w = [W_RopeObject(node)
+                for node in rope.rsplit_chars(selfnode, maxsplit)]
     return space.newlist(res_w)
 
+
 def str_rsplit__Rope_Rope_ANY(space, w_self, w_by, w_maxsplit=-1):
     # XXX works but flattens
     maxsplit = space.int_w(w_maxsplit)
@@ -498,7 +444,7 @@
         substrings = [by]
         iter = rope.ItemIterator(node)
         for i in range(upper):
-            substrings.append(iter.nextrope()])
+            substrings.append(iter.nextrope())
             substrings.append(by)
         substrings.append(rope.getslice_one(node, upper, length))
         try:
@@ -515,43 +461,27 @@
         raise OperationError(space.w_OverflowError,
                              space.wrap("string too long"))
 
-def _strip(space, w_self, w_chars, left, right):
-    "internal function called by str_xstrip methods"
-    node = w_self._node
-    length = node.length()
-    u_chars = w_chars._node.flatten_string()
-    
-    lpos = 0
-    rpos = length
-    
-    if left:
-        #print "while %d < %d and -%s- in -%s-:"%(lpos, rpos, u_self[lpos],w_chars)
-        iter = rope.ItemIterator(node)
-        while lpos < rpos and iter.nextchar() in u_chars:
-           lpos += 1
-       
-    if right:
-        iter = rope.ReverseItemIterator(node)
-        while rpos > lpos and iter.nextchar() in u_chars:
-           rpos -= 1
-       
-    return W_RopeObject(rope.getslice_one(node, lpos, rpos))
 
+def _contains(i, string):
+    return chr(i) in string
 
 def str_strip__Rope_Rope(space, w_self, w_chars):
-    return _strip(space, w_self, w_chars, left=1, right=1)
+    return W_RopeObject(rope.strip(w_self._node, True, True,
+                                   _contains, w_chars._node.flatten_string()))
 
 def str_strip__Rope_None(space, w_self, w_chars):
     return W_RopeObject(rope.strip(w_self._node, left=True, right=True))
    
 def str_rstrip__Rope_Rope(space, w_self, w_chars):
-    return _strip(space, w_self, w_chars, left=0, right=1)
+    return W_RopeObject(rope.strip(w_self._node, False, True,
+                                   _contains, w_chars._node.flatten_string()))
 
 def str_rstrip__Rope_None(space, w_self, w_chars):
-    return W_RopeObject(rope.strip(w_self._node, left=False, right=True))
+    return W_RopeObject(rope.strip(w_self._node, False, True))
 
 def str_lstrip__Rope_Rope(space, w_self, w_chars):
-    return _strip(space, w_self, w_chars, left=1, right=0)
+    return W_RopeObject(rope.strip(w_self._node, True, False,
+                                   _contains, w_chars._node.flatten_string()))
 
 def str_lstrip__Rope_None(space, w_self, w_chars):
     return W_RopeObject(rope.strip(w_self._node, left=True, right=False))
@@ -606,6 +536,10 @@
     (self, _, start, end) = _convert_idx_params(space, w_self,
                                                   space.wrap(''), w_start, w_end)
     for w_suffix in space.unpacktuple(w_suffixes):
+        if space.is_true(space.isinstance(w_suffix, space.w_unicode)):
+            w_u = space.call_function(space.w_unicode, w_self)
+            return space.call_method(w_u, "endswith", w_suffixes, w_start,
+                                     w_end)
         suffix = rope_w(space, w_suffix) 
         if rope.endswith(self, suffix, start, end):
             return space.w_True
@@ -621,6 +555,10 @@
     (self, _, start, end) = _convert_idx_params(space, w_self, space.wrap(''),
                                                   w_start, w_end)
     for w_prefix in space.unpacktuple(w_prefixes):
+        if space.is_true(space.isinstance(w_prefix, space.w_unicode)):
+            w_u = space.call_function(space.w_unicode, w_self)
+            return space.call_method(w_u, "startswith", w_prefixes, w_start,
+                                     w_end)
         prefix = rope_w(space, w_prefix)
         if rope.startswith(self, prefix, start, end):
             return space.w_True
@@ -673,53 +611,10 @@
  
  
 def str_splitlines__Rope_ANY(space, w_self, w_keepends):
-    keepends  = bool(space.int_w(w_keepends))  # truth value, but type checked
+    keepends = bool(space.int_w(w_keepends))  # truth value, but type checked
     node = w_self._node
-    length = node.length()
-    if length == 0:
-        return space.newlist([])
-
-    strs_w = []
-    iter = rope.ItemIterator(node)
-    i = j = 0
-    last = " "
-    char = iter.nextchar()
-    while i < length:
-        # Find a line and append it
-        while char != '\n' and char != '\r':
-            try:
-                i += 1
-                last = char
-                char = iter.nextchar()
-            except StopIteration:
-                break
-        # Skip the line break reading CRLF as one line break
-        eol = i
-        i += 1
-        last = char
-        try:
-            char = iter.nextchar()
-        except StopIteration:
-            pass
-        else:
-            if last == '\r' and char == '\n':
-                i += 1
-                try:
-                    last = char
-                    char = iter.nextchar()
-                except StopIteration:
-                    pass
-        if keepends:
-            eol = i
-        strs_w.append(W_RopeObject(rope.getslice_one(node, j, eol)))
-        j = i
-
-    if j == 0:
-        strs_w.append(w_self.create_if_subclassed())
-    elif j < length:
-        strs_w.append(W_RopeObject(rope.getslice_one(node, j, length)))
-
-    return space.newlist(strs_w)
+    return space.newlist(
+        [W_RopeObject(n) for n in rope.splitlines(node, keepends)])
 
 def str_zfill__Rope_ANY(space, w_self, w_width):
     node = w_self._node
@@ -839,8 +734,8 @@
         return w_str
     return W_RopeObject(w_str._node)
 
-def iter__Rope(space, w_list):
-    return W_RopeIterObject(w_list)
+def iter__Rope(space, w_str):
+    return W_RopeIterObject(w_str)
 
 def ord__Rope(space, w_str):
     node = w_str._node
@@ -934,11 +829,12 @@
 
 def str_decode__Rope_ANY_ANY(space, w_string, w_encoding=None, w_errors=None):
     from pypy.objspace.std.unicodetype import _get_encoding_and_errors, \
-        unicode_from_string, decode_object
+        getdefaultencoding
+    from pypy.objspace.std.ropeunicodeobject import decode_string
     encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors)
-    if encoding is None and errors is None:
-        return unicode_from_string(space, w_string)
-    return decode_object(space, w_string, encoding, errors)
+    if encoding is None:
+        encoding = getdefaultencoding(space)
+    return decode_string(space, w_string, encoding, errors)
 
 def str_encode__Rope_ANY_ANY(space, w_string, w_encoding=None, w_errors=None):
     from pypy.objspace.std.unicodetype import _get_encoding_and_errors, \

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/ropeunicodeobject.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/ropeunicodeobject.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/ropeunicodeobject.py	Mon Nov 19 12:16:46 2007
@@ -19,23 +19,47 @@
     return W_RopeUnicodeObject(rope.rope_from_unicode(uni))
 
 def unicode_from_string(space, w_str):
-    from pypy.objspace.std.unicodetype import getdefaultencoding, \
-        unicode_from_encoded_object
+    from pypy.objspace.std.unicodetype import getdefaultencoding
     assert isinstance(w_str, W_RopeObject)
-    node = w_str._node
     encoding = getdefaultencoding(space)
-    if encoding == 'ascii':
-        result = rope.str_decode_ascii(node)
-        if result is not None:
-            return W_RopeUnicodeObject(result)
-    elif encoding == 'latin-1':
-        assert node.is_bytestring()
-        return W_RopeUnicodeObject(node)
-    elif encoding == "utf-8":
-        result = rope.str_decode_utf8(node)
-        if result is not None:
-            return W_RopeUnicodeObject(result)
-    return unicode_from_encoded_object(space, w_str, encoding, "strict")
+    return decode_string(space, w_str, encoding, "strict")
+
+def decode_string(space, w_str, encoding, errors):
+    from pypy.objspace.std.unicodetype import unicode_from_encoded_object
+    if errors is None or errors == "strict":
+        node = w_str._node
+        if encoding == 'ascii':
+            result = rope.str_decode_ascii(node)
+            if result is not None:
+                return W_RopeUnicodeObject(result)
+        elif encoding == 'latin-1':
+            assert node.is_bytestring()
+            return W_RopeUnicodeObject(node)
+        elif encoding == "utf-8":
+            result = rope.str_decode_utf8(node)
+            if result is not None:
+                return W_RopeUnicodeObject(result)
+    return unicode_from_encoded_object(space, w_str, encoding, errors)
+
+def encode_unicode(space, w_unistr, encoding, errors):
+    from pypy.objspace.std.unicodetype import getdefaultencoding, \
+        _get_encoding_and_errors, encode_object
+    from pypy.objspace.std.ropeobject import W_RopeObject
+    if errors is None or errors == "strict":
+        node = w_unistr._node
+        if encoding == 'ascii':
+            result = rope.unicode_encode_ascii(node)
+            if result is not None:
+                return W_RopeObject(result)
+        elif encoding == 'latin-1':
+            result = rope.unicode_encode_latin1(node)
+            if result is not None:
+                return W_RopeObject(result)
+        elif encoding == "utf-8":
+            result = rope.unicode_encode_utf8(node)
+            if result is not None:
+                return W_RopeObject(result)
+    return encode_object(space, w_unistr, encoding, errors)
 
 
 class W_RopeUnicodeObject(W_Object):
@@ -55,20 +79,34 @@
     def create_if_subclassed(w_self):
         if type(w_self) is W_RopeUnicodeObject:
             return w_self
-        return W_RopeUnicodeObject(w_self._value)
+        return W_RopeUnicodeObject(w_self._node)
 
 W_RopeUnicodeObject.EMPTY = W_RopeUnicodeObject(rope.LiteralStringNode.EMPTY)
 
 registerimplementation(W_RopeUnicodeObject)
 
+def _isspace(uchar_ord):
+    return unicodedb.isspace(uchar_ord)
 
 def ropeunicode_w(space, w_str):
     if isinstance(w_str, W_RopeUnicodeObject):
         return w_str._node
-    # XXX do the right thing for W_RopeObject
+    if isinstance(w_str, W_RopeObject):
+        return unicode_from_string(space, w_str)._node
     return rope.LiteralUnicodeNode(space.unicode_w(w_str))
 
 
+class W_RopeUnicodeIterObject(W_Object):
+    from pypy.objspace.std.itertype import iter_typedef as typedef
+
+    def __init__(w_self, w_rope, index=0):
+        w_self.node = node = w_rope._node
+        w_self.item_iter = rope.ItemIterator(node)
+        w_self.index = index
+
+def iter__RopeUnicode(space, w_uni):
+    return W_RopeUnicodeIterObject(w_uni)
+
 # Helper for converting int/long
 def unicode_to_decimal_w(space, w_unistr):
     if not isinstance(w_unistr, W_RopeUnicodeObject):
@@ -145,7 +183,7 @@
 
 
 def ord__RopeUnicode(space, w_uni):
-    if w_uni.length() != 1:
+    if w_uni._node.length() != 1:
         raise OperationError(space.w_TypeError, space.wrap('ord() expected a character'))
     return space.wrap(w_uni._node.getint(0))
 
@@ -177,7 +215,7 @@
 
 def unicode_join__RopeUnicode_ANY(space, w_self, w_list):
     l_w = space.unpackiterable(w_list)
-    delim = w_self._value
+    delim = w_self._node
     totlen = 0
     if len(l_w) == 0:
         return W_RopeUnicodeObject.EMPTY
@@ -199,7 +237,7 @@
             raise OperationError(space.w_TypeError, w_msg)
         values_list.append(item)
     try:
-        return W_RopeUnicodeObject(rope.join(self, values_list))
+        return W_RopeUnicodeObject(rope.join(w_self._node, values_list))
     except OverflowError:
         raise OperationError(space.w_OverflowError,
                              space.wrap("string too long"))
@@ -223,7 +261,7 @@
     return W_RopeUnicodeObject(uni.getrope(ival))
 
 def getitem__RopeUnicode_Slice(space, w_uni, w_slice):
-    node = w_uni._noed
+    node = w_uni._node
     length = node.length()
     start, stop, step, sl = w_slice.indices4(space, length)
     if sl == 0:
@@ -239,7 +277,7 @@
         raise
     node = w_uni._node
     try:
-        return W_RopeUnicodeObject(rope.multiply(node, mul))
+        return W_RopeUnicodeObject(rope.multiply(node, times))
     except OverflowError:
         raise OperationError(space.w_OverflowError,
                              space.wrap("string too long"))
@@ -317,47 +355,33 @@
             previous_is_cased = False
 
 
-def _strip(space, w_self, w_chars, left, right):
-    "internal function called by str_xstrip methods"
-    XXX
-    u_self = w_self._value
-    u_chars = w_chars._value
-    
-    lpos = 0
-    rpos = len(u_self)
-    
-    if left:
-        while lpos < rpos and u_self[lpos] in u_chars:
-           lpos += 1
-       
-    if right:
-        while rpos > lpos and u_self[rpos - 1] in u_chars:
-           rpos -= 1
-           
-    assert rpos >= 0
-    result = u_self[lpos: rpos]
-    return W_UnicodeObject(result)
+def _contains(i, uni):
+    return unichr(i) in uni
 
 def unicode_strip__RopeUnicode_None(space, w_self, w_chars):
-    return W_RopeUnicodeObject(rope.strip(w_self._none, True, True, _isspace))
+    return W_RopeUnicodeObject(rope.strip(w_self._node, True, True, _isspace))
 def unicode_strip__RopeUnicode_RopeUnicode(space, w_self, w_chars):
-    return _strip(space, w_self, w_chars, 1, 1)
+    return W_RopeUnicodeObject(rope.strip(w_self._node, True, True, _contains,
+                               w_chars._node.flatten_unicode()))
+
 def unicode_strip__RopeUnicode_Rope(space, w_self, w_chars):
     return space.call_method(w_self, 'strip',
                              unicode_from_string(space, w_chars))
 
 def unicode_lstrip__RopeUnicode_None(space, w_self, w_chars):
-    return W_RopeUnicodeObject(rope.strip(w_self._none, True, False, _isspace))
+    return W_RopeUnicodeObject(rope.strip(w_self._node, True, False, _isspace))
 def unicode_lstrip__RopeUnicode_RopeUnicode(space, w_self, w_chars):
-    return _strip(space, w_self, w_chars, 1, 0)
+    return W_RopeUnicodeObject(rope.strip(w_self._node, True, False, _contains,
+                               w_chars._node.flatten_unicode()))
 def unicode_lstrip__RopeUnicode_Rope(space, w_self, w_chars):
     return space.call_method(w_self, 'lstrip',
                              unicode_from_string(space, w_chars))
 
 def unicode_rstrip__RopeUnicode_None(space, w_self, w_chars):
-    return W_RopeUnicodeObject(rope.strip(w_self._none, False, True, _isspace))
+    return W_RopeUnicodeObject(rope.strip(w_self._node, False, True, _isspace))
 def unicode_rstrip__RopeUnicode_RopeUnicode(space, w_self, w_chars):
-    return _strip(space, w_self, w_chars, 0, 1)
+    return W_RopeUnicodeObject(rope.strip(w_self._node, False, True, _contains,
+                               w_chars._node.flatten_unicode()))
 def unicode_rstrip__RopeUnicode_Rope(space, w_self, w_chars):
     return space.call_method(w_self, 'rstrip',
                              unicode_from_string(space, w_chars))
@@ -383,7 +407,7 @@
     iter = rope.ItemIterator(input)
 
     previous_is_cased = False
-    for i in range(len(input)):
+    for i in range(input.length()):
         unichar = iter.nextint()
         if previous_is_cased:
             result[i] = unichr(unicodedb.tolower(unichar))
@@ -436,30 +460,6 @@
 
     return (self, start, end)
 
-def _check_startswith_substring(str, substr, start, end):
-    XXX
-    substr_len = len(substr)
-    
-    if end - start < substr_len:
-        return False # substring is too long
-    
-    for i in range(substr_len):
-        if str[start + i] != substr[i]:
-            return False
-    return True    
-
-def _check_endswith_substring(str, substr, start, end):
-    XXX
-    substr_len = len(substr)
-
-    if end - start < substr_len:
-        return False # substring is too long
-    start = end - substr_len
-    for i in range(substr_len):
-        if str[start + i] != substr[i]:
-            return False
-    return True
-
 def unicode_endswith__RopeUnicode_RopeUnicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
     self, start, end = _convert_idx_params(space, w_self, w_start, w_end)
     return space.newbool(rope.endswith(self, w_substr._node, start, end))
@@ -477,8 +477,8 @@
                                               w_start, w_end):
     unistr, start, end = _convert_idx_params(space, w_unistr, w_start, w_end)
     for w_prefix in space.unpacktuple(w_prefixes):
-        prefix = unicoderope_w(space, w_prefix)
-        if rope.startswith(self, prefxix, start, end)
+        prefix = ropeunicode_w(space, w_prefix)
+        if rope.startswith(unistr, prefix, start, end):
             return space.w_True
     return space.w_False
 
@@ -486,8 +486,8 @@
                                             w_start, w_end):
     unistr, start, end = _convert_idx_params(space, w_unistr, w_start, w_end)
     for w_suffix in space.unpacktuple(w_suffixes):
-        suffix = unicoderope_w(space, w_suffix)
-        if _check_endswith_substring(unistr, suffix, start, end):
+        suffix = ropeunicode_w(space, w_suffix)
+        if rope.endswith(unistr, suffix, start, end):
             return space.w_True
     return space.w_False
 
@@ -508,7 +508,7 @@
 
 def unicode_center__RopeUnicode_ANY_ANY(space, w_self, w_width, w_fillchar):
     self = w_self._node
-    length = self.length
+    length = self.length()
     width = space.int_w(w_width)
     fillchar = _to_unichar_w(space, w_fillchar)
     padding = width - length
@@ -517,82 +517,57 @@
     offset = padding // 2
     pre = rope.multiply(fillchar, offset)
     post = rope.multiply(fillchar, (padding - offset))
-    centered = rope.rebalance([pre, node, post])
+    centered = rope.rebalance([pre, self, post])
     return W_RopeUnicodeObject(centered)
 
 def unicode_ljust__RopeUnicode_ANY_ANY(space, w_self, w_width, w_fillchar):
     self = w_self._node
-    length = self.length
+    length = self.length()
     width = space.int_w(w_width)
     fillchar = _to_unichar_w(space, w_fillchar)
     padding = width - length
     if padding < 0:
         return w_self.create_if_subclassed()
-    resultnode = rope.concatenate(self, rope.multiply(fillchar, d))
+    resultnode = rope.concatenate(self, rope.multiply(fillchar, padding))
     return W_RopeUnicodeObject(resultnode)
 
 def unicode_rjust__RopeUnicode_ANY_ANY(space, w_self, w_width, w_fillchar):
     self = w_self._node
-    length = self.length
+    length = self.length()
     width = space.int_w(w_width)
     fillchar = _to_unichar_w(space, w_fillchar)
     padding = width - length
     if padding < 0:
         return w_self.create_if_subclassed()
-    resultnode = rope.concatenate(rope.multiply(fillchar, d), self)
+    resultnode = rope.concatenate(rope.multiply(fillchar, padding), self)
     return W_RopeUnicodeObject(resultnode)
     
 def unicode_zfill__RopeUnicode_ANY(space, w_self, w_width):
-    self = w_self._value
-    length = self.length
+    self = w_self._node
+    length = self.length()
     width = space.int_w(w_width)
+    zero = rope.LiteralStringNode.PREBUILT[ord("0")]
     if self.length() == 0:
         return W_RopeUnicodeObject(
-            rope.multiply(rope.LiteralStringNode.PREBUILT[ord(" ")], width))
-    padding = width - self.length()
+            rope.multiply(zero, width))
+    padding = width - length
     if padding <= 0:
         return w_self.create_if_subclassed()
     firstchar = self.getunichar(0)
     if firstchar in (u'+', u'-'):
         return W_RopeUnicodeObject(rope.rebalance(
             [rope.LiteralStringNode.PREBUILT[ord(firstchar)],
-             rope.multiply(zero, middle),
-             rope.getslice_one(node, 1, length)]))
+             rope.multiply(zero, padding),
+             rope.getslice_one(self, 1, length)]))
     else:
-        middle = width - length
         return W_RopeUnicodeObject(rope.concatenate(
-            rope.multiply(zero, middle), node))
+            rope.multiply(zero, padding), self))
 
 def unicode_splitlines__RopeUnicode_ANY(space, w_self, w_keepends):
-    XXX
-    self = w_self._value
-    keepends = 0
-    if space.int_w(w_keepends):
-        keepends = 1
-    if len(self) == 0:
-        return space.newlist([])
-    
-    start = 0
-    end = len(self)
-    pos = 0
-    lines = []
-    while pos < end:
-        if unicodedb.islinebreak(ord(self[pos])):
-            if (self[pos] == u'\r' and pos + 1 < end and
-                self[pos + 1] == u'\n'):
-                # Count CRLF as one linebreak
-                lines.append(W_UnicodeObject(self[start:pos + keepends * 2]))
-                pos += 1
-            else:
-                lines.append(W_UnicodeObject(self[start:pos + keepends]))
-            pos += 1
-            start = pos
-        else:
-            pos += 1
-    if not unicodedb.islinebreak(ord(self[end - 1])):
-        lines.append(W_UnicodeObject(self[start:]))
-    return space.newlist(lines)
-
+    keepends = bool(space.int_w(w_keepends))  # truth value, but type checked
+    node = w_self._node
+    return space.newlist(
+        [W_RopeUnicodeObject(n) for n in rope.splitlines(node, keepends)])
 
 def unicode_find__RopeUnicode_RopeUnicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
     self, start, end = _convert_idx_params(space, w_self, w_start, w_end)
@@ -629,86 +604,44 @@
 
 def unicode_count__RopeUnicode_RopeUnicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
     self, start, end = _convert_idx_params(space, w_self, w_start, w_end)
-    substr = w_substr._value
-    return space.wrap(self.count(substr, start, end))
+    assert start >= 0
+    assert end >= 0
+    iter = rope.FindIterator(self, w_substr._node, start, end)
+    i = 0
+    while 1:
+        try:
+            index = iter.next()
+        except StopIteration:
+            break
+        i += 1
+    return wrapint(space, i)
 
 def unicode_split__RopeUnicode_None_ANY(space, w_self, w_none, w_maxsplit):
-    self = w_self._value
+    selfnode = w_self._node
     maxsplit = space.int_w(w_maxsplit)
-    parts = []
-    if len(self) == 0:
-        return space.newlist([])
-    start = 0
-    end = len(self)
-    inword = 0
-
-    while maxsplit != 0 and start < end:
-        index = start
-        for index in range(start, end):
-            if _isspace(self[index]):
-                break
-            else:
-                inword = 1
-        else:
-            break
-        if inword == 1:
-            parts.append(W_UnicodeObject(self[start:index]))
-            maxsplit -= 1
-        # Eat whitespace
-        for start in range(index + 1, end):
-            if not _isspace(self[start]):
-                break
-        else:
-            return space.newlist(parts)
-
-    parts.append(W_UnicodeObject(self[start:]))
-    return space.newlist(parts)
+    res_w = [W_RopeUnicodeObject(node)
+                for node in rope.split_chars(selfnode, maxsplit, _isspace)]
+    return space.newlist(res_w)
 
 def unicode_split__RopeUnicode_RopeUnicode_ANY(space, w_self, w_delim, w_maxsplit):
     maxsplit = space.int_w(w_maxsplit)
     start = 0
     selfnode = w_self._node
-    bynode = w_by._node
-    bylen = bynode.length()
-    if bylen == 0:
+    delimnode = w_delim._node
+    delimlen = delimnode.length()
+    if delimlen == 0:
         raise OperationError(space.w_ValueError, space.wrap("empty separator"))
     res_w = [W_RopeUnicodeObject(node)
-                for node in rope.split(selfnode, bynode, maxsplit)]
+                for node in rope.split(selfnode, delimnode, maxsplit)]
     return space.newlist(res_w)
 
 def unicode_rsplit__RopeUnicode_None_ANY(space, w_self, w_none, w_maxsplit):
-    XXX
-    self = w_self._value
+    selfnode = w_self._node
     maxsplit = space.int_w(w_maxsplit)
-    parts = []
-    if len(self) == 0:
-        return space.newlist([])
-    start = 0
-    end = len(self)
-    inword = 0
-
-    while maxsplit != 0 and start < end:
-        index = end
-        for index in range(end-1, start-1, -1):
-            if _isspace(self[index]):
-                break
-            else:
-                inword = 1
-        else:
-            break
-        if inword == 1:
-            parts.append(W_UnicodeObject(self[index+1:end]))
-            maxsplit -= 1
-        # Eat whitespace
-        for end in range(index, start-1, -1):
-            if not _isspace(self[end-1]):
-                break
-        else:
-            return space.newlist(parts)
+    res_w = [W_RopeUnicodeObject(node)
+                for node in rope.rsplit_chars(selfnode, maxsplit, _isspace)]
+    return space.newlist(res_w)
 
-    parts.append(W_UnicodeObject(self[:end]))
-    parts.reverse()
-    return space.newlist(parts)
 
 def unicode_rsplit__RopeUnicode_RopeUnicode_ANY(space, w_self, w_delim, w_maxsplit):
     # XXX works but flattens
@@ -733,7 +666,7 @@
         end = index
         maxsplit -= 1
     parts.append(W_RopeUnicodeObject(
-        rope.getslice_one(w_self._node, 0, :end)))
+        rope.getslice_one(w_self._node, 0, end)))
     parts.reverse()
     return space.newlist(parts)
 
@@ -741,7 +674,7 @@
     if maxsplit == 0:
         return [self]
     index = 0
-    end = len(self)
+    end = self.length()
     parts = [rope.LiteralStringNode.EMPTY]
     maxsplit -= 1
     while maxsplit != 0:
@@ -750,50 +683,40 @@
         parts.append(self.getrope(index))
         index += 1
         maxsplit -= 1
-    parts.append(rope.getslice_one(self, index, self.length())
+    parts.append(rope.getslice_one(self, index, self.length()))
     return parts
 
 def unicode_replace__RopeUnicode_RopeUnicode_RopeUnicode_ANY(
         space, w_self, w_old, w_new, w_maxsplit):
+    self = w_self._node
     old = w_old._node
-    oldlength = old.length
+    maxsplit = space.int_w(w_maxsplit)
+    oldlength = old.length()
     if not oldlength:
-        self = w_self._value
-        maxsplit = space.int_w(w_maxsplit)
         parts = _split_into_chars(self, maxsplit)
         return W_RopeUnicodeObject(rope.join(w_new._node, parts))
-    substrings = rope.split(node, old, maxsplit)
+    substrings = rope.split(self, old, maxsplit)
     if not substrings:
         return w_self.create_if_subclassed()
-    substrings = rope.split(node, sub, maxsplit)
-    if substrings is None:
-        return w_self.create_if_subclassed()
     try:
-        return W_RopeObject(rope.join(by, substrings))
+        return W_RopeUnicodeObject(rope.join(w_new._node, substrings))
     except OverflowError:
         raise OperationError(space.w_OverflowError,
                              space.wrap("string too long"))
-    try:
-        return W_RopeObject(rope.join(by, substrings))
-    except OverflowError:
-        raise OperationError(space.w_OverflowError,
-                             space.wrap("string too long"))
-    return W_UnicodeObject(w_new._value.join(parts))
     
 
-def unicode_encode__Unicode_ANY_ANY(space, w_unistr,
-                                    w_encoding=None,
-                                    w_errors=None):
+def unicode_encode__RopeUnicode_ANY_ANY(space, w_unistr,
+                                        w_encoding=None,
+                                        w_errors=None):
 
     from pypy.objspace.std.unicodetype import getdefaultencoding, \
-        _get_encoding_and_errors, encode_object
+        _get_encoding_and_errors
     encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors)
     if encoding is None:
         encoding = getdefaultencoding(space)
-    w_retval = encode_object(space, w_unistr, encoding, errors)
-    return w_retval
+    return encode_unicode(space, w_unistr, encoding, errors)
 
-def unicode_partition__Unicode_Unicode(space, w_unistr, w_unisub):
+def unicode_partition__RopeUnicode_RopeUnicode(space, w_unistr, w_unisub):
     self = w_unistr._node
     sub = w_unisub._node
     if not sub.length():
@@ -810,7 +733,7 @@
              W_RopeUnicodeObject(rope.getslice_one(self, pos + sub.length(),
                                             self.length()))])
 
-def unicode_rpartition__Unicode_Unicode(space, w_unistr, w_unisub):
+def unicode_rpartition__RopeUnicode_RopeUnicode(space, w_unistr, w_unisub):
     # XXX works but flattens
     unistr = w_unistr._node.flatten_unicode()
     unisub = w_unisub._node.flatten_unicode()
@@ -819,8 +742,8 @@
                              space.wrap("empty separator"))
     pos = unistr.rfind(unisub)
     if pos == -1:
-        return space.newtuple([W_UnicodeObject.EMPTY,
-                               W_UnicodeObject.EMPTY, w_unistr])
+        return space.newtuple([W_RopeUnicodeObject.EMPTY,
+                               W_RopeUnicodeObject.EMPTY, w_unistr])
     else:
         assert pos > 0
         return space.newtuple([space.wrap(unistr[:pos]), w_unisub,
@@ -828,9 +751,10 @@
 
 
 def unicode_expandtabs__RopeUnicode_ANY(space, w_self, w_tabsize):
+    from pypy.objspace.std.ropeobject import _tabindent
     self = w_self._node
     tabsize  = space.int_w(w_tabsize)
-    splitted = rope.split(node, rope.LiteralStringNode.PREBUILT[ord('\t')])
+    splitted = rope.split(self, rope.LiteralStringNode.PREBUILT[ord('\t')])
     last = splitted[0]
     expanded = [last]
     for i in range(1, len(splitted)):
@@ -865,9 +789,9 @@
                     raise OperationError(
                             space.w_TypeError,
                             space.wrap("character mapping must be in range(0x%x)" % (maxunicode + 1,)))
-                result.append(unichr(newval))
+                result.append(rope.rope_from_unichar(unichr(newval)))
             elif space.is_true(space.isinstance(w_newval, space.w_unicode)):
-                result.append(unicoderope_w(w_newval))
+                result.append(ropeunicode_w(space, w_newval))
             else:
                 raise OperationError(
                     space.w_TypeError,
@@ -877,11 +801,11 @@
 # Move this into the _codecs module as 'unicodeescape_string (Remember to cater for quotes)'
 def repr__RopeUnicode(space, w_unicode):
     hexdigits = "0123456789abcdef"
-    chars = w_unicode._node
+    node = w_unicode._node
     size = node.length()
     
     singlequote = doublequote = False
-    iter = rope.ItemIterator()
+    iter = rope.ItemIterator(node)
     for i in range(size):
         c = iter.nextunichar()
         if singlequote and doublequote:
@@ -895,7 +819,7 @@
     else:
         quote = '\''
     result = ['u', quote]
-    iter = rope.ItemIterator()
+    iter = rope.ItemIterator(node)
     j = 0
     while j < size:
         code = iter.nextint()
@@ -915,6 +839,8 @@
         if code >= 0xD800 and code < 0xDC00:
             if j < size - 1:
                 code2 = iter.nextint()
+                # XXX this is wrong: if the check below is false,
+                # code2 has already been consumed from iter and is lost
                 if code2 >= 0xDC00 and code2 <= 0xDFFF:
                     code = (((code & 0x03FF) << 10) | (code2 & 0x03FF)) + 0x00010000
                     result.extend(["U",
@@ -975,6 +901,34 @@
     return mod_format(space, w_format, w_values, do_unicode=True)
 
 
+# methods of the iterator
+
+def iter__RopeUnicodeIter(space, w_ropeiter):
+    return w_ropeiter
+
+def next__RopeUnicodeIter(space, w_ropeiter):
+    if w_ropeiter.node is None:
+        raise OperationError(space.w_StopIteration, space.w_None) 
+    try:
+        unichar = w_ropeiter.item_iter.nextunichar()
+        w_item = space.wrap(unichar)
+    except StopIteration:
+        w_ropeiter.node = None
+        w_ropeiter.char_iter = None
+        raise OperationError(space.w_StopIteration, space.w_None) 
+    w_ropeiter.index += 1 
+    return w_item
+
+def len__RopeUnicodeIter(space,  w_ropeiter):
+    if w_ropeiter.node is None:
+        return wrapint(space, 0)
+    index = w_ropeiter.index
+    length = w_ropeiter.node.length()
+    result = length - index
+    if result < 0:
+        return wrapint(space, 0)
+    return wrapint(space, result)
+
 import unicodetype
 register_all(vars(), unicodetype)
 

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/stringobject.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/stringobject.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/stringobject.py	Mon Nov 19 12:16:46 2007
@@ -589,6 +589,10 @@
     (u_self, _, start, end) = _convert_idx_params(space, w_self,
                                                   space.wrap(''), w_start, w_end)
     for w_suffix in space.unpacktuple(w_suffixes):
+        if space.is_true(space.isinstance(w_suffix, space.w_unicode)):
+            w_u = space.call_function(space.w_unicode, w_self)
+            return space.call_method(w_u, "endswith", w_suffixes, w_start,
+                                     w_end)
         suffix = space.str_w(w_suffix) 
         if stringendswith(u_self, suffix, start, end):
             return space.w_True
@@ -603,6 +607,10 @@
     (u_self, _, start, end) = _convert_idx_params(space, w_self, space.wrap(''),
                                                   w_start, w_end)
     for w_prefix in space.unpacktuple(w_prefixes):
+        if space.is_true(space.isinstance(w_prefix, space.w_unicode)):
+            w_u = space.call_function(space.w_unicode, w_self)
+            return space.call_method(w_u, "startswith", w_prefixes, w_start,
+                                     w_end)
         prefix = space.str_w(w_prefix)
         if stringstartswith(u_self, prefix, start, end):
             return space.w_True

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/test/test_rope.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/test/test_rope.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/test/test_rope.py	Mon Nov 19 12:16:46 2007
@@ -129,6 +129,21 @@
                 for step in range(1, stop - start):
                     assert getslice(s, start, stop, step).flatten_string() == result[start:stop:step]
 
+def test_getslice_step_unicode():
+    s1 = (LiteralUnicodeNode(u"\uaaaa") +
+          LiteralUnicodeNode(u"\ubbbb" * 5) +
+          LiteralUnicodeNode(u"\uaaaa\ubbbb\u1000\u2000") +
+          LiteralUnicodeNode(u"vwxyz") + 
+          LiteralUnicodeNode(u"zyxwvu\u1234" * 2) +
+          LiteralUnicodeNode(u"12355"))
+    s2 = s1.rebalance()
+    result = s1.flatten_unicode()
+    assert s2.flatten_unicode() == result
+    for s in [s1, s2]:
+        for start in range(0, len(result)):
+            for stop in range(start, len(result)):
+                for step in range(1, stop - start):
+                    assert getslice(s, start, stop, step).flatten_unicode() == result[start:stop:step]
 
 def test_random_addition_and_slicing():
     seed = random.randrange(10000)
@@ -181,6 +196,17 @@
         assert c2 == ord(c)
     py.test.raises(StopIteration, iter.nextchar)
 
+def test_iteration_startpos():
+    rope, real_st = make_random_string(200)
+    for i in range(0, len(real_st), len(real_st) // 20):
+        iter = ItemIterator(rope, i)
+        x = i
+        for c in real_st[i:]:
+            x += 1
+            c2 = iter.nextchar()
+            assert c2 == c
+        py.test.raises(StopIteration, iter.nextchar)
+
 def test_iteration_unicode():
     rope, real_st = make_random_string(200, unicode=True)
     iter = ItemIterator(rope)
@@ -381,7 +407,6 @@
     rope = getslice_one(rope, 10, 100)
     st = st[10:100]
     for i in range(len(st)):
-        print i
         for j in range(i + 1, len(st)):
             c = st[i:j][(j - i) // 2]
             pos = find_int(rope, ord(c), i, j)
@@ -518,7 +543,6 @@
             hashes[(h & 0xff0000) >> 16] += 1
     for h in hashes:
         assert h > 300
-    print hashes
 
 def test_hash_distribution_small_strings():
     random.seed(42) # prevent randomly failing test
@@ -581,7 +605,7 @@
             assert s.hash_part() == h
 
 def test_hash_part_unicode():
-    a, st = make_random_string(unicode=True)
+    a, st = make_random_string(5, unicode=True)
     h = a.hash_part()
     for split in range(1, len(st) - 1):
         s1 = LiteralUnicodeNode(st[:split])
@@ -714,3 +738,31 @@
         assert len(l1) == len(l2)
         for n, s in zip(l1, l2):
             assert n.flatten_string() == s
+
+def test_splitlines():
+    seps = [(LiteralStringNode("\n"), "\n"), (LiteralStringNode("\r"), "\r"),
+            (LiteralStringNode("\r\n"), "\r\n")]
+    l, strs = zip(*[(LiteralStringNode("xafnarsp"), "xafnarsp"),
+                    (LiteralStringNode("xyzaaaa"), "xyzaaaa"),
+                    (LiteralStringNode("wxxxx"), "wxxxx")])
+    l = list(l)
+    for s, st in seps:
+        node = join(s, l)
+        l2 = splitlines(node)
+        for n1, n2 in zip(l, l2):
+            assert n1.flatten_string() == n2.flatten_string()
+    for keepends in [True, False]:
+        l1 = splitlines(LiteralStringNode("ab\nab\n\raba\rba"), keepends)
+        l2 = "ab\nab\n\raba\rba".splitlines(keepends)
+        assert len(l1) == len(l2)
+        for n, s in zip(l1, l2):
+            assert n.flatten_string() == s
+
+def test_rope_from_unicode():
+    node = rope_from_unicode(u"aaabbbbbcccdddeeefffggggnnn")
+    assert node.is_bytestring()
+    assert node.is_ascii()
+    node = rope_from_unicode(u"a" * 30 + u"\ufffd" * 30 + "x" * 30)
+    assert node.length() == 90
+    assert not node.is_ascii()
+    assert not node.is_bytestring()

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/test/test_unicodeobject.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/test/test_unicodeobject.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/test/test_unicodeobject.py	Mon Nov 19 12:16:46 2007
@@ -307,11 +307,10 @@
         raises(TypeError, u'hello'.startswith, (42,))
 
     def test_startswith_endswith_convert(self):
-        skip("fix me")
         assert 'hello'.startswith((u'he\u1111', u'he'))
         assert not 'hello'.startswith((u'lo\u1111', u'llo'))
         assert 'hello'.startswith((u'hellox\u1111', u'hello'))
-        assert 'hello'.startswith((u'lo', u'he\u1111'), 0, -1)
+        assert not 'hello'.startswith((u'lo', u'he\u1111'), 0, -1)
         assert not 'hello'.endswith((u'he\u1111', u'he'))
         assert 'hello'.endswith((u'\u1111lo', u'llo'))
         assert 'hello'.endswith((u'\u1111hellox', u'hello'))

Modified: pypy/branch/ropes-unicode/pypy/objspace/std/unicodetype.py
==============================================================================
--- pypy/branch/ropes-unicode/pypy/objspace/std/unicodetype.py	(original)
+++ pypy/branch/ropes-unicode/pypy/objspace/std/unicodetype.py	Mon Nov 19 12:16:46 2007
@@ -265,7 +265,7 @@
             w_value = unicode_from_object(space, w_obj)
     else:
         w_value = unicode_from_encoded_object(space, w_obj, encoding, errors)
-    if space.config.objspace.std.withrope:
+    if space.config.objspace.std.withropeunicode:
         assert isinstance(w_value, W_RopeUnicodeObject)
         w_newobj = space.allocate_instance(W_RopeUnicodeObject, w_unicodetype)
         W_RopeUnicodeObject.__init__(w_newobj, w_value._node)



More information about the Pypy-commit mailing list