[pypy-svn] r12419 - in pypy/branch/non-fake-unicode/pypy/objspace/std: . test

ac at codespeak.net ac at codespeak.net
Tue May 17 17:56:58 CEST 2005


Author: ac
Date: Tue May 17 17:56:58 2005
New Revision: 12419

Modified:
   pypy/branch/non-fake-unicode/pypy/objspace/std/test/test_unicodestring.py
   pypy/branch/non-fake-unicode/pypy/objspace/std/unicodeobject.py
   pypy/branch/non-fake-unicode/pypy/objspace/std/unicodetype.py
Log:
Add the remaining methods to the unicode type. Only __mod__ is still missing.

Modified: pypy/branch/non-fake-unicode/pypy/objspace/std/test/test_unicodestring.py
==============================================================================
--- pypy/branch/non-fake-unicode/pypy/objspace/std/test/test_unicodestring.py	(original)
+++ pypy/branch/non-fake-unicode/pypy/objspace/std/test/test_unicodestring.py	Tue May 17 17:56:58 2005
@@ -38,3 +38,29 @@
     def test_contains(self):
         assert u'a' in 'abc'
         assert 'a' in u'abc'
+
+    def test_splitlines(self):
+        assert u''.splitlines() == []
+        assert u''.splitlines(1) == []
+        assert u'\n'.splitlines() == [u'']
+        assert u'a'.splitlines() == [u'a']
+        assert u'one\ntwo'.splitlines() == [u'one', u'two']
+        assert u'\ntwo\nthree'.splitlines() == [u'', u'two', u'three']
+        assert u'\n\n'.splitlines() == [u'', u'']
+        assert u'a\nb\nc'.splitlines(1) == [u'a\n', u'b\n', u'c']
+        assert u'\na\nb\n'.splitlines(1) == [u'\n', u'a\n', u'b\n']
+
+    def test_zfill(self):
+        assert u'123'.zfill(6) == u'000123'
+        assert u'123'.zfill(2) == u'123'
+        assert u'123'.zfill(6) == u'000123'
+        assert u'+123'.zfill(2) == u'+123'
+        assert u'+123'.zfill(4) == u'+123'
+        assert u'+123'.zfill(6) == u'+00123'
+
+    def test_split(self):
+        assert (u'this is the split function'.split() ==
+                [u'this', u'is', u'the', u'split', u'function'])
+        assert (u'this!is!the!split!function'.split('!') ==
+                [u'this', u'is', u'the', u'split', u'function'])
+    

Modified: pypy/branch/non-fake-unicode/pypy/objspace/std/unicodeobject.py
==============================================================================
--- pypy/branch/non-fake-unicode/pypy/objspace/std/unicodeobject.py	(original)
+++ pypy/branch/non-fake-unicode/pypy/objspace/std/unicodeobject.py	Tue May 17 17:56:58 2005
@@ -1,4 +1,5 @@
 from pypy.objspace.std.objspace import *
+from pypy.interpreter import gateway
 from pypy.objspace.std.fake import wrap_exception
 from pypy.objspace.std.stringobject import W_StringObject
 from pypy.objspace.std.noneobject import W_NoneObject
@@ -25,26 +26,23 @@
 registerimplementation(W_UnicodeObject)
 
 # Helper for converting int/long
-import unicodedata
 def unicode_to_decimal_w(space, w_unistr):
     unistr = w_unistr._value
     result = ['\0'] * len(unistr)
     digits = [ '0', '1', '2', '3', '4',
                '5', '6', '7', '8', '9']
     for i in xrange(len(unistr)):
-        uchr = unistr[i]
-        if _isspace(uchr):
+        uchr = ord(unistr[i])
+        if unicodedb.isspace(uchr):
             result[i] = ' '
             continue
         try:
-            result[i] = digits[unicodedata.decimal(uchr)]
-            continue
-        except ValueError:
-            ch = ord(uchr)
-            if 0 < ch < 256:
-                result[i] = chr(ch)
-                continue
-        raise OperationError(space.w_UnicodeEncodeError, space.wrap('invalid decimal Unicode string'))
+            result[i] = digits[unicodedb.decimal(uchr)]
+        except KeyError:
+            if 0 < uchr < 256:
+                result[i] = chr(uchr)
+            else:
+                raise OperationError(space.w_UnicodeEncodeError, space.wrap('invalid decimal Unicode string'))
     return ''.join(result)
 
 # string-to-unicode delegation
@@ -77,7 +75,14 @@
     if test > 0:
         return space.wrap(1)
     return space.wrap(0)
-    
+
+def cmp__Unicode_ANY(space, w_left, w_right):
+    try:
+        w_right = space.call_function(space.w_unicode, w_right)
+    except:
+        return space.wrap(1)
+    return space.cmp(w_left, w_right)
+
 def ord__Unicode(space, w_uni):
     if len(w_uni._value) != 1:
         raise OperationError(space.w_TypeError, space.wrap('ord() expected a character'))
@@ -104,18 +109,36 @@
 def contains__String_Unicode(space, w_container, w_item):
     return space.contains(space.call_function(space.w_unicode, w_container), w_item )
 
+def _find(self, sub, start, end):
+    if len(sub) == 0:
+        return start
+    if start >= end:
+        return -1
+    for i in range(start, end - len(sub) + 1):
+        for j in range(len(sub)):
+            if self[i + j]  != sub[j]:
+                break
+        else:
+            return i
+    return -1
+
+def _rfind(self, sub, start, end):
+    if len(sub) == 0:
+        return end
+    if end - start < len(sub):
+        return -1
+    for i in range(end - len(sub), start - 1, -1):
+        for j in range(len(sub)):
+            if self[i + j]  != sub[j]:
+                break
+        else:
+            return i
+    return -1
+
 def contains__Unicode_Unicode(space, w_container, w_item):
     item = w_item._value
     container = w_container._value
-    if len(item) == 0:
-        return space.w_True
-    for i in range(len(container) - len(item) + 1):
-        for j in range(len(item)):
-            if container[i + j]  != item[j]:
-                break
-        else:
-            return space.w_True
-    return space.w_False
+    return space.newbool(_find(container, item, 0, len(container)) >= 0)
 
 def unicode_join__Unicode_ANY(space, w_self, w_list):
     list = space.unpackiterable(w_list)
@@ -123,12 +146,20 @@
     totlen = 0
     if len(list) == 0:
         return W_UnicodeObject(space, [])
-    if len(list) == 1:
-        return space.call_function(space.w_unicode, list[0])
     for i in range(len(list)):
-        list[i] = space.call_function(space.w_unicode, list[i])._value
+        item = list[i]
+        if space.is_true(space.isinstance(item, space.w_unicode)):
+            list[i] = item._value
+        elif space.is_true(space.isinstance(item, space.w_str)):
+            list[i] = space.call_function(space.w_unicode, item)._value
+        else:
+            w_msg = space.mod(space.wrap('sequence item %d: expected string or Unicode'),
+                              space.wrap(i))
+            raise OperationError(space.w_TypeError, w_msg)
         totlen += len(list[i])
     totlen += len(delim) * (len(list) - 1)
+    if len(list) == 1:
+        return W_UnicodeObject(space, list[0])
     # Allocate result
     result = [u'\0'] * totlen
     first = list[0]
@@ -152,6 +183,7 @@
         return space.wrap(u''.join(w_self._value).encode(space.str_w(w_encoding), space.str_w(w_errors)))
     except:
         wrap_exception(space)
+
 def unicode_encode__Unicode_String_None(space, w_self, w_encoding, w_none):
     try:
         return space.wrap(u''.join(w_self._value).encode(space.str_w(w_encoding)))
@@ -198,6 +230,12 @@
     r = [uni[start + i*step] for i in range(sl)]
     return W_UnicodeObject(space, r)
 
+def unicode_getslice__Unicode_ANY_ANY(space, w_uni, w_start, w_end):
+    w_slice = space.call_function(space.w_slice, w_start, w_end)
+    uni = w_uni._value
+    length = len(uni)
+    start, stop, step, sl = slicetype.indices4(space, w_slice, length)
+    return W_UnicodeObject(space, uni[start:stop])
 
 def mul__Unicode_ANY(space, w_uni, w_times):
     chars = w_uni._value
@@ -206,11 +244,14 @@
     if times <= 0 or charlen == 0:
         return W_UnicodeObject(space, [])
     if times == 1:
-        return w_uni
+        return space.call_function(space.w_unicode, w_uni)
     if charlen == 1:
         return W_UnicodeObject(space, [w_uni._value[0]] * times)
 
-    result = [u'\0'] * (charlen * times)
+    try:
+        result = [u'\0'] * (charlen * times)
+    except OverflowError:
+        raise OperationError(space.w_OverflowError, space.wrap('repeated string is too long'))
     for i in range(times):
         offset = i * charlen
         for j in range(charlen):
@@ -221,11 +262,93 @@
     return space.mul(w_uni, w_times)
 
 def _isspace(uchar):
-    code = ord(uchar)
-    try:
-        return unicodedb.category[code] == 'Zs' or unicodedb.bidirectional[code] in ("WS", "B", "S")
-    except:
-        return False
+    return unicodedb.isspace(ord(uchar))
+
+def unicode_isspace__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isspace(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_isalpha__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isalpha(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_isalnum__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not (unicodedb.isalpha(ord(uchar)) or
+                unicodedb.isnumeric(ord(uchar))):
+            return space.w_False
+    return space.w_True
+
+def unicode_isdecimal__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isdecimal(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_isdigit__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isdigit(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_isnumeric__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isnumeric(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_islower__Unicode(space, w_unicode):
+    cased = False
+    for uchar in w_unicode._value:
+        if (unicodedb.isupper(ord(uchar)) or
+            unicodedb.istitle(ord(uchar))):
+            return space.w_False
+        if not cased and unicodedb.islower(ord(uchar)):
+            cased = True
+    return space.newbool(cased)
+
+def unicode_isupper__Unicode(space, w_unicode):
+    cased = False
+    for uchar in w_unicode._value:
+        if (unicodedb.islower(ord(uchar)) or
+            unicodedb.istitle(ord(uchar))):
+            return space.w_False
+        if not cased and unicodedb.isupper(ord(uchar)):
+            cased = True
+    return space.newbool(cased)
+
+def unicode_istitle__Unicode(space, w_unicode):
+    cased = False
+    previous_is_cased = False
+    for uchar in w_unicode._value:
+        if (unicodedb.isupper(ord(uchar)) or
+            unicodedb.istitle(ord(uchar))):
+            if previous_is_cased:
+                return space.w_False
+            previous_is_cased = cased = True
+        elif unicodedb.islower(ord(uchar)):
+            if not previous_is_cased:
+                return space.w_False
+            previous_is_cased = cased = True
+        else:
+            previous_is_cased = False
+    return space.newbool(cased)
 
 def _strip(space, w_self, w_chars, left, right):
     "internal function called by str_xstrip methods"
@@ -292,6 +415,361 @@
     return space.call_method(w_self, 'rstrip',
                              space.call_function(space.w_unicode, w_chars))
 
+def unicode_capitalize__Unicode(space, w_self):
+    input = w_self._value
+    if len(input) == 0:
+        return W_UnicodeObject(space, [])
+    result = [u'\0'] * len(input)
+    result[0] = unichr(unicodedb.toupper(ord(input[0])))
+    for i in range(1, len(input)):
+        result[i] = unichr(unicodedb.tolower(ord(input[i])))
+    return W_UnicodeObject(space, result)
+
+def unicode_title__Unicode(space, w_self):
+    input = w_self._value
+    if len(input) == 0:
+        return w_self
+    result = [u'\0'] * len(input)
+
+    previous_is_cased = 0
+    for i in range(len(input)):
+        unichar = ord(input[i])
+        if previous_is_cased:
+            result[i] = unichr(unicodedb.tolower(unichar))
+        else:
+            result[i] = unichr(unicodedb.totitle(unichar))
+        previous_is_cased = unicodedb.iscased(unichar)
+    return W_UnicodeObject(space, result)
+
+def unicode_lower__Unicode(space, w_self):
+    input = w_self._value
+    result = [u'\0'] * len(input)
+    for i in range(len(input)):
+        result[i] = unichr(unicodedb.tolower(ord(input[i])))
+    return W_UnicodeObject(space, result)
+
+def unicode_upper__Unicode(space, w_self):
+    input = w_self._value
+    result = [u'\0'] * len(input)
+    for i in range(len(input)):
+        result[i] = unichr(unicodedb.toupper(ord(input[i])))
+    return W_UnicodeObject(space, result)
+
+def unicode_swapcase__Unicode(space, w_self):
+    input = w_self._value
+    result = [u'\0'] * len(input)
+    for i in range(len(input)):
+        unichar = ord(input[i])
+        if unicodedb.islower(unichar):
+            result[i] = unichr(unicodedb.toupper(unichar))
+        elif unicodedb.isupper(unichar):
+            result[i] = unichr(unicodedb.tolower(unichar))
+        else:
+            result[i] = input[i]
+    return W_UnicodeObject(space, result)
+
+def _normalize_index(length, index):
+    if index < 0:
+        index += length
+        if index < 0:
+            index = 0
+    elif index > length:
+        index = length
+    return index
+
+def unicode_endswith__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+
+    substr = w_substr._value
+    substr_len = len(substr)
+    
+    if end - start < substr_len:
+        return space.w_False # substring is too long
+    start = end - substr_len
+    for i in range(substr_len):
+        if self[start + i] != substr[i]:
+            return space.w_False
+    return space.w_True
+
+def unicode_startswith__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+
+    substr = w_substr._value
+    substr_len = len(substr)
+    
+    if end - start < substr_len:
+        return space.w_False # substring is too long
+    
+    for i in range(substr_len):
+        if self[start + i] != substr[i]:
+            return space.w_False
+    return space.w_True
+
+def unicode_center__Unicode_ANY(space, w_self, w_width):
+    self = w_self._value
+    width = space.int_w(w_width)
+    padding = width - len(self)
+    if padding < 0:
+        return space.call_function(space.w_unicode, w_self)
+    leftpad = padding // 2 + (padding & width & 1)
+    result = [u' '] * width
+    for i in range(len(self)):
+        result[leftpad + i] = self[i]
+    return W_UnicodeObject(space, result)
+
+
+def unicode_ljust__Unicode_ANY(space, w_self, w_width):
+    self = w_self._value
+    width = space.int_w(w_width)
+    padding = width - len(self)
+    if padding < 0:
+        return space.call_function(space.w_unicode, w_self)
+    result = [u' '] * width
+    for i in range(len(self)):
+        result[i] = self[i]
+    return W_UnicodeObject(space, result)
+
+def unicode_rjust__Unicode_ANY(space, w_self, w_width):
+    self = w_self._value
+    width = space.int_w(w_width)
+    padding = width - len(self)
+    if padding < 0:
+        return space.call_function(space.w_unicode, w_self)
+    result = [u' '] * width
+    for i in range(len(self)):
+        result[padding + i] = self[i]
+    return W_UnicodeObject(space, result)
+    
+def unicode_zfill__Unicode_ANY(space, w_self, w_width):
+    self = w_self._value
+    width = space.int_w(w_width)
+    if len(self) == 0:
+        return W_UnicodeObject(space, [u'0'] * width)
+    padding = width - len(self)
+    if padding <= 0:
+        return space.call_function(space.w_unicode, w_self)
+    result = [u'0'] * width
+    for i in range(len(self)):
+        result[padding + i] = self[i]
+    # Move sign to first position
+    if self[0] in (u'+', u'-'):
+        result[0] = self[0]
+        result[padding] = u'0'
+    return W_UnicodeObject(space, result)
+
+def unicode_splitlines__Unicode_ANY(space, w_self, w_keepends):
+    self = w_self._value
+    keepends = 0
+    if space.int_w(w_keepends):
+        keepends = 1
+    if len(self) == 0:
+        return space.newlist([])
+    
+    start = 0
+    end = len(self)
+    pos = 0
+    lines = []
+    while pos < end:
+        if unicodedb.islinebreak(ord(self[pos])):
+            if (self[pos] == u'\r' and pos + 1 < end and
+                self[pos + 1] == u'\n'):
+                # Count CRLF as one linebreak
+                lines.append(W_UnicodeObject(space,
+                                             self[start:pos + keepends * 2]))
+                pos += 1
+            else:
+                lines.append(W_UnicodeObject(space,
+                                             self[start:pos + keepends]))
+            pos += 1
+            start = pos
+        else:
+            pos += 1
+    if not unicodedb.islinebreak(ord(self[end - 1])):
+        lines.append(W_UnicodeObject(space, self[start:]))
+    return space.newlist(lines)
+
+def unicode_find__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    return space.wrap(_find(self, substr, start, end))
+
+def unicode_rfind__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    return space.wrap(_rfind(self, substr, start, end))
+
+def unicode_index__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    index = _find(self, substr, start, end)
+    if index < 0:
+        raise OperationError(space.w_ValueError,
+                             space.wrap('substring not found'))
+    return space.wrap(index)
+
+def unicode_rindex__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    index = _rfind(self, substr, start, end)
+    if index < 0:
+        raise OperationError(space.w_ValueError,
+                             space.wrap('substring not found'))
+    return space.wrap(index)
+
+def unicode_count__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    count = 0
+    while start <= end:
+        index = _find(self, substr, start, end)
+        if index < 0:
+            break
+        start = index + 1
+        count += 1
+    return space.wrap(count)
+
+
+def unicode_split__Unicode_None_ANY(space, w_self, w_none, w_maxsplit):
+    self = w_self._value
+    maxsplit = space.int_w(w_maxsplit)
+    parts = []
+    if len(self) == 0:
+        return space.newlist([])
+    start = 0
+    end = len(self)
+    while maxsplit != 0 and start < end:
+        index = start
+        for index in range(start, end):
+            if _isspace(self[index]):
+                break
+        else:
+            break
+        parts.append(W_UnicodeObject(space, self[start:index]))
+        maxsplit -= 1
+        # Eat whitespace
+        for start in range(index + 1, end):
+            if not _isspace(self[start]):
+                break
+        else:
+            return space.newlist(parts)
+    parts.append(W_UnicodeObject(space, self[start:]))
+    return space.newlist(parts)
+
+
+def unicode_split__Unicode_Unicode_ANY(space, w_self, w_delim, w_maxsplit):
+    self = w_self._value
+    delim = w_delim._value
+    maxsplit = space.int_w(w_maxsplit)
+    delim_len = len(delim)
+    if delim_len == 0:
+        raise OperationError(space.w_ValueError,
+                             space.wrap('empty separator'))
+    parts = []
+    if len(self) == 0:
+        return space.newlist([])
+    start = 0
+    end = len(self)
+    while maxsplit != 0:
+        index = _find(self, delim, start, end)
+        if index < 0:
+            break
+        parts.append(W_UnicodeObject(space, self[start:index]))
+        start = index + delim_len
+        maxsplit -= 1
+    parts.append(W_UnicodeObject(space, self[start:]))
+    return space.newlist(parts)
+
+def _split(space, self, maxsplit):
+    if len(self) == 0:
+        return []
+    if maxsplit == 0:
+        return [W_UnicodeObject(space, self)]
+    index = 0
+    end = len(self)
+    parts = [W_UnicodeObject(space, [])]
+    maxsplit -= 1
+    while maxsplit != 0:
+        if index >= end:
+            break
+        parts.append(W_UnicodeObject(space, [self[index]]))
+        index += 1
+        maxsplit -= 1
+    parts.append(W_UnicodeObject(space, self[index:]))
+    return parts
+    
+def unicode_replace__Unicode_Unicode_Unicode_ANY(space, w_self, w_old,
+                                                 w_new, w_maxsplit):
+    if len(w_old._value):
+        w_parts = space.call_method(w_self, 'split', w_old, w_maxsplit)
+    else:
+        self = w_self._value
+        maxsplit = space.int_w(w_maxsplit)
+        w_parts = space.newlist(_split(space, self, maxsplit))
+    return space.call_method(w_new, 'join', w_parts)
+    
+
+'translate'
+app = gateway.applevel(r'''
+import sys
+
+def unicode_expandtabs__Unicode_ANY(self, tabsize):
+    parts = self.split(u'\t')
+    result = [ parts[0] ]
+    prevsize = 0
+    for ch in parts[0]:
+        prevsize += 1
+        if ch in (u"\n", u"\r"):
+            prevsize = 0
+    for i in range(1, len(parts)):
+        pad = tabsize - prevsize % tabsize
+        result.append(u' ' * pad)
+        nextpart = parts[i]
+        result.append(nextpart)
+        prevsize = 0
+        for ch in nextpart:
+            prevsize += 1
+            if ch in (u"\n", u"\r"):
+                prevsize = 0
+    return u''.join(result)
+
+def unicode_translate__Unicode_ANY(self, table):
+    result = []
+    for unichar in self:
+        try:
+            newval = table[ord(unichar)]
+        except KeyError:
+            result.append(unichar)
+        else:
+            if newval is None:
+                continue
+            elif isinstance(newval, int):
+                if newval < 0 or newval > sys.maxunicode:
+                    raise TypeError("character mapping must be in range(0x%x)"%(sys.maxunicode + 1,))
+                result.append(unichr(newval))
+            elif isinstance(newval, unicode):
+                result.append(newval)
+            else:
+                raise TypeError("character mapping must return integer, None or unicode")
+    return ''.join(result)
+                
+''')
+unicode_expandtabs__Unicode_ANY = app.interphook('unicode_expandtabs__Unicode_ANY')
+unicode_translate__Unicode_ANY = app.interphook('unicode_translate__Unicode_ANY')
+
 import unicodetype
 register_all(vars(), unicodetype)
 
@@ -301,15 +779,38 @@
     import stringtype
     W_UnicodeObject = W_UnicodeObject
     from pypy.objspace.std.stringobject import W_StringObject
-    def str_strip__String_Unicode(space, w_self, w_chars ):
+    def str_strip__String_Unicode(space, w_self, w_chars):
         return space.call_method(space.call_function(space.w_unicode, w_self),
                                  'strip', w_chars)
-    def str_lstrip__String_Unicode(space, w_self, w_chars ):
+    def str_lstrip__String_Unicode(space, w_self, w_chars):
         return space.call_method(space.call_function(space.w_unicode, w_self),
                                  'lstrip', w_chars)
         self = w_self._value
-    def str_rstrip__String_Unicode(space, w_self, w_chars ):
+    def str_rstrip__String_Unicode(space, w_self, w_chars):
         return space.call_method(space.call_function(space.w_unicode, w_self),
                                  'rstrip', w_chars)
+    def str_count__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'count', w_substr, w_start, w_end)
+    def str_find__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'find', w_substr, w_start, w_end)
+    def str_rfind__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'rfind', w_substr, w_start, w_end)
+    def str_index__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'index', w_substr, w_start, w_end)
+    def str_rindex__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'rindex', w_substr, w_start, w_end)
 
+    def str_replace__String_Unicode_Unicode_ANY(space, w_self, w_old, w_new, w_maxsplit):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'replace', w_old, w_new, w_maxsplit)
+
+    def str_split__String_Unicode_ANY(space, w_self, w_delim, w_maxsplit):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'split', w_delim, w_maxsplit)
+        
     register_all(vars(), stringtype)

Modified: pypy/branch/non-fake-unicode/pypy/objspace/std/unicodetype.py
==============================================================================
--- pypy/branch/non-fake-unicode/pypy/objspace/std/unicodetype.py	(original)
+++ pypy/branch/non-fake-unicode/pypy/objspace/std/unicodetype.py	Tue May 17 17:56:58 2005
@@ -8,7 +8,7 @@
 unicode_center     = MultiMethod('center', 2, )
 unicode_count      = MultiMethod('count', 4, defaults=(0, maxint))      
 unicode_encode     = MultiMethod('encode', 3, defaults=(None, None))
-unicode_endswith   = MultiMethod('endswith', 2) #[optional arguments not supported now]
+unicode_endswith   = MultiMethod('endswith', 4, defaults=(0,maxint))
 unicode_expandtabs = MultiMethod('expandtabs', 2, defaults=(8,))
 unicode_find       = MultiMethod('find', 4, defaults=(0, maxint))
 unicode_index      = MultiMethod('index', 4, defaults=(0, maxint))
@@ -32,47 +32,78 @@
 unicode_rstrip     = MultiMethod('rstrip', 2, defaults=(None,))
 unicode_split      = MultiMethod('split', 3, defaults=(None,-1))
 unicode_splitlines = MultiMethod('splitlines', 2, defaults=(0,))
-unicode_startswith = MultiMethod('startswith', 3, defaults=(0,))
+unicode_startswith = MultiMethod('startswith', 4, defaults=(0,maxint))
 unicode_strip      = MultiMethod('strip',  2, defaults=(None,))
 unicode_swapcase   = MultiMethod('swapcase', 1)
 unicode_title      = MultiMethod('title', 1)
-unicode_translate  = MultiMethod('translate', 3, defaults=('',))
+unicode_translate  = MultiMethod('translate', 2)
 unicode_upper      = MultiMethod('upper', 1)
 unicode_zfill      = MultiMethod('zfill', 2)
-
+unicode_getslice   = MultiMethod('__getslice__', 3)
 # ____________________________________________________________
+
+app = gateway.applevel('''
+import codecs, sys
+
+def unicode_from_encoded_object(obj, encoding, errors):
+    # Fix later for buffer
+    if type(obj).__name__ == 'buffer':
+        obj = obj.buf
+    if encoding is None:
+        encoding = sys.getdefaultencoding()
+    decoder = codecs.getdecoder(encoding)
+    if errors is None:
+        retval, lenght = decoder(obj)
+    else:
+        retval, length = decoder(obj, errors)
+    if not isinstance(retval, unicode):
+        raise TypeError("decoder did not return an unicode object (type=%s)" %
+                        type(retval).__name__)
+    return retval
+
+def unicode_from_object(obj):
+    if isinstance(obj, str):
+        res = obj
+    else:
+        try:
+            unicode_method = obj.__unicode__
+        except AttributeError:
+            res = str(obj)
+        else:
+            res = unicode_method()
+    if isinstance(res, unicode):
+        return res
+    return unicode_from_encoded_object(res, None, "strict")
+    
+''')
+unicode_from_object = app.interphook('unicode_from_object')
+unicode_from_encoded_object = app.interphook('unicode_from_encoded_object')
+
+
 def descr__new__(space, w_unicodetype, w_obj=None, w_encoding=None, w_errors=None):
     from pypy.objspace.std.unicodeobject import W_UnicodeObject
     w_obj_type = space.type(w_obj)
     
     if space.is_w(w_obj_type, space.w_unicode):
+        if (not space.is_w(w_encoding, space.w_None) or
+            not space.is_w(w_errors, space.w_None)):
+            raise OperationError(space.w_TypeError,
+                                 space.wrap('decoding Unicode is not supported'))
         if space.is_w(w_unicodetype, space.w_unicode):
             return w_obj
-        value = w_obj._value
+        w_value = w_obj
     elif space.is_w(w_obj, space.w_None):
-        value = []
-    elif space.is_true(space.isinstance(w_obj, space.w_unicode)):
-        value = w_obj._value
-    elif space.is_w(w_obj_type, space.w_str):
-        try:
-            if space.is_w(w_encoding, space.w_None):
-                value = [ u for u in unicode(space.str_w(w_obj)) ]
-            elif space.is_w(w_errors, space.w_None): 
-                value = [ u for u in unicode(space.str_w(w_obj),
-                                             space.str_w(w_encoding)) ]
-            else:
-                value = [u for u in unicode(space.str_w(w_obj),
-                                            space.str_w(w_encoding),
-                                            space.str_w(w_errors)) ]
-        except UnicodeDecodeError, e:
-            raise OperationError(space.w_UnicodeDecodeError,
-                                 space.wrap(e.reason))
+        w_value = W_UnicodeObject(space, [])
+    elif (space.is_w(w_encoding, space.w_None) and
+          space.is_w(w_errors, space.w_None)):
+        if space.is_true(space.isinstance(w_obj, space.w_unicode)):
+            w_value = w_obj
+        else:
+            w_value = unicode_from_object(space, w_obj)
     else:
-        # try with __unicode__
-        raise OperationError(space.w_ValueError,
-                             space.wrap('Can not create unicode from other than strings'%w_obj_type))
+        w_value = unicode_from_encoded_object(space, w_obj, w_encoding, w_errors)
     w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype)
-    w_newobj.__init__(space, value)
+    w_newobj.__init__(space, w_value._value)
     return w_newobj
 
 # ____________________________________________________________



More information about the Pypy-commit mailing list