[pypy-svn] r12735 - in pypy/dist/pypy: documentation interpreter lib module/__builtin__ module/unicodedata objspace/std objspace/std/test

ac at codespeak.net ac at codespeak.net
Mon May 23 13:05:59 CEST 2005


Author: ac
Date: Mon May 23 13:05:59 2005
New Revision: 12735

Added:
   pypy/dist/pypy/module/unicodedata/   (props changed)
      - copied from r12617, pypy/branch/non-fake-unicode/pypy/module/unicodedata/
Modified:
   pypy/dist/pypy/documentation/objspace.txt
   pypy/dist/pypy/interpreter/baseobjspace.py
   pypy/dist/pypy/lib/_formatting.py
   pypy/dist/pypy/module/__builtin__/__init__.py
   pypy/dist/pypy/module/__builtin__/app_misc.py
   pypy/dist/pypy/module/__builtin__/compiling.py
   pypy/dist/pypy/module/__builtin__/operation.py
   pypy/dist/pypy/module/unicodedata/__init__.py   (contents, props changed)
   pypy/dist/pypy/module/unicodedata/functions.py   (props changed)
   pypy/dist/pypy/module/unicodedata/generate_unicodedb.py   (props changed)
   pypy/dist/pypy/module/unicodedata/unicodedb.py   (props changed)
   pypy/dist/pypy/objspace/std/floattype.py
   pypy/dist/pypy/objspace/std/inttype.py
   pypy/dist/pypy/objspace/std/longtype.py
   pypy/dist/pypy/objspace/std/objspace.py
   pypy/dist/pypy/objspace/std/test/test_unicodestring.py
   pypy/dist/pypy/objspace/std/unicodeobject.py
   pypy/dist/pypy/objspace/std/unicodetype.py
Log:
Merge the 'non-fake-unicode' branch.



Modified: pypy/dist/pypy/documentation/objspace.txt
==============================================================================
--- pypy/dist/pypy/documentation/objspace.txt	(original)
+++ pypy/dist/pypy/documentation/objspace.txt	Mon May 23 13:05:59 2005
@@ -93,6 +93,9 @@
 **newstring(asciilist):**
   Creates a string from a list of wrapped integers.
 
+**newunicode(codelist):**
+  Creates a unicode string from a list of wrapped integers.
+
 Conversions from Application Level to Interpreter Level
 ----------------------------------------------------------
 

Modified: pypy/dist/pypy/interpreter/baseobjspace.py
==============================================================================
--- pypy/dist/pypy/interpreter/baseobjspace.py	(original)
+++ pypy/dist/pypy/interpreter/baseobjspace.py	Mon May 23 13:05:59 2005
@@ -129,6 +129,7 @@
         w_builtin = self.wrap(self.builtin)
         self.setitem(w_modules, w_name, w_builtin) 
         self.setitem(self.builtin.w_dict, self.wrap('__builtins__'), w_builtin) 
+        self.setbuiltinmodule('unicodedata')
 
         # XXX we need to resolve unwrapping issues to 
         #     make this the default _sre module
@@ -532,6 +533,7 @@
 #                  newtuple([w_1, w_2,...]) -> w_tuple
 #                   newlist([w_1, w_2,...]) -> w_list
 #                 newstring([w_1, w_2,...]) -> w_string from ascii numbers (bytes)
+#                newunicode([w_1, w_2,...]) -> w_unicode from numbers
 #            newdict([(w_key,w_value),...]) -> w_dict
 #           newslice(w_start,w_stop,w_step) -> w_slice
 #              call_args(w_obj,Arguments()) -> w_result
@@ -549,6 +551,7 @@
     'newtuple',
     'newlist',
     'newstring',
+    'newunicode',
     'newdict',
     'newslice',
     'call_args'

Modified: pypy/dist/pypy/lib/_formatting.py
==============================================================================
--- pypy/dist/pypy/lib/_formatting.py	(original)
+++ pypy/dist/pypy/lib/_formatting.py	Mon May 23 13:05:59 2005
@@ -6,6 +6,7 @@
 # (1) rounding isn't always right (see comments in _float_formatting).
 # (2) something goes wrong in the f_alt case of %g handling.
 # (3) it's really, really slow.
+import sys
 
 class _Flags(object):
     def __repr__(self):
@@ -103,6 +104,9 @@
     return (c, flags, width, prec, value)
 
 
+class NeedUnicodeFormattingError(Exception):
+    pass
+
 class Formatter(object):
     def __init__(self, char, flags, width, prec, value):
         self.char = char
@@ -314,6 +318,9 @@
             v = self.value
             if len(v) != 1:
                 raise TypeError, "%c requires int or char"
+        
+        elif isinstance(self.value, unicode):
+            raise NeedUnicodeFormattingError
         else:
             i = maybe_int(self.value)
             if not 0 <= i <= 255:
@@ -323,8 +330,15 @@
         self.prec = None
         return self.std_wp(v)
 
+class StringFormatter(Formatter):
+    def format(self):
+        if isinstance(self.value, unicode):
+            raise NeedUnicodeFormattingError
+        return self.std_wp(str(self.value))
+
 
-format_registry = {
+
+str_format_registry = {
     'd':IntFormatter,
     'i':IntFormatter,
     'o':OctFormatter,
@@ -338,13 +352,61 @@
     'g':FloatGFormatter,
     'G':FloatGFormatter,
     'c':CharFormatter,
-    's':funcFormatter(str),
+    's':StringFormatter,
     'r':funcFormatter(repr),
     # this *can* get accessed, by e.g. '%()4%'%{'':1}.
     # The usual %% case has to be handled specially as it
     # doesn't consume a value.
     '%':funcFormatter(lambda x:'%'),
     }
+    
+class UnicodeStringFormatter(Formatter):
+    def format(self):
+        if isinstance(self.value, unicode):
+            return self.std_wp(self.value)
+        return self.std_wp(str(self.value))
+
+class UnicodeCharFormatter(Formatter):
+    def format(self):
+        if isinstance(self.value, unicode):
+            v = self.value
+            if len(v) != 1:
+                raise TypeError, "%c requires int or unicode char"
+        elif isinstance(self.value, str):
+            v = unicode(self.value)
+            if len(v) != 1:
+                raise TypeError, "%c requires int or unicode char"
+        else:
+            i = maybe_int(self.value)
+            if not 0 <= i <= sys.maxunicode:
+                raise OverflowError("OverflowError: unsigned byte "
+                                    "integer is greater than maximum")
+            v = unichr(i)
+        self.prec = None
+        return self.std_wp(v)
+
+unicode_format_registry = {
+    u'd':IntFormatter,
+    u'i':IntFormatter,
+    u'o':OctFormatter,
+    u'u':IntFormatter,
+    u'x':HexFormatter,
+    u'X':HexFormatter,
+    u'e':FloatEFormatter,
+    u'E':FloatEFormatter,
+    u'f':FloatFFormatter,
+    u'F':FloatFFormatter,
+    u'g':FloatGFormatter,
+    u'G':FloatGFormatter,
+    u'c':UnicodeCharFormatter,
+    u's':UnicodeStringFormatter,
+    u'r':funcFormatter(repr),
+    # this *can* get accessed, by e.g. '%()4%'%{'':1}.
+    # The usual %% case has to be handled specially as it
+    # doesn't consume a value.
+    u'%':funcFormatter(lambda x:u'%'),
+    }
+    
 
 del funcFormatter # don't irritate flow space
 
@@ -375,7 +437,12 @@
             return self.fmt[i:j]
 
 
-def format(fmt, values, valuedict=None):
+def format(fmt, values, valuedict=None, do_unicode=False):
+    if do_unicode:
+        format_registry = unicode_format_registry
+    else:
+        format_registry = str_format_registry
+        
     fmtiter = FmtIter(fmt)
     valueiter = iter(values)
     r = []
@@ -394,7 +461,20 @@
                 # so let's be explicit about the args:
                 # r.append(f(*t).format())
                 char, flags, width, prec, value = t
-                r.append(f(char, flags, width, prec, value).format())
+                try:
+                    r.append(f(char, flags, width, prec, value).format())
+                except NeedUnicodeFormattingError:
+                    # Switch to using the unicode formatters and retry.
+                    do_unicode = True
+                    format_registry = unicode_format_registry
+                    try:
+                        f = format_registry[t[0]]
+                    except KeyError:
+                        raise ValueError("unsupported format character "
+                                         "'%s' (0x%x) at index %d"
+                                         %(t[0], ord(t[0]), fmtiter.i-1))
+                    r.append(f(char, flags, width, prec, value).format())
+ 
             else:
                 # efficiency hack:
                 r.append(c + fmtiter.skip_to_fmt())
@@ -408,5 +488,7 @@
         if valuedict is None:
             raise TypeError('not all arguments converted '
                             'during string formatting')
+    if do_unicode:
+        return u''.join(r)
     return ''.join(r)
 

Modified: pypy/dist/pypy/module/__builtin__/__init__.py
==============================================================================
--- pypy/dist/pypy/module/__builtin__/__init__.py	(original)
+++ pypy/dist/pypy/module/__builtin__/__init__.py	Mon May 23 13:05:59 2005
@@ -47,7 +47,6 @@
         'complex'       : 'app_complex.complex',
 
         'intern'        : 'app_misc.intern',
-        'unichr'        : 'app_misc.unichr',
         'buffer'        : 'app_buffer.buffer',
         'reload'        : 'app_misc.reload',
     }
@@ -64,7 +63,7 @@
         'object'        : '(space.w_object)',
         'file'          : '(space.wrap(file))',
         'open'          : '(space.wrap(file))',
-        'unicode'       : '(space.wrap(unicode))',  # XXX faked
+        'unicode'       : '(space.w_unicode)',
 
         # old-style classes dummy support
         '_classobj'     : 'space.w_classobj',
@@ -76,6 +75,7 @@
         # interp-level function definitions
         'abs'           : 'operation.abs',
         'chr'           : 'operation.chr',
+        'unichr'        : 'operation.unichr',
         'len'           : 'operation.len',
         'ord'           : 'operation.ord',
         'pow'           : 'operation.pow',

Modified: pypy/dist/pypy/module/__builtin__/app_misc.py
==============================================================================
--- pypy/dist/pypy/module/__builtin__/app_misc.py	(original)
+++ pypy/dist/pypy/module/__builtin__/app_misc.py	Mon May 23 13:05:59 2005
@@ -11,13 +11,6 @@
     return _stringtable.setdefault(s,s)
 
 
-def unichr(code):
-    import sys
-    if (code < 0 or code > sys.maxunicode):
-        raise ValueError('unichr() arg not in range(%#x)'%(sys.maxunicode + 1))
-    return unicode('\\U%08x' %(code), 'unicode-escape')
-
-
 def reload(module):
     import imp, sys, errno
 

Modified: pypy/dist/pypy/module/__builtin__/compiling.py
==============================================================================
--- pypy/dist/pypy/module/__builtin__/compiling.py	(original)
+++ pypy/dist/pypy/module/__builtin__/compiling.py	Mon May 23 13:05:59 2005
@@ -9,7 +9,7 @@
 
 def compile(space, w_source, filename, mode, flags=0, dont_inherit=0):
     if space.is_true(space.isinstance(w_source, space.w_unicode)):
-        str_ = space.unwrap(w_source) # xxx generic unwrap
+        str_ = u''.join(w_source._value) # Bad exposing of unicode internals
     else:
         str_ = space.str_w(w_source)
 

Modified: pypy/dist/pypy/module/__builtin__/operation.py
==============================================================================
--- pypy/dist/pypy/module/__builtin__/operation.py	(original)
+++ pypy/dist/pypy/module/__builtin__/operation.py	Mon May 23 13:05:59 2005
@@ -15,6 +15,9 @@
     w_character = space.newstring([w_ascii])
     return w_character
 
+def unichr(space, w_code):
+    return space.newunicode([w_code])
+
 def len(space, w_obj):
     "len(object) -> integer\n\nReturn the number of items of a sequence or mapping."
     return space.len(w_obj)

Modified: pypy/dist/pypy/module/unicodedata/__init__.py
==============================================================================
--- pypy/branch/non-fake-unicode/pypy/module/unicodedata/__init__.py	(original)
+++ pypy/dist/pypy/module/unicodedata/__init__.py	Mon May 23 13:05:59 2005
@@ -1,6 +1,6 @@
-from pypy.interpreter.lazymodule import LazyModule
+from pypy.interpreter.mixedmodule import MixedModule
     
-class Module(LazyModule):
+class Module(MixedModule):
     appleveldefs = {
     }
     interpleveldefs = {

Modified: pypy/dist/pypy/objspace/std/floattype.py
==============================================================================
--- pypy/dist/pypy/objspace/std/floattype.py	(original)
+++ pypy/dist/pypy/objspace/std/floattype.py	Mon May 23 13:05:59 2005
@@ -1,5 +1,6 @@
 from pypy.objspace.std.stdtypedef import *
 from pypy.interpreter.error import OperationError
+from pypy.objspace.std.strutil import ParseStringError
 
 def descr__new__(space, w_floattype, w_x=0.0):
     from pypy.objspace.std.floatobject import W_FloatObject
@@ -10,6 +11,14 @@
         except ValueError, e:
             raise OperationError(space.w_ValueError,
                                  space.wrap(str(e)))
+    elif space.is_true(space.isinstance(w_value, space.w_unicode)):
+        try:
+            # XXX can produce unwrapped long
+            from unicodeobject import unicode_to_decimal_w
+            value = float(unicode_to_decimal_w(space, w_value))
+        except ParseStringError, e:
+            raise OperationError(space.w_ValueError,
+                                 space.wrap(e.msg))
     else:
         w_obj = space.float(w_value)
         if space.is_true(space.is_(w_floattype, space.w_float)):

Modified: pypy/dist/pypy/objspace/std/inttype.py
==============================================================================
--- pypy/dist/pypy/objspace/std/inttype.py	(original)
+++ pypy/dist/pypy/objspace/std/inttype.py	Mon May 23 13:05:59 2005
@@ -28,6 +28,15 @@
                                      space.wrap(e.msg))
             except ParseStringOverflowError, e:
                  w_longval = retry_to_w_long(space, e.parser)                
+        elif space.is_true(space.isinstance(w_value, space.w_unicode)):
+            try:
+                from unicodeobject import unicode_to_decimal_w
+                value = string_to_int(space, unicode_to_decimal_w(space, w_value))
+            except ParseStringError, e:
+                raise OperationError(space.w_ValueError,
+                                     space.wrap(e.msg))
+            except ParseStringOverflowError, e:
+                 w_longval = retry_to_w_long(space, e.parser)                
         else:
             # otherwise, use the __int__() method
             w_obj = space.int(w_value)

Modified: pypy/dist/pypy/objspace/std/longtype.py
==============================================================================
--- pypy/dist/pypy/objspace/std/longtype.py	(original)
+++ pypy/dist/pypy/objspace/std/longtype.py	Mon May 23 13:05:59 2005
@@ -18,6 +18,14 @@
             except ParseStringError, e:
                 raise OperationError(space.w_ValueError,
                                      space.wrap(e.msg))
+        elif space.is_true(space.isinstance(w_value, space.w_unicode)):
+            try:
+                # XXX can produce unwrapped long
+                from unicodeobject import unicode_to_decimal_w
+                value = string_to_long(unicode_to_decimal_w(space, w_value))
+            except ParseStringError, e:
+                raise OperationError(space.w_ValueError,
+                                     space.wrap(e.msg))
         else:
             # otherwise, use the __long__() method
             w_obj = space.long(w_value)

Modified: pypy/dist/pypy/objspace/std/objspace.py
==============================================================================
--- pypy/dist/pypy/objspace/std/objspace.py	(original)
+++ pypy/dist/pypy/objspace/std/objspace.py	Mon May 23 13:05:59 2005
@@ -199,6 +199,8 @@
             return W_IntObject(self, x)
         if isinstance(x, str):
             return W_StringObject(self, x)
+        if isinstance(x, unicode):
+            return W_UnicodeObject(self, [u for u in x])
         if isinstance(x, dict):
             items_w = [(self.wrap(k), self.wrap(v)) for (k, v) in x.iteritems()]
             return W_DictObject(self, items_w)
@@ -283,6 +285,14 @@
                                  self.wrap("character code not in range(256)"))
         return W_StringObject(self, ''.join(chars))
 
+    def newunicode(self, chars_w):
+        try:
+            chars = [unichr(self.int_w(w_c)) for w_c in chars_w]
+        except ValueError, e:  # unichr(out-of-range)
+            raise OperationError(self.w_ValueError,
+                                 self.wrap("character code not in range(0x110000)"))
+        return W_UnicodeObject(self, chars)
+
     def newseqiter(self, w_obj):
         return W_SeqIterObject(self, w_obj)
 

Modified: pypy/dist/pypy/objspace/std/test/test_unicodestring.py
==============================================================================
--- pypy/dist/pypy/objspace/std/test/test_unicodestring.py	(original)
+++ pypy/dist/pypy/objspace/std/test/test_unicodestring.py	Mon May 23 13:05:59 2005
@@ -38,3 +38,29 @@
     def test_contains(self):
         assert u'a' in 'abc'
         assert 'a' in u'abc'
+
+    def test_splitlines(self):
+        assert u''.splitlines() == []
+        assert u''.splitlines(1) == []
+        assert u'\n'.splitlines() == [u'']
+        assert u'a'.splitlines() == [u'a']
+        assert u'one\ntwo'.splitlines() == [u'one', u'two']
+        assert u'\ntwo\nthree'.splitlines() == [u'', u'two', u'three']
+        assert u'\n\n'.splitlines() == [u'', u'']
+        assert u'a\nb\nc'.splitlines(1) == [u'a\n', u'b\n', u'c']
+        assert u'\na\nb\n'.splitlines(1) == [u'\n', u'a\n', u'b\n']
+
+    def test_zfill(self):
+        assert u'123'.zfill(6) == u'000123'
+        assert u'123'.zfill(2) == u'123'
+        assert u'123'.zfill(6) == u'000123'
+        assert u'+123'.zfill(2) == u'+123'
+        assert u'+123'.zfill(4) == u'+123'
+        assert u'+123'.zfill(6) == u'+00123'
+
+    def test_split(self):
+        assert (u'this is the split function'.split() ==
+                [u'this', u'is', u'the', u'split', u'function'])
+        assert (u'this!is!the!split!function'.split('!') ==
+                [u'this', u'is', u'the', u'split', u'function'])
+    

Modified: pypy/dist/pypy/objspace/std/unicodeobject.py
==============================================================================
--- pypy/dist/pypy/objspace/std/unicodeobject.py	(original)
+++ pypy/dist/pypy/objspace/std/unicodeobject.py	Mon May 23 13:05:59 2005
@@ -1,129 +1,824 @@
 from pypy.objspace.std.objspace import *
-from pypy.objspace.std.fake import fake_type, wrap_exception
+from pypy.interpreter import gateway
+from pypy.objspace.std.fake import wrap_exception
 from pypy.objspace.std.stringobject import W_StringObject
-from pypy.objspace.std.strutil import string_to_w_long, ParseStringError
+from pypy.objspace.std.noneobject import W_NoneObject
+from pypy.objspace.std.sliceobject import W_SliceObject
+from pypy.objspace.std import slicetype
+from pypy.objspace.std.strutil import string_to_int, string_to_long, ParseStringError
+from pypy.rpython.rarithmetic import intmask
+from pypy.module.unicodedata import unicodedb
+
+class W_UnicodeObject(W_Object):
+    from pypy.objspace.std.unicodetype import unicode_typedef as typedef
+
+    def __init__(w_self, space, unicodechars):
+        W_Object.__init__(w_self, space)
+        w_self._value = unicodechars
+        if len(unicodechars) == 0:
+            w_self.w_hash = space.wrap(0)
+        else:
+            w_self.w_hash = None
+    def __repr__(w_self):
+        """ representation for debugging purposes """
+        return "%s(%r)" % (w_self.__class__.__name__, w_self._value)
 
-W_UnicodeObject = fake_type(unicode)
+registerimplementation(W_UnicodeObject)
 
 # Helper for converting int/long
-import unicodedata
 def unicode_to_decimal_w(space, w_unistr):
-    result = []
-    for uchr in space.unwrap(w_unistr):
-        if uchr.isspace():
-            result.append(' ')
+    unistr = w_unistr._value
+    result = ['\0'] * len(unistr)
+    digits = [ '0', '1', '2', '3', '4',
+               '5', '6', '7', '8', '9']
+    for i in xrange(len(unistr)):
+        uchr = ord(unistr[i])
+        if unicodedb.isspace(uchr):
+            result[i] = ' '
             continue
         try:
-            result.append(chr(ord('0') + unicodedata.decimal(uchr)))
-            continue
-        except ValueError:
-            ch = ord(uchr)
-            if 0 < ch < 256:
-                result.append(chr(ch))
-                continue
-        raise OperationError(space.w_UnicodeEncodeError, space.wrap('invalid decimal Unicode string'))
+            result[i] = digits[unicodedb.decimal(uchr)]
+        except KeyError:
+            if 0 < uchr < 256:
+                result[i] = chr(uchr)
+            else:
+                raise OperationError(space.w_UnicodeEncodeError, space.wrap('invalid decimal Unicode string'))
     return ''.join(result)
 
 # string-to-unicode delegation
 def delegate_String2Unicode(w_str):
     space = w_str.space
-    return W_UnicodeObject(space, unicode(space.str_w(w_str)))
+    return space.call_function(space.w_unicode, w_str)
 
 def str_w__Unicode(space, w_uni):
-    return space.str_w(space.call_method(w_uni, 'encode'))
+    return space.str_w(space.str(w_uni))
 
-def eq__Unicode_ANY(space, w_uni, w_other):
-    try:
-        return space.newbool(space.unwrap(w_uni) == space.unwrap(w_other))
-    except:
-        wrap_exception(space)
+def repr__Unicode(space, w_uni):
+    return space.wrap(repr(u''.join(w_uni._value)))
 
-def ne__Unicode_ANY(space, w_uni, w_other):
-    try:
-        return space.newbool(space.unwrap(w_uni) != space.unwrap(w_other))
-    except:
-        wrap_exception(space)
+def str__Unicode(space, w_uni):
+    return space.call_method(w_uni, 'encode')
 
+def cmp__Unicode_Unicode(space, w_left, w_right):
+    left = w_left._value
+    right = w_right._value
+    for i in range(min(len(left), len(right))):
+        test = ord(left[i]) - ord(right[i])
+        if test < 0:
+            return space.wrap(-1)
+        if test > 0:
+            return space.wrap(1)
+            
+    test = len(left) - len(right)
+    if test < 0:
+        return space.wrap(-1)
+    if test > 0:
+        return space.wrap(1)
+    return space.wrap(0)
 
-def lt__Unicode_ANY(space, w_uni, w_other):
+def cmp__Unicode_ANY(space, w_left, w_right):
     try:
-        return space.newbool(space.unwrap(w_uni) < space.unwrap(w_other))
+        w_right = space.call_function(space.w_unicode, w_right)
     except:
-        wrap_exception(space)
+        return space.wrap(1)
+    return space.cmp(w_left, w_right)
 
-def gt__Unicode_ANY(space, w_uni, w_other):
-    try:
-        return space.newbool(space.unwrap(w_uni) > space.unwrap(w_other))
-    except:
-        wrap_exception(space)
+def ord__Unicode(space, w_uni):
+    if len(w_uni._value) != 1:
+        raise OperationError(space.w_TypeError, space.wrap('ord() expected a character'))
+    return space.wrap(ord(w_uni._value[0]))
 
-def le__Unicode_ANY(space, w_uni, w_other):
-    try:
-        return space.newbool(space.unwrap(w_uni) <= space.unwrap(w_other))
-    except:
-        wrap_exception(space)
+def add__Unicode_Unicode(space, w_left, w_right):
+    left = w_left._value
+    right = w_right._value
+    leftlen = len(left)
+    rightlen = len(right)
+    result = [u'\0'] * (leftlen + rightlen)
+    for i in range(leftlen):
+        result[i] = left[i]
+    for i in range(rightlen):
+        result[i + leftlen] = right[i]
+    return W_UnicodeObject(space, result)
+
+def add__String_Unicode(space, w_left, w_right):
+    return space.add(space.call_function(space.w_unicode, w_left) , w_right)
+
+def add__Unicode_String(space, w_left, w_right):
+    return space.add(w_left, space.call_function(space.w_unicode, w_right))
+
+def contains__String_Unicode(space, w_container, w_item):
+    return space.contains(space.call_function(space.w_unicode, w_container), w_item )
+
+def _find(self, sub, start, end):
+    if len(sub) == 0:
+        return start
+    if start >= end:
+        return -1
+    for i in range(start, end - len(sub) + 1):
+        for j in range(len(sub)):
+            if self[i + j]  != sub[j]:
+                break
+        else:
+            return i
+    return -1
+
+def _rfind(self, sub, start, end):
+    if len(sub) == 0:
+        return end
+    if end - start < len(sub):
+        return -1
+    for i in range(end - len(sub), start - 1, -1):
+        for j in range(len(sub)):
+            if self[i + j]  != sub[j]:
+                break
+        else:
+            return i
+    return -1
+
+def contains__Unicode_Unicode(space, w_container, w_item):
+    item = w_item._value
+    container = w_container._value
+    return space.newbool(_find(container, item, 0, len(container)) >= 0)
+
+def unicode_join__Unicode_ANY(space, w_self, w_list):
+    list = space.unpackiterable(w_list)
+    delim = w_self._value
+    totlen = 0
+    if len(list) == 0:
+        return W_UnicodeObject(space, [])
+    for i in range(len(list)):
+        item = list[i]
+        if space.is_true(space.isinstance(item, space.w_unicode)):
+            list[i] = item._value
+        elif space.is_true(space.isinstance(item, space.w_str)):
+            list[i] = space.call_function(space.w_unicode, item)._value
+        else:
+            w_msg = space.mod(space.wrap('sequence item %d: expected string or Unicode'),
+                              space.wrap(i))
+            raise OperationError(space.w_TypeError, w_msg)
+        totlen += len(list[i])
+    totlen += len(delim) * (len(list) - 1)
+    if len(list) == 1:
+        return W_UnicodeObject(space, list[0])
+    # Allocate result
+    result = [u'\0'] * totlen
+    first = list[0]
+    for i in range(len(first)):
+        result[i] = first[i]
+    offset = len(first)
+    for i in range(1, len(list)):
+        item = list[i]
+        # Add delimiter
+        for j in range(len(delim)):
+            result[offset + j] = delim[j]
+        offset += len(delim)
+        # Add item from list
+        for j in range(len(item)):
+            result[offset + j] = item[j]
+        offset += len(item)
+    return W_UnicodeObject(space, result)
+
+def unicode_encode__Unicode_String_String(space, w_self, w_encoding, w_errors):
+    try:
+        return space.wrap(u''.join(w_self._value).encode(space.str_w(w_encoding), space.str_w(w_errors)))
+    except:
+        wrap_exception(space)
+
+def unicode_encode__Unicode_String_None(space, w_self, w_encoding, w_none):
+    try:
+        return space.wrap(u''.join(w_self._value).encode(space.str_w(w_encoding)))
+    except:
+        wrap_exception(space)
+
+def unicode_encode__Unicode_None_None(space, w_self, w_encoding, w_errors):
+    try:
+        return space.wrap(u''.join(w_self._value).encode())
+    except:
+        wrap_exception(space)
+
+def hash__Unicode(space, w_uni):
+    if w_uni.w_hash is None:
+        chars = w_uni._value
+        x = ord(chars[0]) << 7
+        for c in chars:
+            x = intmask((1000003 * x) ^ ord(c))
+        h = intmask(x ^ len(chars))
+        if h == -1:
+            h = -2
+        w_uni.w_hash = space.wrap(h)
+    return w_uni.w_hash
+
+def len__Unicode(space, w_uni):
+    return space.wrap(len(w_uni._value))
+
+def getitem__Unicode_ANY(space, w_uni, w_index):
+    ival = space.int_w(w_index)
+    uni = w_uni._value
+    ulen = len(uni)
+    if ival < 0:
+        ival += ulen
+    if ival < 0 or ival >= ulen:
+        exc = space.call_function(space.w_IndexError,
+                                  space.wrap("unicode index out of range"))
+        raise OperationError(space.w_IndexError, exc)
+    return W_UnicodeObject(space, [uni[ival]])
+
+def getitem__Unicode_Slice(space, w_uni, w_slice):
+    uni = w_uni._value
+    length = len(uni)
+    start, stop, step, sl = slicetype.indices4(space, w_slice, length)
+    r = [uni[start + i*step] for i in range(sl)]
+    return W_UnicodeObject(space, r)
+
+def unicode_getslice__Unicode_ANY_ANY(space, w_uni, w_start, w_end):
+    w_slice = space.call_function(space.w_slice, w_start, w_end)
+    uni = w_uni._value
+    length = len(uni)
+    start, stop, step, sl = slicetype.indices4(space, w_slice, length)
+    return W_UnicodeObject(space, uni[start:stop])
+
+def mul__Unicode_ANY(space, w_uni, w_times):
+    chars = w_uni._value
+    charlen = len(chars)
+    times = space.int_w(w_times)
+    if times <= 0 or charlen == 0:
+        return W_UnicodeObject(space, [])
+    if times == 1:
+        return space.call_function(space.w_unicode, w_uni)
+    if charlen == 1:
+        return W_UnicodeObject(space, [w_uni._value[0]] * times)
+
+    try:
+        result = [u'\0'] * (charlen * times)
+    except OverflowError:
+        raise OperationError(space.w_OverflowError, space.wrap('repeated string is too long'))
+    for i in range(times):
+        offset = i * charlen
+        for j in range(charlen):
+            result[offset + j] = chars[j]
+    return W_UnicodeObject(space, result)
+
+def mul__ANY_Unicode(space, w_times, w_uni):
+    return space.mul(w_uni, w_times)
+
+def _isspace(uchar):
+    return unicodedb.isspace(ord(uchar))
+
+def unicode_isspace__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isspace(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_isalpha__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isalpha(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_isalnum__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not (unicodedb.isalpha(ord(uchar)) or
+                unicodedb.isnumeric(ord(uchar))):
+            return space.w_False
+    return space.w_True
+
+def unicode_isdecimal__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isdecimal(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_isdigit__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isdigit(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_isnumeric__Unicode(space, w_unicode):
+    if len(w_unicode._value) == 0:
+        return space.w_False
+    for uchar in w_unicode._value:
+        if not unicodedb.isnumeric(ord(uchar)):
+            return space.w_False
+    return space.w_True
+
+def unicode_islower__Unicode(space, w_unicode):
+    cased = False
+    for uchar in w_unicode._value:
+        if (unicodedb.isupper(ord(uchar)) or
+            unicodedb.istitle(ord(uchar))):
+            return space.w_False
+        if not cased and unicodedb.islower(ord(uchar)):
+            cased = True
+    return space.newbool(cased)
+
+def unicode_isupper__Unicode(space, w_unicode):
+    cased = False
+    for uchar in w_unicode._value:
+        if (unicodedb.islower(ord(uchar)) or
+            unicodedb.istitle(ord(uchar))):
+            return space.w_False
+        if not cased and unicodedb.isupper(ord(uchar)):
+            cased = True
+    return space.newbool(cased)
+
+def unicode_istitle__Unicode(space, w_unicode):
+    cased = False
+    previous_is_cased = False
+    for uchar in w_unicode._value:
+        if (unicodedb.isupper(ord(uchar)) or
+            unicodedb.istitle(ord(uchar))):
+            if previous_is_cased:
+                return space.w_False
+            previous_is_cased = cased = True
+        elif unicodedb.islower(ord(uchar)):
+            if not previous_is_cased:
+                return space.w_False
+            previous_is_cased = cased = True
+        else:
+            previous_is_cased = False
+    return space.newbool(cased)
+
+def _strip(space, w_self, w_chars, left, right):
+    """Internal helper for unicode strip/lstrip/rstrip with an explicit
+    character set.
+
+    Removes characters contained in w_chars from the left end (if 'left'
+    is true) and/or the right end (if 'right' is true) of w_self and
+    returns the remainder as a new W_UnicodeObject.
+    """
+    u_self = w_self._value
+    u_chars = w_chars._value
+    
+    lpos = 0
+    rpos = len(u_self)
+    
+    if left:
+        while lpos < rpos and u_self[lpos] in u_chars:
+           lpos += 1
+       
+    if right:
+        while rpos > lpos and u_self[rpos - 1] in u_chars:
+           rpos -= 1
+           
+    # copy the surviving window [lpos, rpos) into a fresh list
+    result = [u'\0'] * (rpos - lpos)
+    for i in range(rpos - lpos):
+        result[i] = u_self[lpos + i]
+    return W_UnicodeObject(space, result)
+
+def _strip_none(space, w_self, left, right):
+    """Internal helper for unicode strip/lstrip/rstrip without a character
+    set.
+
+    Removes characters for which _isspace() is true from the left end (if
+    'left' is true) and/or the right end (if 'right' is true) of w_self and
+    returns the remainder as a new W_UnicodeObject.
+    """
+    u_self = w_self._value
+    
+    lpos = 0
+    rpos = len(u_self)
+    
+    if left:
+        while lpos < rpos and _isspace(u_self[lpos]):
+           lpos += 1
+       
+    if right:
+        while rpos > lpos and _isspace(u_self[rpos - 1]):
+           rpos -= 1
+       
+    # copy the surviving window [lpos, rpos) into a fresh list
+    result = [u'\0'] * (rpos - lpos)
+    for i in range(rpos - lpos):
+        result[i] = u_self[lpos + i]
+    return W_UnicodeObject(space, result)
+
+# unicode.strip(): one implementation per type of the 'chars' argument.
+def unicode_strip__Unicode_None(space, w_self, w_chars):
+    # no character set given: strip whitespace from both ends
+    return _strip_none(space, w_self, 1, 1)
+def unicode_strip__Unicode_Unicode(space, w_self, w_chars):
+    # strip the characters of w_chars from both ends
+    return _strip(space, w_self, w_chars, 1, 1)
+def unicode_strip__Unicode_String(space, w_self, w_chars):
+    # convert the byte string to unicode, then call 'strip' again
+    return space.call_method(w_self, 'strip',
+                             space.call_function(space.w_unicode, w_chars))
+
+# unicode.lstrip(): one implementation per type of the 'chars' argument.
+def unicode_lstrip__Unicode_None(space, w_self, w_chars):
+    # no character set given: strip whitespace from the left end only
+    return _strip_none(space, w_self, 1, 0)
+def unicode_lstrip__Unicode_Unicode(space, w_self, w_chars):
+    # strip the characters of w_chars from the left end only
+    return _strip(space, w_self, w_chars, 1, 0)
+def unicode_lstrip__Unicode_String(space, w_self, w_chars):
+    # convert the byte string to unicode, then call 'lstrip' again
+    return space.call_method(w_self, 'lstrip',
+                             space.call_function(space.w_unicode, w_chars))
+
+# unicode.rstrip(): one implementation per type of the 'chars' argument.
+def unicode_rstrip__Unicode_None(space, w_self, w_chars):
+    # no character set given: strip whitespace from the right end only
+    return _strip_none(space, w_self, 0, 1)
+def unicode_rstrip__Unicode_Unicode(space, w_self, w_chars):
+    # strip the characters of w_chars from the right end only
+    return _strip(space, w_self, w_chars, 0, 1)
+def unicode_rstrip__Unicode_String(space, w_self, w_chars):
+    # convert the byte string to unicode, then call 'rstrip' again
+    return space.call_method(w_self, 'rstrip',
+                             space.call_function(space.w_unicode, w_chars))
+
+def unicode_capitalize__Unicode(space, w_self):
+    """Implement unicode.capitalize().
+
+    The first character is mapped through unicodedb.toupper(), all others
+    through unicodedb.tolower().  An empty input gives a new empty
+    W_UnicodeObject.
+    """
+    input = w_self._value
+    if len(input) == 0:
+        return W_UnicodeObject(space, [])
+    result = [u'\0'] * len(input)
+    result[0] = unichr(unicodedb.toupper(ord(input[0])))
+    for i in range(1, len(input)):
+        result[i] = unichr(unicodedb.tolower(ord(input[i])))
+    return W_UnicodeObject(space, result)
+
+def unicode_title__Unicode(space, w_self):
+    """Implement unicode.title().
+
+    A character following a cased character is lowercased; any other
+    character is mapped through unicodedb.totitle().  An empty input is
+    returned unchanged (w_self itself).
+    """
+    input = w_self._value
+    if len(input) == 0:
+        return w_self
+    result = [u'\0'] * len(input)
+
+    previous_is_cased = 0
+    for i in range(len(input)):
+        unichar = ord(input[i])
+        if previous_is_cased:
+            result[i] = unichr(unicodedb.tolower(unichar))
+        else:
+            result[i] = unichr(unicodedb.totitle(unichar))
+        # the decision for the next character depends on the *input* char
+        previous_is_cased = unicodedb.iscased(unichar)
+    return W_UnicodeObject(space, result)
+
+def unicode_lower__Unicode(space, w_self):
+    """Implement unicode.lower(): map each character through
+    unicodedb.tolower() and wrap the result in a new W_UnicodeObject."""
+    lowered = []
+    for uchar in w_self._value:
+        lowered.append(unichr(unicodedb.tolower(ord(uchar))))
+    return W_UnicodeObject(space, lowered)
+
+def unicode_upper__Unicode(space, w_self):
+    """Implement unicode.upper(): map each character through
+    unicodedb.toupper() and wrap the result in a new W_UnicodeObject."""
+    uppered = []
+    for uchar in w_self._value:
+        uppered.append(unichr(unicodedb.toupper(ord(uchar))))
+    return W_UnicodeObject(space, uppered)
+
+def unicode_swapcase__Unicode(space, w_self):
+    """Implement unicode.swapcase().
+
+    Lowercase characters are uppercased, uppercase characters are
+    lowercased, and every other character is copied unchanged.
+    """
+    input = w_self._value
+    result = [u'\0'] * len(input)
+    for i in range(len(input)):
+        unichar = ord(input[i])
+        if unicodedb.islower(unichar):
+            result[i] = unichr(unicodedb.toupper(unichar))
+        elif unicodedb.isupper(unichar):
+            result[i] = unichr(unicodedb.tolower(unichar))
+        else:
+            result[i] = input[i]
+    return W_UnicodeObject(space, result)
+
+def _normalize_index(length, index):
+    """Clamp a slice-style index into the range [0, length].
+
+    A negative index counts from the end and is clamped at 0; an index
+    larger than 'length' is clamped at 'length'.
+    """
+    if index >= 0:
+        if index > length:
+            return length
+        return index
+    shifted = index + length
+    if shifted < 0:
+        return 0
+    return shifted
 
-def ge__Unicode_ANY(space, w_uni, w_other):
-    try:
-        return space.newbool(space.unwrap(w_uni) >= space.unwrap(w_other))
-    except:
-        wrap_exception(space)
+def unicode_endswith__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    """Implement unicode.endswith(substr, start, end).
+
+    Compares w_substr against the tail of the [start, end) window of
+    w_self; the answer is space.w_False when that window is shorter than
+    the substring.
+    """
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
 
-def ord__Unicode(space, w_uni):
-    try:
-        return space.wrap(ord(space.unwrap(w_uni)))
-    except:
-        wrap_exception(space)
+    substr = w_substr._value
+    substr_len = len(substr)
+    
+    if end - start < substr_len:
+        return space.w_False # substring is too long
+    # align the comparison with the end of the window
+    start = end - substr_len
+    for i in range(substr_len):
+        if self[start + i] != substr[i]:
+            return space.w_False
+    return space.w_True
 
-# xxx unicode.__float__ should not exist. For now this approach avoids to deal with unicode in more places
-def float__Unicode(space, w_uni):
-    try:
-        return space.wrap(float(unicode_to_decimal_w(space, w_uni)))
-    except:
-        wrap_exception(space)
-        
-# xxx unicode.__int__ should not exist
-def int__Unicode(space, w_uni):
-    try:
-        s = unicode_to_decimal_w(space, w_uni)
-    except:
-        wrap_exception(space)
-        raise
-    return space.call_function(space.w_int, space.wrap(s))
+def unicode_startswith__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    """Implement unicode.startswith(substr, start, end).
+
+    Compares w_substr against the head of the [start, end) window of
+    w_self; the answer is space.w_False when that window is shorter than
+    the substring.
+    """
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
 
-# xxx unicode.__long__ should not exist
-def long__Unicode(space, w_uni):
-    try:
-        return string_to_w_long(space, unicode_to_decimal_w(space, w_uni))
-    except ParseStringError, e:
-        raise OperationError(space.w_ValueError, space.wrap(e.msg))    
-    except:
-        wrap_exception(space)
+    substr = w_substr._value
+    substr_len = len(substr)
+    
+    if end - start < substr_len:
+        return space.w_False # substring is too long
+    
+    for i in range(substr_len):
+        if self[start + i] != substr[i]:
+            return space.w_False
+    return space.w_True
 
-def add__Unicode_Unicode(space, w_left, w_right):
-    return space.wrap(space.unwrap(w_left) + space.unwrap(w_right))
+def unicode_center__Unicode_ANY(space, w_self, w_width):
+    """Implement unicode.center(width), padding with spaces.
+
+    If the string is longer than 'width' the result is unicode(w_self).
+    """
+    self = w_self._value
+    width = space.int_w(w_width)
+    padding = width - len(self)
+    if padding < 0:
+        return space.call_function(space.w_unicode, w_self)
+    # the odd leftover column goes to the left only when both the padding
+    # and the width are odd
+    leftpad = padding // 2 + (padding & width & 1)
+    result = [u' '] * width
+    for i in range(len(self)):
+        result[leftpad + i] = self[i]
+    return W_UnicodeObject(space, result)
 
-def contains__String_Unicode(space, w_left, w_right):
-    try:
-        return space.wrap(space.unwrap(w_right) in space.unwrap(w_left))
-    except:
-        wrap_exception(space)
 
-def contains__Unicode_Unicode(space, w_left, w_right):
-    return space.wrap(space.unwrap(w_right) in space.unwrap(w_left))
+def unicode_ljust__Unicode_ANY(space, w_self, w_width):
+    """Implement unicode.ljust(width): pad with spaces on the right.
+
+    If the string is longer than 'width' the result is unicode(w_self).
+    """
+    self = w_self._value
+    width = space.int_w(w_width)
+    padding = width - len(self)
+    if padding < 0:
+        return space.call_function(space.w_unicode, w_self)
+    result = [u' '] * width
+    for i in range(len(self)):
+        result[i] = self[i]
+    return W_UnicodeObject(space, result)
 
-# str.strip(unicode) needs to convert self to unicode and call unicode.strip
-def str_strip__String_Unicode(space, w_self, w_chars ):
+def unicode_rjust__Unicode_ANY(space, w_self, w_width):
+    """Implement unicode.rjust(width): pad with spaces on the left.
+
+    If the string is longer than 'width' the result is unicode(w_self).
+    """
+    self = w_self._value
+    width = space.int_w(w_width)
+    padding = width - len(self)
+    if padding < 0:
+        return space.call_function(space.w_unicode, w_self)
+    result = [u' '] * width
+    for i in range(len(self)):
+        result[padding + i] = self[i]
+    return W_UnicodeObject(space, result)
+    
+def unicode_zfill__Unicode_ANY(space, w_self, w_width):
+    """Implement unicode.zfill(width): pad with u'0' on the left.
+
+    An empty string gives 'width' zeros.  If no padding is needed the
+    result is unicode(w_self).  A leading u'+' or u'-' stays in front of
+    the inserted zeros.
+    """
+    self = w_self._value
+    width = space.int_w(w_width)
+    if len(self) == 0:
+        return W_UnicodeObject(space, [u'0'] * width)
+    padding = width - len(self)
+    if padding <= 0:
+        return space.call_function(space.w_unicode, w_self)
+    result = [u'0'] * width
+    for i in range(len(self)):
+        result[padding + i] = self[i]
+    # Move sign to first position
+    if self[0] in (u'+', u'-'):
+        result[0] = self[0]
+        result[padding] = u'0'
+    return W_UnicodeObject(space, result)
+
+def unicode_splitlines__Unicode_ANY(space, w_self, w_keepends):
+    """Implement unicode.splitlines(keepends).
+
+    Splits at every character for which unicodedb.islinebreak() is true,
+    treating u'\\r\\n' as a single break.  The line terminators are kept in
+    the pieces only when w_keepends is true.  An empty string gives an
+    empty list.
+    """
+    self = w_self._value
+    # normalize the flag to 0/1 so it can be used in slice arithmetic
+    keepends = 0
+    if space.int_w(w_keepends):
+        keepends = 1
+    if len(self) == 0:
+        return space.newlist([])
+    
+    start = 0
+    end = len(self)
+    pos = 0
+    lines = []
+    while pos < end:
+        if unicodedb.islinebreak(ord(self[pos])):
+            if (self[pos] == u'\r' and pos + 1 < end and
+                self[pos + 1] == u'\n'):
+                # Count CRLF as one linebreak
+                # (keepends * 2 keeps both terminator characters)
+                lines.append(W_UnicodeObject(space,
+                                             self[start:pos + keepends * 2]))
+                pos += 1
+            else:
+                lines.append(W_UnicodeObject(space,
+                                             self[start:pos + keepends]))
+            pos += 1
+            start = pos
+        else:
+            pos += 1
+    # a final line without a terminator has not been emitted yet
+    if not unicodedb.islinebreak(ord(self[end - 1])):
+        lines.append(W_UnicodeObject(space, self[start:]))
+    return space.newlist(lines)
+
+def unicode_find__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    """Implement unicode.find(substr, start, end).
+
+    Normalizes start/end against the string length and wraps whatever
+    _find() returns.
+    """
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    return space.wrap(_find(self, substr, start, end))
+
+def unicode_rfind__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    """Implement unicode.rfind(substr, start, end).
+
+    Normalizes start/end against the string length and wraps whatever
+    _rfind() returns.
+    """
+    self = w_self._value
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    return space.wrap(_rfind(self, substr, start, end))
+
+def unicode_index__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    """Implement unicode.index(substr, start, end).
+
+    Same search as find(), but a negative result from _find() is turned
+    into an app-level ValueError('substring not found').
+    """
     self = w_self._value
-    return space.wrap( unicode(self).strip( space.unwrap(w_chars) ) )
-def str_lstrip__String_Unicode(space, w_self, w_chars ):
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    index = _find(self, substr, start, end)
+    if index < 0:
+        raise OperationError(space.w_ValueError,
+                             space.wrap('substring not found'))
+    return space.wrap(index)
+
+def unicode_rindex__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    """Implement unicode.rindex(substr, start, end).
+
+    Same search as rfind(), but a negative result from _rfind() is turned
+    into an app-level ValueError('substring not found').
+    """
     self = w_self._value
-    return space.wrap( unicode(self).lstrip( space.unwrap(w_chars) ) )
-def str_rstrip__String_Unicode(space, w_self, w_chars ):
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    index = _rfind(self, substr, start, end)
+    if index < 0:
+        raise OperationError(space.w_ValueError,
+                             space.wrap('substring not found'))
+    return space.wrap(index)
+
+def unicode_count__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+    """Implement unicode.count(substr, start, end).
+
+    Counts the non-overlapping occurrences of w_substr inside the
+    [start, end) window of w_self and returns the wrapped count.
+    """
     self = w_self._value
-    return space.wrap( unicode(self).rstrip( space.unwrap(w_chars) ) )
-# we use the following magic to register strip_string_unicode as a String multimethod
-import stringtype
+    start = _normalize_index(len(self), space.int_w(w_start))
+    end = _normalize_index(len(self), space.int_w(w_end))
+    substr = w_substr._value
+    # Resume the search after the whole match so that overlapping
+    # occurrences are not counted (u'aaa'.count(u'aa') == 1).  An empty
+    # substring must still advance by one position to terminate.
+    step = len(substr)
+    if step == 0:
+        step = 1
+    count = 0
+    while start <= end:
+        index = _find(self, substr, start, end)
+        if index < 0:
+            break
+        start = index + step
+        count += 1
+    return space.wrap(count)
 
 
-register_all(vars(), stringtype)
+def unicode_split__Unicode_None_ANY(space, w_self, w_none, w_maxsplit):
+    """Implement unicode.split(None, maxsplit): split on runs of whitespace.
+
+    Leading and trailing whitespace never produce empty items, so an empty
+    or whitespace-only string gives an empty list.  At most 'maxsplit'
+    splits are done (a negative value means no limit); the unsplit
+    remainder becomes the last item.
+    """
+    self = w_self._value
+    maxsplit = space.int_w(w_maxsplit)
+    parts = []
+    end = len(self)
+    # Skip leading whitespace so that it cannot yield an empty first item.
+    start = 0
+    while start < end and _isspace(self[start]):
+        start += 1
+    if start == end:
+        # empty or whitespace-only string
+        return space.newlist(parts)
+    while maxsplit != 0 and start < end:
+        index = start
+        for index in range(start, end):
+            if _isspace(self[index]):
+                break
+        else:
+            # no more whitespace: the rest is the last item
+            break
+        parts.append(W_UnicodeObject(space, self[start:index]))
+        maxsplit -= 1
+        # Eat whitespace
+        for start in range(index + 1, end):
+            if not _isspace(self[start]):
+                break
+        else:
+            # only trailing whitespace was left
+            return space.newlist(parts)
+    parts.append(W_UnicodeObject(space, self[start:]))
+    return space.newlist(parts)
+
+
+def unicode_split__Unicode_Unicode_ANY(space, w_self, w_delim, w_maxsplit):
+    """Implement unicode.split(delim, maxsplit) for an explicit delimiter.
+
+    Raises an app-level ValueError('empty separator') for an empty
+    delimiter.  At most 'maxsplit' splits are done (a negative value means
+    no limit).  Splitting an empty string gives a list holding one empty
+    string, as in CPython.
+    """
+    self = w_self._value
+    delim = w_delim._value
+    maxsplit = space.int_w(w_maxsplit)
+    delim_len = len(delim)
+    if delim_len == 0:
+        raise OperationError(space.w_ValueError,
+                             space.wrap('empty separator'))
+    if len(self) == 0:
+        # u''.split(u',') == [u''], not []
+        return space.newlist([W_UnicodeObject(space, [])])
+    parts = []
+    start = 0
+    end = len(self)
+    while maxsplit != 0:
+        index = _find(self, delim, start, end)
+        if index < 0:
+            break
+        parts.append(W_UnicodeObject(space, self[start:index]))
+        start = index + delim_len
+        maxsplit -= 1
+    parts.append(W_UnicodeObject(space, self[start:]))
+    return space.newlist(parts)
+
+def _split(space, self, maxsplit):
+    """Split 'self' between its characters; used by replace() when the
+    search string is empty.
+
+    Returns an interpreter-level list of W_UnicodeObjects: [] for an empty
+    input, [whole string] for maxsplit == 0, otherwise a leading empty
+    string, then single-character pieces while the split budget lasts,
+    then the remainder (possibly empty).
+    """
+    if len(self) == 0:
+        return []
+    if maxsplit == 0:
+        return [W_UnicodeObject(space, self)]
+    index = 0
+    end = len(self)
+    # the empty piece in front of the first character uses one split
+    parts = [W_UnicodeObject(space, [])]
+    maxsplit -= 1
+    while maxsplit != 0:
+        if index >= end:
+            break
+        parts.append(W_UnicodeObject(space, [self[index]]))
+        index += 1
+        maxsplit -= 1
+    parts.append(W_UnicodeObject(space, self[index:]))
+    return parts
+    
+def unicode_replace__Unicode_Unicode_Unicode_ANY(space, w_self, w_old,
+                                                 w_new, w_maxsplit):
+    """Implement unicode.replace(old, new, maxsplit) as split-then-join.
+
+    A non-empty 'old' is handled by the 'split' method; an empty 'old'
+    uses _split() to cut between the characters.  The pieces are joined
+    with w_new.
+    """
+    if len(w_old._value):
+        w_parts = space.call_method(w_self, 'split', w_old, w_maxsplit)
+    else:
+        self = w_self._value
+        maxsplit = space.int_w(w_maxsplit)
+        w_parts = space.newlist(_split(space, self, maxsplit))
+    return space.call_method(w_new, 'join', w_parts)
+    
+
+# NOTE(review): the bare string below is a no-op expression statement.
+'translate'
+# Application-level implementations of expandtabs, translate and the
+# '%' operator.  Each function in the source string is fetched by name
+# with app.interphook() below.
+app = gateway.applevel(r'''
+import sys
+
+def unicode_expandtabs__Unicode_ANY(self, tabsize):
+    parts = self.split(u'\t')
+    if not parts:
+        # nothing to expand; do not index into an empty list
+        return u''
+    result = [ parts[0] ]
+    # prevsize is the current column, counted from the last line start
+    prevsize = 0
+    for ch in parts[0]:
+        prevsize += 1
+        if ch in (u"\n", u"\r"):
+            prevsize = 0
+    for i in range(1, len(parts)):
+        # a tab size <= 0 removes the tabs (and avoids a modulo by zero)
+        if tabsize > 0:
+            pad = tabsize - prevsize % tabsize
+            result.append(u' ' * pad)
+        nextpart = parts[i]
+        result.append(nextpart)
+        prevsize = 0
+        for ch in nextpart:
+            prevsize += 1
+            if ch in (u"\n", u"\r"):
+                prevsize = 0
+    return u''.join(result)
+
+def unicode_translate__Unicode_ANY(self, table):
+    result = []
+    for unichar in self:
+        try:
+            newval = table[ord(unichar)]
+        except LookupError:
+            # no mapping (KeyError or IndexError): keep the character
+            result.append(unichar)
+        else:
+            if newval is None:
+                continue
+            elif isinstance(newval, int):
+                if newval < 0 or newval > sys.maxunicode:
+                    raise TypeError("character mapping must be in range(0x%x)"%(sys.maxunicode + 1,))
+                result.append(unichr(newval))
+            elif isinstance(newval, unicode):
+                result.append(newval)
+            else:
+                raise TypeError("character mapping must return integer, None or unicode")
+    # join with a unicode separator so that an empty result is still unicode
+    return u''.join(result)
+
+def mod__Unicode_ANY(format, values):
+    import _formatting
+    if isinstance(values, tuple):
+        return _formatting.format(format, values, None, do_unicode=True)
+    if hasattr(values, 'keys'):
+        return _formatting.format(format, (values,), values, do_unicode=True)
+    return _formatting.format(format, (values,), None, do_unicode=True)
+''')
+unicode_expandtabs__Unicode_ANY = app.interphook('unicode_expandtabs__Unicode_ANY')
+unicode_translate__Unicode_ANY = app.interphook('unicode_translate__Unicode_ANY')
+mod__Unicode_ANY = app.interphook('mod__Unicode_ANY')
+
+# Presumably binds the <method>__<Types> functions defined above as
+# implementations of the multimethods declared in unicodetype -- confirm
+# against register_all().
+import unicodetype
+register_all(vars(), unicodetype)
+
+# String methods called with a unicode argument convert self to unicode
+# and call the unicode method of the same name.  The class body is only a
+# namespace: its functions are registered as String multimethods by the
+# register_all() call at the end of the body.
+class str_methods:
+    import stringtype
+    W_UnicodeObject = W_UnicodeObject
+    from pypy.objspace.std.stringobject import W_StringObject
+    def str_strip__String_Unicode(space, w_self, w_chars):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'strip', w_chars)
+    def str_lstrip__String_Unicode(space, w_self, w_chars):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'lstrip', w_chars)
+    def str_rstrip__String_Unicode(space, w_self, w_chars):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'rstrip', w_chars)
+    def str_count__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'count', w_substr, w_start, w_end)
+    def str_find__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'find', w_substr, w_start, w_end)
+    def str_rfind__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'rfind', w_substr, w_start, w_end)
+    def str_index__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'index', w_substr, w_start, w_end)
+    def str_rindex__String_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'rindex', w_substr, w_start, w_end)
+
+    def str_replace__String_Unicode_Unicode_ANY(space, w_self, w_old, w_new, w_maxsplit):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'replace', w_old, w_new, w_maxsplit)
+
+    def str_split__String_Unicode_ANY(space, w_self, w_delim, w_maxsplit):
+        return space.call_method(space.call_function(space.w_unicode, w_self),
+                                 'split', w_delim, w_maxsplit)
+        
+    register_all(vars(), stringtype)

Modified: pypy/dist/pypy/objspace/std/unicodetype.py
==============================================================================
--- pypy/dist/pypy/objspace/std/unicodetype.py	(original)
+++ pypy/dist/pypy/objspace/std/unicodetype.py	Mon May 23 13:05:59 2005
@@ -1,3 +1,119 @@
-from pypy.objspace.std.fake import fake_type
+from pypy.objspace.std.stdtypedef import *
+from pypy.objspace.std.basestringtype import basestring_typedef
+from pypy.interpreter.error import OperationError
 
-unicode_typedef = fake_type(unicode).typedef
+from sys import maxint
+
+# Multimethod declarations for the unicode type, in the form
+# MultiMethod(app-level name, arity, defaults=...).
+# NOTE(review): the arity appears to include the receiver (e.g. 1 for
+# capitalize, 4 for count with two defaulted arguments) -- confirm
+# against MultiMethod.
+unicode_capitalize = MultiMethod('capitalize', 1)
+unicode_center     = MultiMethod('center', 2, )
+unicode_count      = MultiMethod('count', 4, defaults=(0, maxint))      
+unicode_encode     = MultiMethod('encode', 3, defaults=(None, None))
+unicode_endswith   = MultiMethod('endswith', 4, defaults=(0,maxint))
+unicode_expandtabs = MultiMethod('expandtabs', 2, defaults=(8,))
+unicode_find       = MultiMethod('find', 4, defaults=(0, maxint))
+unicode_index      = MultiMethod('index', 4, defaults=(0, maxint))
+unicode_isalnum    = MultiMethod('isalnum', 1)
+unicode_isalpha    = MultiMethod('isalpha', 1)
+unicode_isdecimal  = MultiMethod('isdecimal', 1)
+unicode_isdigit    = MultiMethod('isdigit', 1)
+unicode_islower    = MultiMethod('islower', 1)
+unicode_isnumeric  = MultiMethod('isnumeric', 1)
+unicode_isspace    = MultiMethod('isspace', 1)
+unicode_istitle    = MultiMethod('istitle', 1)
+unicode_isupper    = MultiMethod('isupper', 1)
+unicode_join       = MultiMethod('join', 2)
+unicode_ljust      = MultiMethod('ljust', 2)
+unicode_lower      = MultiMethod('lower', 1)
+unicode_lstrip     = MultiMethod('lstrip', 2, defaults=(None,))
+unicode_replace    = MultiMethod('replace', 4, defaults=(-1,))
+unicode_rfind      = MultiMethod('rfind', 4, defaults=(0, maxint))
+unicode_rindex     = MultiMethod('rindex', 4, defaults=(0, maxint))
+unicode_rjust      = MultiMethod('rjust', 2)
+unicode_rstrip     = MultiMethod('rstrip', 2, defaults=(None,))
+unicode_split      = MultiMethod('split', 3, defaults=(None,-1))
+unicode_splitlines = MultiMethod('splitlines', 2, defaults=(0,))
+unicode_startswith = MultiMethod('startswith', 4, defaults=(0,maxint))
+unicode_strip      = MultiMethod('strip',  2, defaults=(None,))
+unicode_swapcase   = MultiMethod('swapcase', 1)
+unicode_title      = MultiMethod('title', 1)
+unicode_translate  = MultiMethod('translate', 2)
+unicode_upper      = MultiMethod('upper', 1)
+unicode_zfill      = MultiMethod('zfill', 2)
+unicode_getslice   = MultiMethod('__getslice__', 3)
+# ____________________________________________________________
+
+# Application-level helpers used by descr__new__ to build a unicode
+# object from an arbitrary object or from encoded data.
+app = gateway.applevel('''
+import codecs, sys
+
+def unicode_from_encoded_object(obj, encoding, errors):
+    # Decode obj with the given (or the default) encoding.
+    # Fix later for buffer
+    if type(obj).__name__ == 'buffer':
+        obj = obj.buf
+    if encoding is None:
+        encoding = sys.getdefaultencoding()
+    decoder = codecs.getdecoder(encoding)
+    # the decoder returns (object, length consumed); the length is unused
+    if errors is None:
+        retval, length = decoder(obj)
+    else:
+        retval, length = decoder(obj, errors)
+    if not isinstance(retval, unicode):
+        raise TypeError("decoder did not return an unicode object (type=%s)" %
+                        type(retval).__name__)
+    return retval
+
+def unicode_from_object(obj):
+    # Use obj.__unicode__() when available, else str(obj); a non-unicode
+    # result is decoded with the default encoding and "strict" errors.
+    if isinstance(obj, str):
+        res = obj
+    else:
+        try:
+            unicode_method = obj.__unicode__
+        except AttributeError:
+            res = str(obj)
+        else:
+            res = unicode_method()
+    if isinstance(res, unicode):
+        return res
+    return unicode_from_encoded_object(res, None, "strict")
+    
+''')
+unicode_from_object = app.interphook('unicode_from_object')
+unicode_from_encoded_object = app.interphook('unicode_from_encoded_object')
+
+
+def descr__new__(space, w_unicodetype, w_obj=None, w_encoding=None, w_errors=None):
+    """Implement unicode.__new__(type, obj, encoding, errors).
+
+    Raises an app-level TypeError when an encoding or errors argument is
+    given together with an object whose type is exactly unicode.
+    """
+    from pypy.objspace.std.unicodeobject import W_UnicodeObject
+    w_obj_type = space.type(w_obj)
+    
+    if space.is_w(w_obj_type, space.w_unicode):
+        if (not space.is_w(w_encoding, space.w_None) or
+            not space.is_w(w_errors, space.w_None)):
+            raise OperationError(space.w_TypeError,
+                                 space.wrap('decoding Unicode is not supported'))
+        # unicode(u) with the exact type on both sides: hand back w_obj
+        if space.is_w(w_unicodetype, space.w_unicode):
+            return w_obj
+        w_value = w_obj
+    elif space.is_w(w_obj, space.w_None):
+        w_value = W_UnicodeObject(space, [])
+    elif (space.is_w(w_encoding, space.w_None) and
+          space.is_w(w_errors, space.w_None)):
+        # no decoding requested: take an instance of a unicode subclass
+        # as it is, convert anything else at application level
+        if space.is_true(space.isinstance(w_obj, space.w_unicode)):
+            w_value = w_obj
+        else:
+            w_value = unicode_from_object(space, w_obj)
+    else:
+        w_value = unicode_from_encoded_object(space, w_obj, w_encoding, w_errors)
+    # build an instance of the requested (sub)type holding the same value
+    w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype)
+    w_newobj.__init__(space, w_value._value)
+    return w_newobj
+
+# ____________________________________________________________
+
+# Type definition of the app-level 'unicode' type, based on
+# basestring_typedef.
+unicode_typedef = StdTypeDef("unicode", basestring_typedef,
+    __new__ = newmethod(descr__new__),
+    __doc__ = '''unicode(string [, encoding[, errors]]) -> object
+
+Create a new Unicode object from the given encoded string.
+encoding defaults to the current default string encoding.
+errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.'''
+    )
+# Presumably exposes the MultiMethod objects found in this module's
+# globals as methods of the type -- confirm in StdTypeDef.registermethods.
+unicode_typedef.registermethods(globals())



More information about the Pypy-commit mailing list