[pypy-svn] r11639 - in pypy/dist/pypy/lib: . test2

ale at codespeak.net ale at codespeak.net
Sat Apr 30 00:00:11 CEST 2005


Author: ale
Date: Sat Apr 30 00:00:11 2005
New Revision: 11639

Added:
   pypy/dist/pypy/lib/_codecs.py
   pypy/dist/pypy/lib/codecs.py
   pypy/dist/pypy/lib/test2/test_codeccallbacks.py
   pypy/dist/pypy/lib/unicodecodec.py
Log:
A preliminary checkin of the python translation of the _codecsmodule (and friends).

The module _codecs exposes the same methods as _codecs.pyd in CPython. As in CPython, it is a wrapper for the actual codecs, which I have put in unicodecodec.py (for now). Not all codecs are implemented yet, and not all tests pass.

I had to change the test2/test_codeccallbacks.py in order to test the codecs without involving unicode strings.

There are some serious problems to be solved:

1) how to make a global registry from appspace. I have used module variables (in _codecs.py) for now. There may be better ways.

2) The UnicodeError, UnicodeEncodeError and UnicodeDecodeError exceptions are faked; they don't work as in CPython anyway.

3) In translating the design from _codecsmodule.c I have discovered a side effect. The Encodings package injects the functions exposed in _codecs.py into a class, which converts each function into an unbound method. A comment in the Encodings package mentions that this doesn't happen for C functions.

This is work in progress

Added: pypy/dist/pypy/lib/_codecs.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/lib/_codecs.py	Sat Apr 30 00:00:11 2005
@@ -0,0 +1,351 @@
+"""
+
+   _codecs -- Provides access to the codec registry and the builtin
+              codecs.
+
+   This module should never be imported directly. The standard library
+   module "codecs" wraps this builtin module for use within Python.
+
+   The codec registry is accessible via:
+
+     register(search_function) -> None
+
+     lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
+
+   The builtin Unicode codecs use the following interface:
+
+     <encoding>_encode(Unicode_object[,errors='strict']) -> 
+         (string object, bytes consumed)
+
+     <encoding>_decode(char_buffer_obj[,errors='strict']) -> 
+        (Unicode object, bytes consumed)
+
+   <encoding>_encode() interfaces also accept non-Unicode object as
+   input. The objects are then converted to Unicode using
+   PyUnicode_FromObject() prior to applying the conversion.
+
+   These <encoding>s are available: utf_8, unicode_escape,
+   raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
+   mbcs (on win32).
+
+
+Written by Marc-Andre Lemburg (mal at lemburg.com).
+
+Copyright (c) Corporation for National Research Initiatives.
+
+"""
+from pypy.lib.unicodecodec import *
+
#/* --- Registry ----------------------------------------------------------- */
codec_search_path = []
codec_search_cache = {}
codec_error_registry = {}

def codec_register(search_function):
    """register(search_function)

    Register a codec search function. Search functions are expected to take
    one argument, the encoding name in all lower case letters, and return
    a tuple of functions (encoder, decoder, stream_reader, stream_writer).
    """
    # Non-callables are silently ignored.
    if callable(search_function):
        codec_search_path.append(search_function)

register = codec_register

def codec_lookup(encoding):
    """lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)

    Looks up a codec tuple in the Python codec registry and returns
    a tuple of functions.

    Raises LookupError when no registered search function can handle
    the encoding.
    """
    result = codec_search_cache.get(encoding, None)
    if not result:
        for search in codec_search_path:
            result = search(encoding)
            if result:
                # Bug fix: the cache was never populated, so every lookup
                # re-ran all search functions.  Store the hit.
                codec_search_cache[encoding] = result
                break
        else:
            # Bug fix: previously returned None here, making callers
            # crash with a TypeError on subscripting.
            raise LookupError("unknown encoding: %s" % encoding)
    return result

lookup = codec_lookup

def lookup_error(errors):
    """lookup_error(errors) -> handler

    Return the error handler for the specified error handling name
    or raise a LookupError, if no handler exists under this name.
    """
    try:
        err_handler = codec_error_registry[errors]
    except KeyError:
        raise LookupError("unknown error handler name %s" % errors)
    return err_handler

def register_error(errors, handler):
    """register_error(errors, handler)

    Register the specified error handler under the name
    errors. handler must be a callable object, that
    will be called with an exception instance containing
    information about the location of the encoding/decoding
    error and must return a (replacement, new position) tuple.
    """
    if callable(handler):
        codec_error_registry[errors] = handler
    else:
        raise TypeError("handler must be callable")
+    
def encode(v, encoding='defaultencoding', errors='strict'):
    """encode(obj, [encoding[,errors]]) -> object

    Encodes obj using the codec registered for encoding. encoding defaults
    to the default encoding. errors may be given to set a different error
    handling scheme. Default is 'strict' meaning that encoding errors raise
    a ValueError. Other possible values are 'ignore', 'replace' and
    'xmlcharrefreplace' as well as any other name registered with
    codecs.register_error that can handle ValueErrors.
    """
    encoder = lookup(encoding)[0]
    if encoder is None:
        # Bug fix: previously fell through and raised NameError on the
        # unbound local 'res' when no encoder was available.
        raise LookupError("no encoder found for encoding %s" % encoding)
    res = encoder(v, errors)
    return res[0]
+
def decode(obj, encoding='defaultencoding', errors='strict'):
    """decode(obj, [encoding[,errors]]) -> object

    Decodes obj using the codec registered for encoding. encoding defaults
    to the default encoding. errors may be given to set a different error
    handling scheme. Default is 'strict' meaning that encoding errors raise
    a ValueError. Other possible values are 'ignore' and 'replace'
    as well as any other name registered with codecs.register_error that is
    able to handle ValueErrors.
    """
    decoder = lookup(encoding)[1]
    if decoder is None:
        # Bug fix: previously fell through and raised NameError on the
        # unbound local 'res' when no decoder was available.
        raise LookupError("no decoder found for encoding %s" % encoding)
    res = decoder(obj, errors)
    return res[0]
+
+def latin_1_encode(inst,obj,errors='strict'):
+    """latin_1_encode(obj[, errors]) -> (string, length)
+
+    Encode obj with the Latin-1 codec via PyUnicode_EncodeLatin1.
+    'inst' is the unused slot filled in when the Encodings package turns
+    this function into an unbound method (see the checkin message).
+    """
+    res = PyUnicode_EncodeLatin1(obj,len(obj),errors)
+    return res, len(res)
+# XXX MBCS codec might involve ctypes ?
+def mbcs_decode():
+    """mbcs_decode -- not implemented yet; returns None.
+
+    Stub for the Windows MBCS decoder.
+    """
+    # NOTE(review): signature differs from the other *_decode stubs
+    # (no inst/data/errors parameters) -- confirm before wiring up.
+    pass
+
def readbuffer_encode(inst, obj, errors='strict'):
    """Encode a read-buffer compatible object by taking its string value.

    Returns a (string, length) pair.  'inst' (the unbound-method slot)
    and 'errors' are accepted for interface compatibility and ignored.
    """
    encoded = str(obj)
    return encoded, len(encoded)
+
def escape_encode(inst, obj, errors='strict'):
    """Encode obj to its Python string-escape form.

    Takes repr() of the object and strips the surrounding quote
    characters; returns (string, length).
    """
    escaped = repr(obj)[1:-1]
    return escaped, len(escaped)
+# XXX
+def utf_8_decode(inst,data,errors='strict'):
+    """utf_8_decode -- not implemented yet; returns None."""
+    pass
+# XXX
+def raw_unicode_escape_decode(inst,data,errors='strict'):
+    """raw_unicode_escape_decode -- not implemented yet; returns None."""
+    pass
+
def utf_7_decode(inst, data, errors='strict'):
    """utf_7_decode(data[, errors]) -> (unicode, length)

    Decode data with the UTF-7 codec via PyUnicode_DecodeUTF7.
    'inst' is the unused unbound-method slot.
    """
    # Bug fix: the caller's errors argument was ignored -- the helper
    # was always invoked with errors='strict'.
    unistr = PyUnicode_DecodeUTF7(data, errors=errors)
    return unistr, len(unistr)
+# XXX
+def unicode_escape_encode(inst,obj,errors='strict'):
+    """unicode_escape_encode -- not implemented yet; returns None."""
+    pass
+# XXX
+def latin_1_decode(inst,data,errors='strict'):
+    """latin_1_decode -- not implemented yet; returns None."""
+    pass
+# XXX
+def utf_16_decode(inst,data,errors='strict'):
+    """utf_16_decode -- not implemented yet; returns None."""
+    pass
+# XXX
+def unicode_escape_decode(inst,data,errors='strict'):
+    """unicode_escape_decode -- not implemented yet; returns None."""
+    pass
+
+def ascii_decode(inst,data,errors='strict'):
+    """ascii_decode(data[, errors]) -> (unicode, length)
+
+    Decode data with the 7-bit ASCII codec via PyUnicode_DecodeASCII.
+    'inst' is the unused unbound-method slot.
+    """
+    res = PyUnicode_DecodeASCII(data,len(data),errors)
+    # NOTE(review): the second element is len(decoded result), not the
+    # number of input bytes consumed -- confirm intended.
+    return res,len(res)
+
+def charmap_encode(obj,errors='strict',mapping='latin-1'):
+    """charmap_encode(obj[, errors[, mapping]]) -> (string, length)
+
+    Encode obj with the given character mapping via
+    PyUnicode_EncodeCharmap.
+    """
+    # NOTE(review): unlike the sibling *_encode functions this one takes
+    # no leading 'inst' parameter; when injected as an unbound method
+    # (see checkin message) 'obj' would receive the instance -- confirm.
+    res = PyUnicode_EncodeCharmap(obj,len(obj),mapping,errors)
+    return res,len(res)
+
def unicode_internal_encode(inst, obj, errors='strict'):
    """unicode_internal_encode(obj[, errors]) -> (string, length)

    Encode obj to the internal unicode representation.  Unicode input
    passes through unchanged; other objects are converted with
    PyUnicode_FromUnicode.  'inst' is the unused unbound-method slot.
    """
    if type(obj) == unicode:
        return obj, len(obj)
    else:
        # Bug fix: 'size' was referenced without ever being assigned,
        # so any non-unicode input raised NameError.
        size = len(obj)
        return PyUnicode_FromUnicode(obj, size), size
+# XXX
+def utf_16_ex_decode(inst,data,errors='strict'):
+    """utf_16_ex_decode -- not implemented yet; returns None."""
+    pass
# XXX Check if this is right
def escape_decode(data, errors='strict'):
    """Decode a string-escaped string.

    Currently the identity transform: returns (data, len(data)) without
    interpreting any escape sequences.
    """
    return data, len(data)
+
def charbuffer_encode(inst, obj, errors='strict'):
    """Encode a char-buffer compatible object via its string value.

    Returns (string, length).  'inst' and 'errors' are accepted for
    interface compatibility and ignored.
    """
    text = str(obj)
    return text, len(text)
+# XXX
+def charmap_decode(inst,data,errors='strict'):
+    """charmap_decode -- not implemented yet; returns None."""
+    pass
+
def utf_7_encode(inst, obj, errors='strict'):
    """utf_7_encode(obj[, errors]) -> (string, length)

    Encode obj with the UTF-7 codec; non-unicode input is first
    converted with PyUnicode_FromObject.  'inst' is the unused
    unbound-method slot.
    """
    uni = PyUnicode_FromObject(obj)
    size = PyUnicode_GET_SIZE(uni)
    encoded = PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(uni),
                                   size, 0, 0, errors)
    return encoded, size
+
def mbcs_encode(inst, obj, errors='strict'):
    """mbcs_encode(obj[, errors]) -> (string, length)

    Encode obj with the Windows MBCS codec via PyUnicode_EncodeMBCS.
    'inst' is the unused unbound-method slot.
    """
    size = PyUnicode_GET_SIZE(obj)
    encoded = PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(obj), size, errors)
    return encoded, size
+
def ascii_encode(inst, obj, errors='strict'):
    """ascii_encode(obj[, errors]) -> (string, length)

    Encode obj with the 7-bit ASCII codec via PyUnicode_EncodeASCII.
    'inst' is the unused unbound-method slot.
    """
    size = PyUnicode_GET_SIZE(obj)
    encoded = PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(obj), size, errors)
    return encoded, size
+
def utf_16_encode(inst, obj, errors='strict'):
    """utf_16_encode(obj[, errors]) -> (string, length)

    Encode obj with the UTF-16 codec via PyUnicode_EncodeUTF16.
    'inst' is the unused unbound-method slot.
    """
    encoded = PyUnicode_EncodeUTF16(obj, len(obj), errors)
    return encoded, len(encoded)
+
def raw_unicode_escape_encode(inst, obj, errors='strict'):
    """raw_unicode_escape_encode(obj[, errors]) -> (string, length)

    Encode obj via PyUnicode_EncodeRawUnicodeEscape.  'errors' is
    accepted for interface compatibility but not passed through.
    """
    encoded = PyUnicode_EncodeRawUnicodeEscape(obj, len(obj))
    return encoded, len(encoded)
+# XXX
+def utf_8_encode(inst,obj,errors='strict'):
+    """utf_8_encode -- not implemented yet; returns None."""
+    pass
+# XXX
+def utf_16_le_encode(inst,obj,errors='strict'):
+    """utf_16_le_encode -- not implemented yet; returns None."""
+    pass
+# XXX
+def utf_16_be_encode(inst,obj,errors='strict'):
+    """utf_16_be_encode -- not implemented yet; returns None."""
+    pass
+
def unicode_internal_decode(inst, unistr, errors='strict'):
    """unicode_internal_decode(unistr[, errors]) -> (unicode, length)

    Decode from the internal representation: unicode input passes
    through unchanged (exact-type check, so subclasses are converted),
    anything else is converted with unicode().  'inst' is the unused
    unbound-method slot.
    """
    if type(unistr) == unicode:
        return unistr, len(unistr)
    return unicode(unistr), len(unistr)
+# XXX
+def utf_16_le_decode(inst,data,errors='strict'):
+    """utf_16_le_decode -- not implemented yet; returns None."""
+    pass
+# XXX
+def utf_16_be_decode(inst,data,errors='strict'):
+    """utf_16_be_decode -- not implemented yet; returns None."""
+    pass
+
def strict_errors(exc):
    """Error handler for 'strict': always re-raise the exception."""
    if not isinstance(exc, Exception):
        raise TypeError("codec must pass exception instance")
    raise exc
+    
def ignore_errors(exc):
    """Error handler for 'ignore': drop the offending input.

    Returns an empty replacement and the position after the bad section
    so that coding resumes there.
    """
    # Bug fix: use isinstance so subclasses of the Unicode error types
    # are handled too (the original compared type() for exact matches).
    if isinstance(exc, (UnicodeEncodeError, UnicodeDecodeError,
                        UnicodeTranslateError)):
        return u'', exc.end
    raise TypeError("don't know how to handle %.400s in error callback"
                    % type(exc))
def replace_errors(exc):
    """Error handler for 'replace': substitute replacement characters.

    Implements the previously-missing (XXX) behaviour following
    CPython's PyCodec_ReplaceErrors: '?' per character for encoding
    errors, a single U+FFFD for decoding errors, and U+FFFD per
    character for translation errors.
    """
    if isinstance(exc, UnicodeEncodeError):
        return u'?' * (exc.end - exc.start), exc.end
    elif isinstance(exc, UnicodeDecodeError):
        return u'\ufffd', exc.end
    elif isinstance(exc, UnicodeTranslateError):
        return u'\ufffd' * (exc.end - exc.start), exc.end
    else:
        raise TypeError("don't know how to handle %.400s in error callback"
                        % type(exc))
def xmlcharrefreplace_errors(exc):
    """Error handler for 'xmlcharrefreplace' (encoding only).

    Implements the previously-missing (XXX) behaviour: replaces each
    unencodable character with an XML numeric character reference,
    e.g. u'\\u20ac' -> '&#8364;'.
    """
    if isinstance(exc, UnicodeEncodeError):
        replacement = u''.join([u'&#%d;' % ord(ch)
                                for ch in exc.object[exc.start:exc.end]])
        return replacement, exc.end
    raise TypeError("don't know how to handle %.400s in error callback"
                    % type(exc))
+    
def backslashreplace_errors(exc):
    """Error handler for 'backslashreplace' (encoding only).

    Replaces each unencodable character with its Python backslashed
    escape sequence (\\xNN, \\uNNNN or \\UNNNNNNNN).
    """
    if isinstance(exc, UnicodeEncodeError):
        pieces = []
        for ch in exc.object[exc.start:exc.end]:
            code = ord(ch)
            # Bug fix: the backslash must be emitted once per character;
            # the original prepended a single '\' for the whole run, so
            # multi-character spans came out as e.g. '\xffxfe'.
            if code >= 0x00010000:
                pieces.append(u'\\U%.8x' % code)
            elif code >= 0x100:
                pieces.append(u'\\u%.4x' % code)
            else:
                pieces.append(u'\\x%.2x' % code)
        return u''.join(pieces), exc.end
    else:
        raise TypeError("don't know how to handle %.400s in error callback"
                        % type(exc))
+
+register_error("strict",strict_errors)
+register_error("ignore",ignore_errors)
+register_error("replace",replace_errors)
+register_error("xmlcharrefreplace",xmlcharrefreplace_errors)
+register_error("backslashreplace",backslashreplace_errors)
\ No newline at end of file

Added: pypy/dist/pypy/lib/codecs.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/lib/codecs.py	Sat Apr 30 00:00:11 2005
@@ -0,0 +1,750 @@
+""" codecs -- Python Codec Registry, API and helpers.
+
+
+Written by Marc-Andre Lemburg (mal at lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+
+"""#"
+
+import __builtin__, sys
+
+### Registry and builtin stateless codec functions
+
+try:
+    import sys
+    if sys.path[0] != r'd:\projects\pypy_co':
+            sys.path.insert(0,r'd:\projects\pypy_co')
+    from pypy.lib import _codecs
+    reload(_codecs)
+    del _codecs
+    from pypy.lib._codecs import *
+except ImportError, why:
+    raise SystemError,\
+          'Failed to load the builtin codecs: %s' % why
+
+__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
+           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
+           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
+           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
+           "strict_errors", "ignore_errors", "replace_errors",
+           "xmlcharrefreplace_errors",
+           "register_error", "lookup_error"]
+
+### Constants
+
+#
+# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
+# and its possible byte string values
+# for UTF8/UTF16/UTF32 output and little/big endian machines
+#
+
+# UTF-8
+BOM_UTF8 = '\xef\xbb\xbf'
+
+# UTF-16, little endian
+BOM_LE = BOM_UTF16_LE = '\xff\xfe'
+
+# UTF-16, big endian
+BOM_BE = BOM_UTF16_BE = '\xfe\xff'
+
+# UTF-32, little endian
+BOM_UTF32_LE = '\xff\xfe\x00\x00'
+
+# UTF-32, big endian
+BOM_UTF32_BE = '\x00\x00\xfe\xff'
+
+# Select the native-endian marks once at import time.
+if sys.byteorder == 'little':
+
+    # UTF-16, native endianness
+    BOM = BOM_UTF16 = BOM_UTF16_LE
+
+    # UTF-32, native endianness
+    BOM_UTF32 = BOM_UTF32_LE
+
+else:
+
+    # UTF-16, native endianness
+    BOM = BOM_UTF16 = BOM_UTF16_BE
+
+    # UTF-32, native endianness
+    BOM_UTF32 = BOM_UTF32_BE
+
+# Old broken names (don't use in new code)
+# (BOM32_* alias the UTF-16 marks and BOM64_* the UTF-32 marks.)
+BOM32_LE = BOM_UTF16_LE
+BOM32_BE = BOM_UTF16_BE
+BOM64_LE = BOM_UTF32_LE
+BOM64_BE = BOM_UTF32_BE
+
+
+### Codec base classes (defining the API)
+
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument.  Predefined
        string values are:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.
    """

    def encode(self, input, errors='strict'):
        """ Encode input and return a tuple (output object, length
            consumed).

            errors selects the error handling scheme ('strict' by
            default).  Implementations must not store state on the
            Codec instance (use StreamCodec for stateful codecs) and
            must handle zero-length input by returning an empty output
            object.  Subclasses must override this method.
        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):
        """ Decode input and return a tuple (output object, length
            consumed).

            input must provide the bf_getreadbuf buffer slot (Python
            strings, buffer objects and memory mapped files do).
            errors selects the error handling scheme ('strict' by
            default).  Implementations must not store state on the
            Codec instance and must handle zero-length input by
            returning an empty output object.  Subclasses must
            override this method.
        """
        raise NotImplementedError
+
+#
+# The StreamWriter and StreamReader class provide generic working
+# interfaces which can be used to implement new encoding submodules
+# very easily. See encodings/utf_8.py for an example on how this is
+# done.
+#
+
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):
        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme; predefined values
            are 'strict' (raise a ValueError or a subclass), 'ignore',
            'replace', 'xmlcharrefreplace' and 'backslashreplace'.
            The set of allowed values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Writes the object's contents encoded to self.stream. """
        # Bug fix: removed a leftover debugging statement
        # ("print type(data)") that wrote to stdout on every call.
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):
        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):
        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.
        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)
+
+###
+
+class StreamReader(Codec):
+
+    def __init__(self, stream, errors='strict'):
+
+        """ Creates a StreamReader instance.
+
+            stream must be a file-like object open for reading
+            (binary) data.
+
+            The StreamReader may use different error handling
+            schemes by providing the errors keyword argument. These
+            parameters are predefined:
+
+             'strict' - raise a ValueError (or a subclass)
+             'ignore' - ignore the character and continue with the next
+             'replace'- replace with a suitable replacement character;
+
+            The set of allowed parameter values can be extended via
+            register_error.
+        """
+        self.stream = stream
+        self.errors = errors
+        # Bytes read from the stream but not yet decoded (the decoder
+        # may leave an incomplete multi-byte sequence for the next call).
+        self.bytebuffer = ""
+        # Characters already decoded but not yet handed to the caller.
+        self.charbuffer = u""
+        # True when the last returned chunk ended in u"\r" -- readline()
+        # uses this to join a CRLF pair that spans two read() calls.
+        self.atcr = False
+
+    def decode(self, input, errors='strict'):
+        """ Decode input and return (output object, length consumed).
+            Must be overridden by codec subclasses. """
+        raise NotImplementedError
+
+    def read(self, size=-1, chars=-1):
+
+        """ Decodes data from the stream self.stream and returns the
+            resulting object.
+
+            chars indicates the number of characters to read from the
+            stream. read() will never return more than chars
+            characters, but it might return less, if there are not enough
+            characters available.
+
+            size indicates the approximate maximum number of bytes to
+            read from the stream for decoding purposes. The decoder
+            can modify this setting as appropriate. The default value
+            -1 indicates to read and decode as much as possible.  size
+            is intended to prevent having to decode huge files in one
+            step.
+
+            The method should use a greedy read strategy meaning that
+            it should read as much data as is allowed within the
+            definition of the encoding and the given size, e.g.  if
+            optional encoding endings or state markers are available
+            on the stream, these should be read too.
+        """
+        # read until we get the required number of characters (if available)
+        while True:
+            # can the request can be satisfied from the character buffer?
+            if chars < 0:
+                if self.charbuffer:
+                    break
+            else:
+                if len(self.charbuffer) >= chars:
+                    break
+            # we need more data
+            if size < 0:
+                newdata = self.stream.read()
+            else:
+                newdata = self.stream.read(size)
+            # decode bytes (those remaining from the last call included)
+            data = self.bytebuffer + newdata
+            newchars, decodedbytes = self.decode(data, self.errors)
+            # keep undecoded bytes until the next call
+            self.bytebuffer = data[decodedbytes:]
+            # put new characters in the character buffer
+            self.charbuffer += newchars
+            # there was no data available
+            if not newdata:
+                break
+        if chars < 0:
+            # Return everything we've got
+            result = self.charbuffer
+            self.charbuffer = u""
+        else:
+            # Return the first chars characters
+            result = self.charbuffer[:chars]
+            self.charbuffer = self.charbuffer[chars:]
+        return result
+
+    def readline(self, size=None, keepends=True):
+
+        """ Read one line from the input stream and return the
+            decoded data.
+
+            size, if given, is passed as size argument to the
+            read() method.
+
+        """
+        readsize = size or 72
+        line = u""
+        # If size is given, we call read() only once
+        while True:
+            data = self.read(readsize)
+            # Drop a leading u"\n" that completes a CRLF pair split
+            # across two reads (see self.atcr above).
+            if self.atcr and data.startswith(u"\n"):
+                data = data[1:]
+            if data:
+                self.atcr = data.endswith(u"\r")
+            line += data
+            lines = line.splitlines(True)
+            if lines:
+                line0withend = lines[0]
+                line0withoutend = lines[0].splitlines(False)[0]
+                if line0withend != line0withoutend: # We really have a line end
+                    # Put the rest back together and keep it until the next call
+                    self.charbuffer = u"".join(lines[1:]) + self.charbuffer
+                    if keepends:
+                        line = line0withend
+                    else:
+                        line = line0withoutend
+                break
+            # we didn't get anything or this was our only try
+            elif not data or size is not None:
+                if line and not keepends:
+                    line = line.splitlines(False)[0]
+                break
+            # No complete line yet: grow the chunk size (bounded) so
+            # very long lines do not need many tiny reads.
+            if readsize<8000:
+                readsize *= 2
+        return line
+
+    def readlines(self, sizehint=None, keepends=True):
+
+        """ Read all lines available on the input stream
+            and return them as list of lines.
+
+            Line breaks are implemented using the codec's decoder
+            method and are included in the list entries.
+
+            sizehint, if given, is ignored since there is no efficient
+            way to finding the true end-of-line.
+
+        """
+        data = self.read()
+        return data.splitlines(keepends)
+
+    def reset(self):
+
+        """ Resets the codec buffers used for keeping state.
+
+            Note that no stream repositioning should take place.
+            This method is primarily intended to be able to recover
+            from decoding errors.
+
+        """
+        pass
+
+    def next(self):
+
+        """ Return the next decoded line from the input stream."""
+        # An empty string from readline() signals end of input.
+        line = self.readline()
+        if line:
+            return line
+        raise StopIteration
+
+    def __iter__(self):
+        return self
+
+    def __getattr__(self, name,
+                    getattr=getattr):
+
+        """ Inherit all other methods from the underlying stream.
+        """
+        return getattr(self.stream, name)
+
+###
+
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The factory functions returned by codec.lookup() can be used
        directly to construct instances.
    """
    # Optional attribute set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.  Reader and Writer
            must be factory functions or classes providing the
            StreamReader resp. StreamWriter interface.  Error handling
            is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # Reading delegates to the wrapped reader.
    def read(self, size=-1):
        return self.reader.read(size)

    def readline(self, size=None):
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        return self.reader.readlines(sizehint)

    def next(self):
        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    # Writing delegates to the wrapped writer.
    def write(self, data):
        return self.writer.write(data)

    def writelines(self, list):
        return self.writer.writelines(list)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)
+
+###
+
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend view of
        encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (dependent on the given codec combination)
        and then written to the stream using an instance of the
        provided Writer class.  In the other direction, data is read
        from the stream using a Reader instance and returned encoded
        to the caller.
    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):
        """ Creates a StreamRecoder implementing a two-way conversion.

            encode and decode work on the frontend (the input to
            .read() and output of .write()), Reader and Writer on the
            backend (reading and writing to the stream).  They can be
            used to do transparent direct recodings, e.g. from latin-1
            to utf-8 and back; Unicode is used as the intermediate
            encoding.  Error handling is done as for
            StreamWriter/Readers.
        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        raw = self.reader.read(size)
        result, _consumed = self.encode(raw, self.errors)
        return result

    def readline(self, size=None):
        if size is None:
            raw = self.reader.readline()
        else:
            raw = self.reader.readline(size)
        result, _consumed = self.encode(raw, self.errors)
        return result

    def readlines(self, sizehint=None):
        raw = self.reader.read()
        result, _consumed = self.encode(raw, self.errors)
        return result.splitlines(1)

    def next(self):
        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):
        converted, _consumed = self.decode(data, self.errors)
        return self.writer.write(converted)

    def writelines(self, list):
        converted, _consumed = self.decode(''.join(list), self.errors)
        return self.writer.write(converted)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)
+
+### Shortcuts
+
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
    """ Open an encoded file using the given mode and return a wrapped
        version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs; output is also codec dependent and usually Unicode as
        well.

        Files are always opened in binary mode, even if no binary mode
        was specified, to avoid data loss due to encodings using 8-bit
        values.  The default mode 'rb' opens the file for binary
        reading.

        encoding specifies the encoding used for the file.  errors
        defaults to 'strict', causing ValueErrors to be raised on
        encoding errors.  buffering has the same meaning as for the
        builtin open() API and defaults to line buffered.

        If an encoding was given, the returned wrapped file object
        provides an extra .encoding attribute for introspection.
    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    (e, d, sr, sw) = lookup(encoding)
    wrapped = StreamReaderWriter(stream, sr, sw, errors)
    # Add attribute to simplify introspection
    wrapped.encoding = encoding
    return wrapped
+
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to data_encoding and written to the original file using
        file_encoding; reads go the other way round.  The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        file_encoding defaults to data_encoding.  errors defaults to
        'strict', causing ValueErrors to be raised on encoding errors.

        The returned wrapped file object provides .data_encoding and
        .file_encoding attributes reflecting the given parameters, for
        introspection by Python programs.
    """
    if file_encoding is None:
        file_encoding = data_encoding
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file,
                            encode, decode, Reader, Writer,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
+
+### Helpers for codec lookup
+
def getencoder(encoding):
    """ Look up the codec for the given encoding and return its
        encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    encoder, decoder, reader, writer = lookup(encoding)
    return encoder
+
def getdecoder(encoding):
    """ Look up the codec for the given encoding and return its
        decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    encoder, decoder, reader, writer = lookup(encoding)
    return decoder
+
def getreader(encoding):
    """ Look up the codec for the given encoding and return its
        StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    encoder, decoder, reader, writer = lookup(encoding)
    return reader
+
def getwriter(encoding):
    """ Look up the codec for the given encoding and return its
        StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    encoder, decoder, reader, writer = lookup(encoding)
    return writer
+
+### Helpers for charmap-based codecs
+
def make_identity_dict(rng):
    """ make_identity_dict(rng) -> dict

        Return a dictionary mapping each element of the rng sequence
        to itself.

    """
    return dict([(elem, elem) for elem in rng])
+
def make_encoding_map(decoding_map):
    """ Create an encoding map from a decoding map.

        A target value that occurs more than once in the decoding map
        is mapped to None (undefined mapping); the charmap codec
        raises an exception when it encounters such an entry during
        translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to \u001a.

    """
    encoding_map = {}
    for key, value in decoding_map.items():
        if value in encoding_map:
            # Duplicate target: mark the mapping as undefined.
            encoding_map[value] = None
        else:
            encoding_map[value] = key
    return encoding_map
+
+### error handlers
+
+# Pre-fetch the standard error handler callbacks so they are
+# reachable as module attributes (codecs.strict_errors etc.).
+strict_errors = lookup_error("strict")
+ignore_errors = lookup_error("ignore")
+replace_errors = lookup_error("replace")
+xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
+backslashreplace_errors = lookup_error("backslashreplace")
+
+# Tell modulefinder that using codecs probably needs the encodings
+# package
+# NOTE(review): _false is set to 1, so the import below actually runs
+# at module import time; CPython's codecs.py uses 0 here so the import
+# is only ever seen by modulefinder -- confirm this is intentional.
+_false = 1
+if _false:
+    import encodings
+
+### Tests
+
+if __name__ == '__main__':
+
+    # Make stdout translate Latin-1 output into UTF-8 output
+    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
+
+    # Have stdin translate Latin-1 input into UTF-8 input
+    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')

Added: pypy/dist/pypy/lib/test2/test_codeccallbacks.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/lib/test2/test_codeccallbacks.py	Sat Apr 30 00:00:11 2005
@@ -0,0 +1,712 @@
+import test.test_support, unittest
+import sys, codecs, htmlentitydefs, unicodedata
+
class PosReturn:
    # Configurable error-handler callback: always substitutes u"<?>"
    # and resumes at a caller-controlled position (self.pos).

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        requested = self.pos
        effective = requested
        if effective < 0:
            # Negative positions count from the end of the input.
            effective = len(exc.object) + effective
        # If no progress would be made this time, jump to the end of
        # the input on the next call -- otherwise the codec would call
        # us in an endless loop.
        if effective <= exc.start:
            self.pos = len(exc.object)
        return (u"<?>", requested)
+
+class CodecCallbackTest(unittest.TestCase):
+
+    def test_xmlcharrefreplace(self):
+        # replace unencodable characters which numeric character entities.
+        # For ascii, latin-1 and charmaps this is completely implemented
+        # in C and should be reasonably fast.
+        s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
+        self.assertEqual(
+            s.encode("ascii", "xmlcharrefreplace"),
+            "&#12473;&#12497;&#12514; &#228;nd eggs"
+        )
+        self.assertEqual(
+            s.encode("latin-1", "xmlcharrefreplace"),
+            "&#12473;&#12497;&#12514; \xe4nd eggs"
+        )
+
+    def test_xmlcharnamereplace(self):
+        # This time use a named character entity for unencodable
+        # characters, if one is available.
+
+        def xmlcharnamereplace(exc):
+            if not isinstance(exc, UnicodeEncodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            l = []
+            for c in exc.object[exc.start:exc.end]:
+                try:
+                    l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)])
+                except KeyError:
+                    l.append(u"&#%d;" % ord(c))
+            return (u"".join(l), exc.end)
+
+        codecs.register_error(
+            "test.xmlcharnamereplace", xmlcharnamereplace)
+
+        sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
+        sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
+        self.assertEqual(codecs.encode(sin,"ascii", "test.xmlcharnamereplace"), sout)
+        sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
+        self.assertEqual(codecs.encode(sin,"latin-1", "test.xmlcharnamereplace"), sout)
+        sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
+        self.assertEqual(codecs.encode(sin,"iso-8859-15", "test.xmlcharnamereplace"), sout)
+
+    def test_uninamereplace(self):
+        # We're using the names from the unicode database this time,
+        # and we're doing "syntax highlighting" here, i.e. we include
+        # the replaced text in ANSI escape sequences. For this it is
+        # useful that the error handler is not called for every single
+        # unencodable character, but for a complete sequence of
+        # unencodable characters, otherwise we would output many
+        # unneccessary escape sequences.
+
+        def uninamereplace(exc):
+            if not isinstance(exc, UnicodeEncodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            l = []
+            for c in exc.object[exc.start:exc.end]:
+                l.append(unicodedata.name(c, u"0x%x" % ord(c)))
+            return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)
+
+        codecs.register_error(
+            "test.uninamereplace", uninamereplace)
+
+        sin = u"\xac\u1234\u20ac\u8000"
+        sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
+        self.assertEqual(codecs.encode(sin,"ascii", "test.uninamereplace"), sout)
+
+        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
+        self.assertEqual(codecs.encode(sin,"latin-1", "test.uninamereplace"), sout)
+
+        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
+        self.assertEqual(codecs.encode(sin,"iso-8859-15", "test.uninamereplace"), sout)
+
+    def test_backslashescape(self):
+        # Does the same as the "unicode-escape" encoding, but with different
+        # base encodings.
+        sin = u"a\xac\u1234\u20ac\u8000"
+        if sys.maxunicode > 0xffff:
+            sin += unichr(sys.maxunicode)
+        sout = "a\\xac\\u1234\\u20ac\\u8000"
+        if sys.maxunicode > 0xffff:
+            sout += "\\U%08x" % sys.maxunicode
+        self.assertEqual(codecs.encode(sin,"ascii", "backslashreplace"), sout)
+
+        sout = "a\xac\\u1234\\u20ac\\u8000"
+        if sys.maxunicode > 0xffff:
+            sout += "\\U%08x" % sys.maxunicode
+        self.assertEqual(codecs.encode(sin,"latin-1", "backslashreplace"), sout)
+
+        sout = "a\xac\\u1234\xa4\\u8000"
+        if sys.maxunicode > 0xffff:
+            sout += "\\U%08x" % sys.maxunicode
+        self.assertEqual(codecs.encode(sin,"iso-8859-15", "backslashreplace"), sout)
+
+    def test_relaxedutf8(self):
+        # This is the test for a decoding callback handler,
+        # that relaxes the UTF-8 minimal encoding restriction.
+        # A null byte that is encoded as "\xc0\x80" will be
+        # decoded as a null byte. All other illegal sequences
+        # will be handled strictly.
+        def relaxedutf8(exc):
+            if not isinstance(exc, UnicodeDecodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
+                return (u"\x00", exc.start+2) # retry after two bytes
+            else:
+                raise exc
+
+        codecs.register_error(
+            "test.relaxedutf8", relaxedutf8)
+
+        sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
+        sout = u"a\x00b\x00c\xfc\x00\x00"
+        self.assertEqual(codecs.decode(sin,"utf-8", "test.relaxedutf8"), sout)
+        sin = "\xc0\x80\xc0\x81"
+        self.assertRaises(UnicodeError, codecs.decode,sin, "utf-8", "test.relaxedutf8")
+
+    def test_charmapencode(self):
+        # For charmap encodings the replacement string will be
+        # mapped through the encoding again. This means, that
+        # to be able to use e.g. the "replace" handler, the
+        # charmap has to have a mapping for "?".
+        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
+        sin = u"abc"
+        sout = "AABBCC"
+        self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
+
+        sin = u"abcA"
+        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
+
+        charmap[ord("?")] = "XYZ"
+        sin = u"abcDEF"
+        sout = "AABBCCXYZXYZXYZ"
+        self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
+
+        charmap[ord("?")] = u"XYZ"
+        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
+
+        charmap[ord("?")] = u"XYZ"
+        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
+
+    def test_callbacks(self):
+        def handler1(exc):
+            if not isinstance(exc, UnicodeEncodeError) \
+               and not isinstance(exc, UnicodeDecodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
+            return (u"[%s]" % u"".join(l), exc.end)
+
+        codecs.register_error("test.handler1", handler1)
+
+        def handler2(exc):
+            if not isinstance(exc, UnicodeDecodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
+            return (u"[%s]" % u"".join(l), exc.end+1) # skip one character
+
+        codecs.register_error("test.handler2", handler2)
+
+        s = "\x00\x81\x7f\x80\xff"
+
+        self.assertEqual(
+            codecs.decode(s,"ascii", "test.handler1"),
+            u"\x00[<129>]\x7f[<128>][<255>]"
+        )
+        self.assertEqual(
+            codecs.decode(s,"ascii", "test.handler2"),
+            u"\x00[<129>][<128>]"
+        )
+
+        self.assertEqual(
+            codecs.decode("\\u3042\u3xxx","unicode-escape", "test.handler1"),
+            u"\u3042[<92><117><51><120>]xx"
+        )
+
+        self.assertEqual(
+            codecs.decode("\\u3042\u3xx","unicode-escape", "test.handler1"),
+            u"\u3042[<92><117><51><120><120>]"
+        )
+
+        self.assertEqual(
+            codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
+            u"z[<98>][<99>]"
+        )
+
+        self.assertEqual(
+            codecs.encode(u"g\xfc\xdfrk","ascii", "test.handler1"),
+            u"g[<252><223>]rk"
+        )
+
+        self.assertEqual(
+            codecs.encode(u"g\xfc\xdf","ascii", "test.handler1"),
+            u"g[<252><223>]"
+        )
+
+    def test_longstrings(self):
+        # test long strings to check for memory overflow problems
+        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
+        # register the handlers under different names,
+        # to prevent the codec from recognizing the name
+        for err in errors:
+            codecs.register_error("test." + err, codecs.lookup_error(err))
+        l = 1000
+        errors += [ "test." + err for err in errors ]
+        for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
+            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
+                for err in errors:
+                    try:
+                        codecs.encode(uni,enc, err)
+                    except UnicodeError:
+                        pass
+
+    def check_exceptionobjectargs(self, exctype, args, msg):
+        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
+        # check with one missing argument
+        self.assertRaises(TypeError, exctype, *args[:-1])
+        # check with one argument too much
+        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
+        # check with one argument of the wrong type
+        wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
+        for i in xrange(len(args)):
+            for wrongarg in wrongargs:
+                if type(wrongarg) is type(args[i]):
+                    continue
+                # build argument array
+                callargs = []
+                for j in xrange(len(args)):
+                    if i==j:
+                        callargs.append(wrongarg)
+                    else:
+                        callargs.append(args[i])
+                self.assertRaises(TypeError, exctype, *callargs)
+
+        # check with the correct number and type of arguments
+        exc = exctype(*args)
+        self.assertEquals(str(exc), msg)
+
+    def test_unicodeencodeerror(self):
+        self.check_exceptionobjectargs(
+            UnicodeEncodeError,
+            ["ascii", u"g\xfcrk", 1, 2, "ouch"],
+            "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
+        )
+        self.check_exceptionobjectargs(
+            UnicodeEncodeError,
+            ["ascii", u"g\xfcrk", 1, 4, "ouch"],
+            "'ascii' codec can't encode characters in position 1-3: ouch"
+        )
+        self.check_exceptionobjectargs(
+            UnicodeEncodeError,
+            ["ascii", u"\xfcx", 0, 1, "ouch"],
+            "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
+        )
+        self.check_exceptionobjectargs(
+            UnicodeEncodeError,
+            ["ascii", u"\u0100x", 0, 1, "ouch"],
+            "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
+        )
+        self.check_exceptionobjectargs(
+            UnicodeEncodeError,
+            ["ascii", u"\uffffx", 0, 1, "ouch"],
+            "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
+        )
+        if sys.maxunicode > 0xffff:
+            self.check_exceptionobjectargs(
+                UnicodeEncodeError,
+                ["ascii", u"\U00010000x", 0, 1, "ouch"],
+                "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
+            )
+
+    def test_unicodedecodeerror(self):
+        self.check_exceptionobjectargs(
+            UnicodeDecodeError,
+            ["ascii", "g\xfcrk", 1, 2, "ouch"],
+            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
+        )
+        self.check_exceptionobjectargs(
+            UnicodeDecodeError,
+            ["ascii", "g\xfcrk", 1, 3, "ouch"],
+            "'ascii' codec can't decode bytes in position 1-2: ouch"
+        )
+
+    def test_unicodetranslateerror(self):
+        self.check_exceptionobjectargs(
+            UnicodeTranslateError,
+            [u"g\xfcrk", 1, 2, "ouch"],
+            "can't translate character u'\\xfc' in position 1: ouch"
+        )
+        self.check_exceptionobjectargs(
+            UnicodeTranslateError,
+            [u"g\u0100rk", 1, 2, "ouch"],
+            "can't translate character u'\\u0100' in position 1: ouch"
+        )
+        self.check_exceptionobjectargs(
+            UnicodeTranslateError,
+            [u"g\uffffrk", 1, 2, "ouch"],
+            "can't translate character u'\\uffff' in position 1: ouch"
+        )
+        if sys.maxunicode > 0xffff:
+            self.check_exceptionobjectargs(
+                UnicodeTranslateError,
+                [u"g\U00010000rk", 1, 2, "ouch"],
+                "can't translate character u'\\U00010000' in position 1: ouch"
+            )
+        self.check_exceptionobjectargs(
+            UnicodeTranslateError,
+            [u"g\xfcrk", 1, 3, "ouch"],
+            "can't translate characters in position 1-2: ouch"
+        )
+
+    def test_badandgoodstrictexceptions(self):
+        # "strict" complains about a non-exception passed in
+        self.assertRaises(
+            TypeError,
+            codecs.strict_errors,
+            42
+        )
+        # "strict" complains about the wrong exception type
+        self.assertRaises(
+            Exception,
+            codecs.strict_errors,
+            Exception("ouch")
+        )
+
+        # If the correct exception is passed in, "strict" raises it
+        self.assertRaises(
+            UnicodeEncodeError,
+            codecs.strict_errors,
+            UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
+        )
+
+    def test_badandgoodignoreexceptions(self):
+        # "ignore" complains about a non-exception passed in
+        self.assertRaises(
+           TypeError,
+           codecs.ignore_errors,
+           42
+        )
+        # "ignore" complains about the wrong exception type
+        self.assertRaises(
+           TypeError,
+           codecs.ignore_errors,
+           UnicodeError("ouch")
+        )
+        # If the correct exception is passed in, "ignore" returns an empty replacement
+        self.assertEquals(
+            codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+            (u"", 1)
+        )
+        self.assertEquals(
+            codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
+            (u"", 1)
+        )
+        self.assertEquals(
+            codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
+            (u"", 1)
+        )
+
+    def test_badandgoodreplaceexceptions(self):
+        # "replace" complains about a non-exception passed in
+        self.assertRaises(
+           TypeError,
+           codecs.replace_errors,
+           42
+        )
+        # "replace" complains about the wrong exception type
+        self.assertRaises(
+           TypeError,
+           codecs.replace_errors,
+           UnicodeError("ouch")
+        )
+        # With the correct exception, "ignore" returns an empty replacement
+        self.assertEquals(
+            codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+            (u"?", 1)
+        )
+        self.assertEquals(
+            codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
+            (u"\ufffd", 1)
+        )
+        self.assertEquals(
+            codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
+            (u"\ufffd", 1)
+        )
+
+    def test_badandgoodxmlcharrefreplaceexceptions(self):
+        # "xmlcharrefreplace" complains about a non-exception passed in
+        self.assertRaises(
+           TypeError,
+           codecs.xmlcharrefreplace_errors,
+           42
+        )
+        # "xmlcharrefreplace" complains about the wrong exception types
+        self.assertRaises(
+           TypeError,
+           codecs.xmlcharrefreplace_errors,
+           UnicodeError("ouch")
+        )
+        # "xmlcharrefreplace" can only be used for encoding
+        self.assertRaises(
+            TypeError,
+            codecs.xmlcharrefreplace_errors,
+            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
+        )
+        self.assertRaises(
+            TypeError,
+            codecs.xmlcharrefreplace_errors,
+            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
+        )
+        # Use the correct exception
+        self.assertEquals(
+            codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+            (u"&#%d;" % 0x3042, 1)
+        )
+
+    def test_badandgoodbackslashreplaceexceptions(self):
+        # "backslashreplace" complains about a non-exception passed in
+        self.assertRaises(
+           TypeError,
+           codecs.backslashreplace_errors,
+           42
+        )
+        # "backslashreplace" complains about the wrong exception types
+        self.assertRaises(
+           TypeError,
+           codecs.backslashreplace_errors,
+           UnicodeError("ouch")
+        )
+        # "backslashreplace" can only be used for encoding
+        self.assertRaises(
+            TypeError,
+            codecs.backslashreplace_errors,
+            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
+        )
+        self.assertRaises(
+            TypeError,
+            codecs.backslashreplace_errors,
+            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
+        )
+        # Use the correct exception
+        self.assertEquals(
+            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+            (u"\\u3042", 1)
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")),
+            (u"\\x00", 1)
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")),
+            (u"\\xff", 1)
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")),
+            (u"\\u0100", 1)
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")),
+            (u"\\uffff", 1)
+        )
+        if sys.maxunicode>0xffff:
+            self.assertEquals(
+                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")),
+                (u"\\U00010000", 1)
+            )
+            self.assertEquals(
+                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")),
+                (u"\\U0010ffff", 1)
+            )
+
+    def test_badhandlerresults(self):
+        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
+        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
+
+        for res in results:
+            codecs.register_error("test.badhandler", lambda: res)
+            for enc in encs:
+                self.assertRaises(
+                    TypeError,
+                    codecs.encode,
+                    u"\u3042",
+                    enc,
+                    "test.badhandler"
+                )
+            for (enc, bytes) in (
+                ("ascii", "\xff"),
+                ("utf-8", "\xff"),
+                ("utf-7", "+x-")
+            ):
+                self.assertRaises(
+                    TypeError,
+                    codecs.decode,
+                    bytes,
+                    enc,
+                    "test.badhandler"
+                )
+
+    def test_lookup(self):
+        self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
+        self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore"))
+        self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
+        self.assertEquals(
+            codecs.xmlcharrefreplace_errors,
+            codecs.lookup_error("xmlcharrefreplace")
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors,
+            codecs.lookup_error("backslashreplace")
+        )
+
+    def test_unencodablereplacement(self):
+        def unencrepl(exc):
+            if isinstance(exc, UnicodeEncodeError):
+                return (u"\u4242", exc.end)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.unencreplhandler", unencrepl)
+        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
+            self.assertRaises(
+                UnicodeEncodeError,
+                codecs.encode,
+                u"\u4242",
+                enc,
+                "test.unencreplhandler"
+            )
+
+    def test_badregistercall(self):
+        # enhance coverage of:
+        # Modules/_codecsmodule.c::register_error()
+        # Python/codecs.c::PyCodec_RegisterError()
+        self.assertRaises(TypeError, codecs.register_error, 42)
+        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
+
+    def test_unknownhandler(self):
+        # enhance coverage of:
+        # Modules/_codecsmodule.c::lookup_error()
+        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
+
+    def test_xmlcharrefvalues(self):
+        # enhance coverage of:
+        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
+        # and inline implementations
+        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000)
+        if sys.maxunicode>=100000:
+            v += (100000, 500000, 1000000)
+        s = u"".join([unichr(x) for x in v])
+        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
+        for enc in ("ascii", "iso-8859-15"):
+            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
+                codecs.encode(s,enc, err)
+
+    def test_decodehelper(self):
+        # enhance coverage of:
+        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
+        # and callers
+        self.assertRaises(LookupError, codecs.decode,"\xff", "ascii", "test.unknown")
+
+        def baddecodereturn1(exc):
+            return 42
+        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
+        self.assertRaises(TypeError, codecs.decode, "\xff", "ascii", "test.baddecodereturn1")
+        self.assertRaises(TypeError, codecs.decode, "\\", "unicode-escape", "test.baddecodereturn1")
+        self.assertRaises(TypeError, codecs.decode, "\\x0", "unicode-escape", "test.baddecodereturn1")
+        self.assertRaises(TypeError, codecs.decode, "\\x0y", "unicode-escape", "test.baddecodereturn1")
+        self.assertRaises(TypeError, codecs.decode, "\\Uffffeeee", "unicode-escape", "test.baddecodereturn1")
+        self.assertRaises(TypeError, codecs.decode, "\\uyyyy", "raw-unicode-escape", "test.baddecodereturn1")
+
+        def baddecodereturn2(exc):
+            return (u"?", None)
+        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
+        self.assertRaises(TypeError, codecs.decode, "\xff", "ascii", "test.baddecodereturn2")
+
+        handler = PosReturn()
+        codecs.register_error("test.posreturn", handler.handle)
+
+        # Valid negative position
+        handler.pos = -1
+        self.assertEquals(codecs.decode( "\xff0","ascii", "test.posreturn"), u"<?>0")
+
+        # Valid negative position
+        handler.pos = -2
+        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?><?>")
+
+        # Negative position out of bounds
+        handler.pos = -3
+        self.assertRaises(IndexError, codecs.decode,"\xff0", "ascii", "test.posreturn")
+
+        # Valid positive position
+        handler.pos = 1
+        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?>0")
+
+        # Largest valid positive position (one beyond end of input
+        handler.pos = 2
+        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?>")
+
+        # Invalid positive position
+        handler.pos = 3
+        self.assertRaises(IndexError, codecs.decode,"\xff0", "ascii", "test.posreturn")
+
+        # Restart at the "0"
+        handler.pos = 6
+        self.assertEquals(codecs.decode("\\uyyyy0","raw-unicode-escape", "test.posreturn"), u"<?>0")
+
+        class D(dict):
+            def __getitem__(self, key):
+                raise ValueError
+        self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None})
+        self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D())
+        self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: sys.maxunicode+1})
+
+    def test_encodehelper(self):
+        # enhance coverage of:
+        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
+        # and callers
+        self.assertRaises(LookupError, codecs.decode,u"\xff", "ascii", "test.unknown")
+
+        def badencodereturn1(exc):
+            return 42
+        codecs.register_error("test.badencodereturn1", badencodereturn1)
+        self.assertRaises(TypeError, codecs.decode, u"\xff", "ascii", "test.badencodereturn1")
+
+        def badencodereturn2(exc):
+            return (u"?", None)
+        codecs.register_error("test.badencodereturn2", badencodereturn2)
+        self.assertRaises(TypeError, codecs.decode,u"\xff", "ascii", "test.badencodereturn2")
+
+        handler = PosReturn()
+        codecs.register_error("test.posreturn", handler.handle)
+
+        # Valid negative position
+        handler.pos = -1
+        self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
+
+        # Valid negative position
+        handler.pos = -2
+        self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
+
+        # Negative position out of bounds
+        handler.pos = -3
+        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
+
+        # Valid positive position
+        handler.pos = 1
+        self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
+
+        # Largest valid positive position (one beyond end of input
+        handler.pos = 2
+        self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>")
+
+        # Invalid positive position
+        handler.pos = 3
+        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
+
+        handler.pos = 0
+
+        class D(dict):
+            def __getitem__(self, key):
+                raise ValueError
+        for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
+            self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
+            self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
+            self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
+
+    def test_translatehelper(self):
+        # enhance coverage of:
+        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
+        # and callers
+        # (Unfortunately the errors argument is not directly accessible
+        # from Python, so we can't test that much)
+        class D(dict):
+            def __getitem__(self, key):
+                raise ValueError
+        self.assertRaises(ValueError, u"\xff".translate, D())
+        self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1})
+        self.assertRaises(TypeError, u"\xff".translate, {0xff: ()})
+
+    def test_bug828737(self):
+        charmap = {
+            ord("&"): u"&amp;",
+            ord("<"): u"&lt;",
+            ord(">"): u"&gt;",
+            ord('"'): u"&quot;",
+        }
+        
+        for n in (1, 10, 100, 1000):
+            text = u'abc<def>ghi'*n
+            text.translate(charmap)
+
def test_main():
    """Entry point used by the regression-test driver."""
    test.test_support.run_unittest(CodecCallbackTest)

if __name__ == "__main__":
    test_main()

Added: pypy/dist/pypy/lib/unicodecodec.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/lib/unicodecodec.py	Sat Apr 30 00:00:11 2005
@@ -0,0 +1,931 @@
+
+## indicate whether a UTF-7 character is special i.e. cannot be directly
+##       encoded:
+##	   0 - not special
+##	   1 - special
+##	   2 - whitespace (optional)
+##	   3 - RFC2152 Set O (optional)
+    
# Classification table for the 128 ASCII code points (legend in the comment
# block above): 0 = encode directly, 1 = special, 2 = optional whitespace,
# 3 = RFC 2152 Set O (optional).
utf7_special = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
]
# Cache of shared single-character unicode objects for ordinals 0-255,
# filled lazily by PyUnicode_FromUnicode().
unicode_latin1 = [None] * 256
+    
def PyUnicode_Check(op):
    """Return True for unicode instances, including subclasses.

    Mirrors CPython's PyUnicode_Check() macro, which accepts subtypes.
    The original ``type(op) == unicode`` made this identical to
    PyUnicode_CheckExact() below, so the subtype branch in
    PyUnicode_FromObject() could never be taken.
    """
    return isinstance(op, unicode)
def PyUnicode_CheckExact(op):
    """Return True only when *op* is exactly of type unicode (no subclasses)."""
    return unicode == type(op)
+
+
def PyUnicode_GET_SIZE(op):
    """Return the length in characters of *op* coerced to unicode."""
    return len(unicode(op))
def PyUnicode_GET_DATA_SIZE(op):
    """Return the data size in bytes: character count times the
    platform Py_UNICODE width (here approximated by len(u' '))."""
    return len(u' ') * len(unicode(op))
def PyUnicode_AS_UNICODE(op):
    """Return *op* as a unicode object (counterpart of the C macro).

    Fixed: the original computed unicode(op) but dropped the result,
    always returning None, although callers such as
    PyUnicode_FromObject() rely on the value.
    """
    return unicode(op)
def PyUnicode_AS_DATA(op):
    """Return a read-only buffer over the unicode data of *op*.

    Fixed: the original built the buffer but never returned it,
    always yielding None.
    """
    # XXX buffer() only gives a read-only view.
    return buffer(unicode(op))
+
def SPECIAL(c, encodeO, encodeWS):
    """True when character *c* cannot appear literally in UTF-7 output.

    Non-ASCII and class-1 bytes are always special; class 2 (whitespace)
    and class 3 (RFC 2152 Set O) only when the corresponding flag asks
    for them to be encoded.
    """
    code = ord(c)
    if code > 127 or utf7_special[code] == 1:
        return True
    if encodeWS and utf7_special[code] == 2:
        return True
    return bool(encodeO and utf7_special[code] == 3)
def B64(n):
    """Map the low six bits of *n* to the RFC 2152 base-64 alphabet."""
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    return alphabet[n & 0x3f]
def B64CHAR(c):
    """True if *c* is a valid modified-base-64 character (RFC 2152).

    Fixed NameError: the original called a nonexistent global
    ``isalnum(c)``; the string method ``c.isalnum()`` is intended.
    """
    return c.isalnum() or c == '+' or c == '/'
def UB64(c):
    """Inverse of B64: map a base-64 character back to its 6-bit value."""
    if c == '+':
        return 62
    if c == '/':
        return 63
    if c >= 'a':
        return ord(c) - 71     # 'a'..'z' -> 26..51
    if c >= 'A':
        return ord(c) - 65     # 'A'..'Z' -> 0..25
    return ord(c) + 4          # '0'..'9' -> 52..61
+
def ENCODE(out, ch, bits):
    """Drain full 6-bit groups from the shift buffer *ch* into *out*.

    Returns (out, ch, bits).  Fixed: the original returned
    ``''.join(out)`` -- a str -- although every caller immediately
    calls .append() on the first return value; the accumulator must
    stay a list.
    """
    while bits >= 6:
        out.append(B64(ch >> (bits - 6)))
        bits -= 6
    return out, ch, bits
+
def DECODE(out, ch, bits, surrogate):
    """Drain full 16-bit code units from the shift buffer *ch* into *out*.

    Returns (out, ch, bits, surrogate).  Fixes relative to the original:
    * *ch* is the integer bit buffer, so no chr()/ord() round-trip --
      the original built a one-character string and then compared it to
      integers, which can never be true;
    * the surrogate-range test is applied to the integer code unit;
    * the accumulator is returned as a list, not ''.join()-ed, because
      callers keep appending to the first return value;
    * normalized the original's mixed tab/space indentation.
    """
    while bits >= 16:
        code_unit = (ch >> (bits - 16)) & 0xffff
        bits -= 16
        if surrogate:
            # We have already generated an error for the high surrogate,
            # so don't bother checking the low surrogate.
            surrogate = 0
        elif 0xDC00 <= code_unit <= 0xDFFF:
            # This is a surrogate pair; we can't represent it in a
            # 16-bit character.
            surrogate = 1
            raise UnicodeDecodeError("code pairs are not supported")
        else:
            out.append(unichr(code_unit))
    return out, ch, bits, surrogate
+
def PyUnicode_DecodeUTF7(s, size, errors):
    # Decode a UTF-7 byte string *s* of length *size* to unicode.
    # NOTE(review): several latent problems are flagged inline below; the
    # common (non-error) paths work, but error paths reference undefined
    # names.

    starts = s
    errmsg = ""
    inShift = 0
    bitsleft = 0
    charsleft = 0
    surrogate = 0
    p = []
    errorHandler = None
    exc = None

    if (size == 0):
        return unicode('')
    i = 0
    while i < size:
        
        ch = s[i]
        if (inShift):
            if ((ch == '-') or not B64CHAR(ch)):
                inShift = 0
                i += 1
                # NOTE(review): DECODE returns ''.join(out) (a str) for its
                # first value, so the later p.append() calls would raise
                # AttributeError -- confirm DECODE's intended return type.
                p, charsleft, bitsleft, surrogate =  DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6):
##                    /* The shift sequence has a partial character in it. If
##                       bitsleft < 6 then we could just classify it as padding
##                       but that is not the case here */

                    raise UnicodeDecodeError, "partial character in shift sequence"
##                /* According to RFC2152 the remaining bits should be zero. We
##                   choose to signal an error/insert a replacement character
##                   here so indicate the potential of a misencoded character. */

##                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                # NOTE(review): sizeof() is not defined in Python -- this
                # padding check raises NameError whenever it is reached.
                if (bitsleft and (charsleft << (sizeof(charsleft) * 8 - bitsleft))):
                    raise UnicodeDecodeError, "non-zero padding bits in shift sequence"
                if (ch == '-') :
                    if ((i < size) and (s[i] == '-')) :
                        p.append( '-')
                        inShift = 1
                    
                elif SPECIAL(ch,0,0) :
                    raise  UnicodeDecodeError,"unexpected special character"

                else:  
                    p.append( ch )
            else:
                # Accumulate six more bits into the shift buffer.
                charsleft = (charsleft << 6) | UB64(ch)
                bitsleft += 6
                i+=1
##                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
        elif ( ch == '+' ):
            startinpos = i
            i+=1
            if (i<size and s[i] == '-'):
                # "+-" is the escaped literal plus sign.
                i+=1
                p.append( '+')
            else:
                inShift = 1
                bitsleft = 0
                
        elif (SPECIAL(ch,0,0)):
            i+=1
            raise UnicodeDecodeError,"unexpected special character"
        else:
            p.append( ch )
            i+=1

    if (inShift) :
        # NOTE(review): subtracting a function result from a list and
        # passing the *type* unicode cannot work -- this leftover C
        # bookkeeping raises TypeError before the intended error below.
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        raise UnicodeDecodeError, "unterminated shift sequence"
        
    return unicode(''.join(p))
+
def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors):
    """Encode unicode string *s* (length *size*) as a UTF-7 byte string.

    encodeSetO / encodeWhiteSpace control whether RFC 2152 Set O and
    optional whitespace characters are shifted-encoded.

    Fixes relative to the original:
    * the shift buffer must hold integer ordinals -- ``charsleft = ord(ch)``
      and ``| ord(ch)`` (the original stored/or-ed the character object
      itself, raising TypeError as soon as ENCODE shifted it);
    * the first return value of ENCODE is re-listed so this function
      works whether ENCODE returns the list or a joined string.
    """
    inShift = 0
    i = 0
    bitsleft = 0
    charsleft = 0
    out = []
    for ch in s:
        if not inShift:
            if ch == '+':
                # Literal '+' is written as the escaped pair "+-".
                out.append('+')
                out.append('-')
            elif SPECIAL(ch, encodeSetO, encodeWhiteSpace):
                # Open a shift sequence with this character's 16 bits.
                charsleft = ord(ch)
                bitsleft = 16
                out.append('+')
                out, charsleft, bitsleft = ENCODE(out, charsleft, bitsleft)
                out = list(out)
                inShift = bitsleft > 0
            else:
                out.append(ch)
        else:
            if not SPECIAL(ch, encodeSetO, encodeWhiteSpace):
                # Flush any pending bits and close the shift sequence.
                out.append(B64(charsleft << (6 - bitsleft)))
                charsleft = 0
                bitsleft = 0
                # Characters not in the BASE64 set implicitly unshift the
                # sequence, so no '-' is required, except if the character
                # is itself a '-' (or could be mistaken for sequence data).
                if B64CHAR(ch) or ch == '-':
                    out.append('-')
                inShift = 0
                out.append(ch)
            else:
                bitsleft += 16
                charsleft = (charsleft << 16) | ord(ch)
                out, charsleft, bitsleft = ENCODE(out, charsleft, bitsleft)
                out = list(out)
                # If the next character is special we don't need to
                # terminate the shift sequence.  If it is not a BASE64
                # character or '-', the sequence terminates implicitly and
                # no '-' is needed either.
                if bitsleft == 0:
                    if i + 1 < size:
                        ch2 = s[i+1]
                        if SPECIAL(ch2, encodeSetO, encodeWhiteSpace):
                            pass
                        elif B64CHAR(ch2) or ch2 == '-':
                            out.append('-')
                            inShift = 0
                        else:
                            inShift = 0
                    else:
                        out.append('-')
                        inShift = 0
        i += 1

    if bitsleft:
        # Flush the final partial group and terminate the sequence.
        out.append(B64(charsleft << (6 - bitsleft)))
        out.append('-')

    return ''.join(out)
+
def PyUnicode_FromOrdinal(ordinal):
    """Return (one_character_unicode_string, 1) for *ordinal*.

    Raises ValueError when the ordinal falls outside the full Unicode
    range (wide-build semantics).
    """
    if not 0 <= ordinal <= 0x10ffff:
        raise ValueError("unichr() arg not in range(0x110000) (wide Python build)")
    return unichr(ordinal), 1
+
def PyUnicode_FromObject(obj):
    # Coerce *obj* to a unicode object (counterpart of the C API function).

##    /* XXX Perhaps we should make this API an alias of
##           PyObject_Unicode() instead ?! */
    if (PyUnicode_CheckExact(obj)):
        # Exact unicode objects are returned unchanged.
        return obj
    
    if (PyUnicode_Check(obj)):
##	/* For a Unicode subtype that's not a Unicode object,
##	   return a true Unicode object with the same data. */
        # NOTE(review): with PyUnicode_Check defined as an exact type test
        # (as written above) this branch is unreachable -- it only fires if
        # Check accepts subclasses; confirm the intended semantics.
        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),PyUnicode_GET_SIZE(obj))
    # Anything else is decoded with the default encoding, strict errors.
    return PyUnicode_FromEncodedObject(obj, None, "strict")
+
# Shared singleton for the empty unicode string (cf. CPython's unicode_empty).
unicode_empty=u''
+
def PyUnicode_FromUnicode(u, size):
    """Return a unicode object for the character data *u* of length *size*.

    Mirrors CPython's constructor-level sharing: the empty-string
    singleton and single characters in the Latin-1 range are cached
    (in unicode_latin1) and reused.

    Fixes relative to the original: the mixed tab/space indentation
    (a syntax error), the dead duplicated ``if not result`` re-check,
    and the implicit fall-through for a false *u* (now an explicit
    ``return None``).
    """
    if u:
        # Optimization for empty strings -- share the singleton.
        if size == 0 and unicode_empty is not None:
            return unicode_empty
        # Single-character objects in the Latin-1 range are shared.
        if size == 1 and ord(u) < 256:
            result = unicode_latin1[ord(u)]
            if not result:
                result = unicode(u)
                unicode_latin1[ord(u)] = result
            return result
        return unicode(u)
    # NOTE(review): a false *u* (e.g. u'') yields None, matching the
    # original control flow; the C version tested u != NULL instead --
    # TODO confirm which behavior callers expect.
    return None
+    
def PyUnicode_Decode(s,size,encoding,errors):
    # Decode *s* using *encoding*: try the fast-path decoder table first,
    # then fall back to the codec registry.

    if (encoding == None):
        # NOTE(review): PyUnicode_GetDefaultEncoding is not defined in this
        # module -- presumably provided elsewhere; TODO confirm.
        encoding = PyUnicode_GetDefaultEncoding()

##    /* Shortcuts for common default encodings */
    decoder = encodings.get(encoding,None)
    if decoder:
        # NOTE(review): the table decoders take (s, size, errors) but this
        # call passes (s, encoding, errors) -- looks like an argument
        # mismatch; confirm against the decoder signatures below.
        return decoder(s,encoding,errors)
##    /* Decode via the codec registry */
    buf = buffer(s)
    # NOTE(review): PyCodec_Decode is also undefined in this module; the
    # registry fallback cannot currently be reached successfully.
    result = PyCodec_Decode(buf, encoding, errors);
    if (not PyUnicode_Check(result)):
        raise UnicodeDecodeError, "decoder did not return an unicode object (type=%.400s)"%type(result)
    return result
+
def PyUnicode_FromEncodedObject(obj, encoding, errors):
    """Coerce *obj* to a byte string and decode it with *encoding*/*errors*."""
    raw = str(obj)
    return PyUnicode_Decode(raw, len(raw), encoding, errors)
+
def PyUnicode_DecodeASCII(s, size, errors):
    """Decode an ASCII byte string of length *size*.

    Fixes relative to the original:
    * the UnicodeDecodeError for a non-ASCII byte was constructed but
      never raised, so bad bytes were silently dropped;
    * ``s.index(c)`` reported the first occurrence of the byte value
      rather than the actual error position; use the loop position.

    NOTE: the 2-argument UnicodeDecodeError signature follows this
    package's faked exception classes (see module head), not CPython's
    5-argument constructor -- TODO confirm.
    """
    # ASCII is equivalent to the first 128 ordinals in Unicode.
    if size == 1 and ord(s) < 128:
        return PyUnicode_FromUnicode(unicode(s), 1)
    if size == 0:
        return unicode('')
    p = []
    for pos, c in enumerate(s):
        if ord(c) < 128:
            p.append(c)
        else:
            raise UnicodeDecodeError("ordinal not in range(128)", pos)
    return ''.join(p)
+
def PyUnicode_EncodeASCII(p, size, errors):
    """Encode via the shared UCS-1 encoder, limited to 128 ordinals."""
    limit = 128
    return unicode_encode_ucs1(p, size, errors, limit)
+
def PyUnicode_AsASCIIString(unistr):
    """Encode *unistr* (which must be a unicode object) to an ASCII string.

    Fixes relative to the original:
    * raise TypeError instead of the undefined (and misspelled)
      BadArgumnentError;
    * pass ``len(unistr)`` -- the original passed ``len(unicode)``, i.e.
      the length of the *type object*, which always raises TypeError.
    """
    if not type(unistr) == unicode:
        raise TypeError("expected a unicode object")
    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unistr),
                                 len(unistr),
                                 None)
+
+##def PyUnicode_DecodeUTF16Stateful(s,size,errors,byteorder,consumed):
+##
+##    bo = 0;       /* assume native ordering by default */
+##    errmsg = "";
+##    /* Offsets from q for retrieving byte pairs in the right order. */
+###ifdef BYTEORDER_IS_LITTLE_ENDIAN
+##    int ihi = 1, ilo = 0;
+###else
+##    int ihi = 0, ilo = 1;
+###endif
+##    PyObject *errorHandler = NULL;
+##    PyObject *exc = NULL;
+##
+##    /* Note: size will always be longer than the resulting Unicode
+##       character count */
+##    unicode = _PyUnicode_New(size);
+##    if (!unicode)
+##        return NULL;
+##    if (size == 0)
+##        return (PyObject *)unicode;
+##
+##    /* Unpack UTF-16 encoded data */
+##    p = unicode->str;
+##    q = (unsigned char *)s;
+##    e = q + size;
+##
+##    if (byteorder)
+##        bo = *byteorder;
+##
+##    /* Check for BOM marks (U+FEFF) in the input and adjust current
+##       byte order setting accordingly. In native mode, the leading BOM
+##       mark is skipped, in all other modes, it is copied to the output
+##       stream as-is (giving a ZWNBSP character). */
+##    if (bo == 0) {
+##        if (size >= 2) {
+##            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
+###ifdef BYTEORDER_IS_LITTLE_ENDIAN
+##	    if (bom == 0xFEFF) {
+##		q += 2;
+##		bo = -1;
+##	    }
+##	    else if (bom == 0xFFFE) {
+##		q += 2;
+##		bo = 1;
+##	    }
+###else
+##	    if (bom == 0xFEFF) {
+##		q += 2;
+##		bo = 1;
+##	    }
+##	    else if (bom == 0xFFFE) {
+##		q += 2;
+##		bo = -1;
+##	    }
+###endif
+##	}
+##    }
+##
+##    if (bo == -1) {
+##        /* force LE */
+##        ihi = 1;
+##        ilo = 0;
+##    }
+##    else if (bo == 1) {
+##        /* force BE */
+##        ihi = 0;
+##        ilo = 1;
+##    }
+##
+##    while (q < e) {
+##	Py_UNICODE ch;
+##	/* remaining bytes at the end? (size should be even) */
+##	if (e-q<2) {
+##	    if (consumed)
+##		break;
+##	    errmsg = "truncated data";
+##	    startinpos = ((const char *)q)-starts;
+##	    endinpos = ((const char *)e)-starts;
+##	    goto utf16Error;
+##	    /* The remaining input chars are ignored if the callback
+##	       chooses to skip the input */
+##	}
+##	ch = (q[ihi] << 8) | q[ilo];
+##
+##	q += 2;
+##
+##	if (ch < 0xD800 || ch > 0xDFFF) {
+##	    *p++ = ch;
+##	    continue;
+##	}
+##
+##	/* UTF-16 code pair: */
+##	if (q >= e) {
+##	    errmsg = "unexpected end of data";
+##	    startinpos = (((const char *)q)-2)-starts;
+##	    endinpos = ((const char *)e)-starts;
+##	    goto utf16Error;
+##	}
+##	if (0xD800 <= ch && ch <= 0xDBFF) {
+##	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
+##	    q += 2;
+##	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+###ifndef Py_UNICODE_WIDE
+##		*p++ = ch;
+##		*p++ = ch2;
+###else
+##		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+###endif
+##		continue;
+##	    }
+##	    else {
+##                errmsg = "illegal UTF-16 surrogate";
+##		startinpos = (((const char *)q)-4)-starts;
+##		endinpos = startinpos+2;
+##		goto utf16Error;
+##	    }
+##
+##	}
+##	errmsg = "illegal encoding";
+##	startinpos = (((const char *)q)-2)-starts;
+##	endinpos = startinpos+2;
+##	/* Fall through to report the error */
+##
+##    utf16Error:
+##	outpos = p-PyUnicode_AS_UNICODE(unicode);
+##	if (unicode_decode_call_errorhandler(
+##	         errors, &errorHandler,
+##	         "utf16", errmsg,
+##	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
+##	         (PyObject **)&unicode, &outpos, &p))
+##	    goto onError;
+##    }
+##
+##    if (byteorder)
+##        *byteorder = bo;
+##
+##    if (consumed)
+##	*consumed = (const char *)q-starts;
+##
+##    /* Adjust length */
+##    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+##        goto onError;
+##
+##    Py_XDECREF(errorHandler);
+##    Py_XDECREF(exc);
+##    return (PyObject *)unicode;
+##
+##onError:
+##    Py_DECREF(unicode);
+##    Py_XDECREF(errorHandler);
+##    Py_XDECREF(exc);
+##    return NULL;
+##}
+
def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
    """Encode *s* as a UTF-16 byte string (one chr per byte).

    byteorder: -1 forces little-endian, 1 forces big-endian, 0 uses the
    native order and prepends a BOM; any other value (including the
    string default) falls through to the native order without a BOM.
    An empty *size* yields the empty string.
    """
    import sys

    def _pair(code, order):
        # Split a 16-bit code unit into two bytes in the requested order.
        high = chr((code >> 8) & 0xff)
        low = chr(code & 0xff)
        return [low, high] if order == 'little' else [high, low]

    order = sys.byteorder
    units = []
    if byteorder == 0:
        units.extend(_pair(0xFEFF, order))

    if size == 0:
        return ""

    if byteorder == -1:
        order = 'little'
    elif byteorder == 1:
        order = 'big'

    for char in s:
        code = ord(char)
        trail = 0
        if code >= 0x10000:
            # Split non-BMP characters into a UTF-16 surrogate pair.
            trail = 0xDC00 | ((code - 0x10000) & 0x3FF)
            code = 0xD800 | ((code - 0x10000) >> 10)
        units.extend(_pair(code, order))
        if trail:
            units.extend(_pair(trail, order))

    return ''.join(units)
+
+
def PyUnicode_DecodeMBCS(s, size, errors):
    # Stub: the Windows MBCS decoder has not been translated yet; the
    # original C implementation is kept in the comment block below for
    # reference.  Always returns None.
    pass
+##{
+##    PyUnicodeObject *v;
+##    Py_UNICODE *p;
+##
+##    /* First get the size of the result */
+##    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+##    if (size > 0 && usize==0)
+##        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+##
+##    v = _PyUnicode_New(usize);
+##    if (v == NULL)
+##        return NULL;
+##    if (usize == 0)
+##	return (PyObject *)v;
+##    p = PyUnicode_AS_UNICODE(v);
+##    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+##        Py_DECREF(v);
+##        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+##    }
+##
+##    return (PyObject *)v;
+##}
+
+##def PyUnicode_EncodeMBCS(p, size, errors):
+##
+####    /* If there are no characters, bail now! */
+##    if (size==0)
+##	    return ""
+##    from ctypes import *
+##    WideCharToMultiByte = windll.kernel32.WideCharToMultiByte
+####    /* First get the size of the result */
+##    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, s, 0, None, None);
+##    if (mbcssize==0)
+##        raise UnicodeEncodeError, "Windows cannot decode the string %s" %p
+### More error handling required (check windows errors and such)
+##    
+###    /* Do the conversion */
+####    s = ' '*mbcssize
+####    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)):
+####        raise UnicodeEncodeError, "Windows cannot decode the string %s" %p
+##    return s
def PyUnicode_DecodeUTF8(s, size, errors):
    """Decode a UTF-8 byte string by delegating to the stateful variant.

    Fixed NameError: the original passed the undefined C constant
    ``NULL``; Python's ``None`` is the "no consumed pointer" sentinel.
    """
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, None)
+
def PyUnicode_DecodeUTF8Stateful(s,size,errors,consumed):
    # Stub: not yet translated from C; see the commented-out original
    # implementation below.  Always returns None.
    pass
+##{
+##    const char *starts = s;
+##    int n;
+##    int startinpos;
+##    int endinpos;
+##    int outpos;
+##    const char *e;
+##    PyUnicodeObject *unicode;
+##    Py_UNICODE *p;
+##    const char *errmsg = "";
+##    PyObject *errorHandler = NULL;
+##    PyObject *exc = NULL;
+##
+##    /* Note: size will always be longer than the resulting Unicode
+##       character count */
+##    unicode = _PyUnicode_New(size);
+##    if (!unicode)
+##        return NULL;
+##    if (size == 0) {
+##        if (consumed)
+##            *consumed = 0;
+##        return (PyObject *)unicode;
+##    }
+##
+##    /* Unpack UTF-8 encoded data */
+##    p = unicode->str;
+##    e = s + size;
+##
+##    while (s < e) {
+##        Py_UCS4 ch = (unsigned char)*s;
+##
+##        if (ch < 0x80) {
+##            *p++ = (Py_UNICODE)ch;
+##            s++;
+##            continue;
+##        }
+##
+##        n = utf8_code_length[ch];
+##
+##        if (s + n > e) {
+##	    if (consumed)
+##		break;
+##	    else {
+##		errmsg = "unexpected end of data";
+##		startinpos = s-starts;
+##		endinpos = size;
+##		goto utf8Error;
+##	    }
+##	}
+##
+##        switch (n) {
+##
+##        case 0:
+##            errmsg = "unexpected code byte";
+##	    startinpos = s-starts;
+##	    endinpos = startinpos+1;
+##	    goto utf8Error;
+##
+##        case 1:
+##            errmsg = "internal error";
+##	    startinpos = s-starts;
+##	    endinpos = startinpos+1;
+##	    goto utf8Error;
+##
+##        case 2:
+##            if ((s[1] & 0xc0) != 0x80) {
+##                errmsg = "invalid data";
+##		startinpos = s-starts;
+##		endinpos = startinpos+2;
+##		goto utf8Error;
+##	    }
+##            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+##            if (ch < 0x80) {
+##		startinpos = s-starts;
+##		endinpos = startinpos+2;
+##                errmsg = "illegal encoding";
+##		goto utf8Error;
+##	    }
+##	    else
+##		*p++ = (Py_UNICODE)ch;
+##            break;
+##
+##        case 3:
+##            if ((s[1] & 0xc0) != 0x80 ||
+##                (s[2] & 0xc0) != 0x80) {
+##                errmsg = "invalid data";
+##		startinpos = s-starts;
+##		endinpos = startinpos+3;
+##		goto utf8Error;
+##	    }
+##            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+##            if (ch < 0x0800) {
+##		/* Note: UTF-8 encodings of surrogates are considered
+##		   legal UTF-8 sequences;
+##
+##		   XXX For wide builds (UCS-4) we should probably try
+##		       to recombine the surrogates into a single code
+##		       unit.
+##		*/
+##                errmsg = "illegal encoding";
+##		startinpos = s-starts;
+##		endinpos = startinpos+3;
+##		goto utf8Error;
+##	    }
+##	    else
+##		*p++ = (Py_UNICODE)ch;
+##            break;
+##
+##        case 4:
+##            if ((s[1] & 0xc0) != 0x80 ||
+##                (s[2] & 0xc0) != 0x80 ||
+##                (s[3] & 0xc0) != 0x80) {
+##                errmsg = "invalid data";
+##		startinpos = s-starts;
+##		endinpos = startinpos+4;
+##		goto utf8Error;
+##	    }
+##            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+##                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+##            /* validate and convert to UTF-16 */
+##            if ((ch < 0x10000)        /* minimum value allowed for 4
+##					 byte encoding */
+##                || (ch > 0x10ffff))   /* maximum value allowed for
+##					 UTF-16 */
+##	    {
+##                errmsg = "illegal encoding";
+##		startinpos = s-starts;
+##		endinpos = startinpos+4;
+##		goto utf8Error;
+##	    }
+###ifdef Py_UNICODE_WIDE
+##	    *p++ = (Py_UNICODE)ch;
+###else
+##            /*  compute and append the two surrogates: */
+##
+##            /*  translate from 10000..10FFFF to 0..FFFF */
+##            ch -= 0x10000;
+##
+##            /*  high surrogate = top 10 bits added to D800 */
+##            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
+##
+##            /*  low surrogate = bottom 10 bits added to DC00 */
+##            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
+###endif
+##            break;
+##
+##        default:
+##            /* Other sizes are only needed for UCS-4 */
+##            errmsg = "unsupported Unicode code range";
+##	    startinpos = s-starts;
+##	    endinpos = startinpos+n;
+##	    goto utf8Error;
+##        }
+##        s += n;
+##	continue;
+##
+##    utf8Error:
+##    outpos = p-PyUnicode_AS_UNICODE(unicode);
+##    if (unicode_decode_call_errorhandler(
+##	     errors, &errorHandler,
+##	     "utf8", errmsg,
+##	     starts, size, &startinpos, &endinpos, &exc, &s,
+##	     (PyObject **)&unicode, &outpos, &p))
+##	goto onError;
+##    }
+##    if (consumed)
+##	*consumed = s-starts;
+##
+##    /* Adjust length */
+##    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+##        goto onError;
+##
+##    Py_XDECREF(errorHandler);
+##    Py_XDECREF(exc);
+##    return (PyObject *)unicode;
+##
+##onError:
+##    Py_XDECREF(errorHandler);
+##    Py_XDECREF(exc);
+##    Py_DECREF(unicode);
+##    return NULL;
+##}
+##
+##/* Allocation strategy:  if the string is short, convert into a stack buffer
+##   and allocate exactly as much space needed at the end.  Else allocate the
+##   maximum possible needed (4 result bytes per Unicode character), and return
+##   the excess memory at the end.
+##*/
+
def PyUnicode_EncodeUTF8(s, size, errors):
    """Encode the first *size* characters of unicode string *s* as UTF-8
    (returned as a byte-per-chr string).

    Fixes relative to the original:
    * ``assert s is not None`` -- the C constant NULL is undefined here;
    * characters are converted with ord() exactly once; the original
      shifted the raw character object (TypeError for non-ASCII) and
      compared the string ``ch2`` against integer surrogate bounds;
    * added the missing else branch so ordinals >= 0x10000 are encoded
      via encodeUCS4 instead of being silently dropped.
    """
    assert s is not None
    assert size >= 0
    p = []
    i = 0
    while i < size:
        ch = ord(s[i])
        i += 1
        if ch < 0x80:
            # Encode ASCII directly.
            p.append(chr(ch))
        elif ch < 0x0800:
            # Two-byte sequence (Latin-1 and friends).
            p.append(chr(0xc0 | (ch >> 6)))
            p.append(chr(0x80 | (ch & 0x3f)))
        elif ch < 0x10000:
            # Special case: combine a high surrogate with a following
            # low surrogate into a single UCS4 value.
            if 0xD800 <= ch <= 0xDBFF and i != size:
                ch2 = ord(s[i])
                if 0xDC00 <= ch2 <= 0xDFFF:
                    ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
                    i += 1
                    p.extend(encodeUCS4(ch))
                    continue
                # Fall through: handles isolated high surrogates.
            p.append(chr(0xe0 | (ch >> 12)))
            p.append(chr(0x80 | ((ch >> 6) & 0x3f)))
            p.append(chr(0x80 | (ch & 0x3f)))
        else:
            # Four-byte sequence for ordinals beyond the BMP.
            p.extend(encodeUCS4(ch))
    return ''.join(p)
+
def encodeUCS4(ch):
    """Return the four UTF-8 bytes (as one-character strings) encoding
    the UCS4 ordinal *ch* (expected >= 0x10000)."""
    return [
        chr(0xf0 | (ch >> 18)),
        chr(0x80 | ((ch >> 12) & 0x3f)),
        chr(0x80 | ((ch >> 6) & 0x3f)),
        chr(0x80 | (ch & 0x3f)),
    ]
+
+#/* --- Latin-1 Codec ------------------------------------------------------ */
+
def PyUnicode_DecodeLatin1(s, size, errors):
    # Stub: the Latin-1 decoder has not been translated yet; the original
    # C implementation is kept in the comment block below.  Always
    # returns None.
    pass
+##{
+##    PyUnicodeObject *v;
+##    Py_UNICODE *p;
+##
+##    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
+##    if (size == 1) {
+##	Py_UNICODE r = *(unsigned char*)s;
+##	return PyUnicode_FromUnicode(&r, 1);
+##    }
+##
+##    v = _PyUnicode_New(size);
+##    if (v == NULL)
+##	goto onError;
+##    if (size == 0)
+##	return (PyObject *)v;
+##    p = PyUnicode_AS_UNICODE(v);
+##    while (size-- > 0)
+##	*p++ = (unsigned char)*s++;
+##    return (PyObject *)v;
+##
+## onE rror:
+##    Py_XDECREF(v);
+##    return NULL;
+##}
+from pypy.lib._codecs import lookup_error
+
def unicode_encode_ucs1(p,size,errors,limit):
    # Shared single-byte encoder: limit=128 -> ascii, limit=256 -> latin-1.

    if limit==256:
        reason = "ordinal not in range(256)"
        encoding = "latin-1"
    else:
        reason = "ordinal not in range(128)"
        encoding = "ascii"
    
    if (size == 0):
        return ''
    res = []
    for ch in p:
        if ord(ch) < limit:
            res.append(ch)
        else:
            
            #/* startpos for collecting unencodable chars */
            # NOTE(review): p.index(ch) finds the FIRST occurrence of this
            # character value, not the current loop position, so the error
            # range is wrong when the character repeats -- TODO fix with
            # enumerate().
            collstart = p.index(ch)
            collend = p.index(ch)
            #/* find all unecodable characters */
            for c in p[collstart:]:
                if ord(c) >= limit:
                    collend +=1
                else:
                    break
            #uncollectable = [c for c in p if ord(c) >= limit]
            handler = lookup_error(errors)
            # NOTE(review): CPython error handlers return a
            # (replacement, new_position) tuple; appending the raw result
            # and continuing the per-character loop re-invokes the handler
            # for every character of an unencodable run -- confirm the
            # contract of this package's lookup_error handlers.
            x = handler(UnicodeEncodeError(encoding,p,collstart,collend,reason))
            res.append(x)

    return ''.join(res)
+
def PyUnicode_EncodeLatin1(p, size, errors):
    """Encode via the shared UCS-1 encoder, limited to 256 ordinals."""
    limit = 256
    return unicode_encode_ucs1(p, size, errors, limit)
+
def PyUnicode_EncodeRawUnicodeEscape(s, size):
    """Encode *s* with the raw-unicode-escape codec: Latin-1 characters
    are copied as-is, others become '\\uxxxx' / '\\Uxxxxxxxx' escapes.

    Fixes relative to the original:
    * hex(ord(ch)) produced '0x'-prefixed, variable-width digits, so the
      output was not a parseable \\uXXXX / \\UXXXXXXXX escape; use
      zero-padded fixed-width hex as CPython does;
    * dropped the trailing NUL append -- an artifact of the C string
      terminator that does not belong in the encoded result.
    """
    if size == 0:
        return u''

    p = []
    for ch in s:
        code = ord(ch)
        if code >= 0x10000:
            # Map 32-bit characters to '\Uxxxxxxxx'.
            p.append('\\U%08x' % code)
        elif code >= 256:
            # Map 16-bit characters to '\uxxxx'.
            p.append('\\u%04x' % code)
        else:
            # Copy everything else as-is.
            p.append(ch)
    return ''.join(p)
+
def charmapencode_output(c,mapping,outobj,outpos):
    # Unfinished helper: looks up *c* in *mapping* but discards the result
    # and returns None; outobj/outpos are unused.  Kept as a stub.
    rep = mapping[c]
+
def PyUnicode_EncodeCharmap(p, size, mapping='latin-1', errors='strict'):
    """Encode the first *size* characters of *p* through *mapping*
    (ordinal -> encoded value); errors are routed through lookup_error().

    Fixes relative to the original:
    * removed a leftover debug ``print`` statement;
    * the KeyError branch neither consumed input nor emitted output, so
      a single unmapped character looped forever; the error-handler
      result is now used as (replacement, new_position), the standard
      codecs error-callback contract -- TODO confirm against this
      package's lookup_error handlers.
    """
    # Default to Latin-1 when no explicit mapping is given.
    if mapping == 'latin-1':
        return PyUnicode_EncodeLatin1(p, size, errors)
    if size == 0:
        return ''
    inpos = 0
    res = []
    while inpos < size:
        try:
            # Try to encode the current character.
            res.append(mapping[ord(p[inpos])])
        except KeyError:
            handler = lookup_error(errors)
            replacement, inpos = handler(
                UnicodeEncodeError("charmap", p, inpos, inpos + 1,
                                   "character maps to <undefined>"))
            res.append(replacement)
        else:
            # Done with this character => advance the input position.
            inpos += 1

    return ''.join(res)
+
+
# Fast-path decoder table consulted by PyUnicode_Decode() before falling
# back to the codec registry.
encodings =     {       "utf-8"     :   PyUnicode_DecodeUTF8,
                        "latin-1"   :   PyUnicode_DecodeLatin1,
                        "mbcs"      :   PyUnicode_DecodeMBCS,
                        "ascii"     :   PyUnicode_DecodeASCII,
                }        



More information about the Pypy-commit mailing list