[pypy-svn] r11639 - in pypy/dist/pypy/lib: . test2
ale at codespeak.net
ale at codespeak.net
Sat Apr 30 00:00:11 CEST 2005
Author: ale
Date: Sat Apr 30 00:00:11 2005
New Revision: 11639
Added:
pypy/dist/pypy/lib/_codecs.py
pypy/dist/pypy/lib/codecs.py
pypy/dist/pypy/lib/test2/test_codeccallbacks.py
pypy/dist/pypy/lib/unicodecodec.py
Log:
A preliminary checkin of the python translation of the _codecsmodule (and friends).
The module _codecs exposes the same methods as _codecs.pyd in CPython. As in CPython it is a wrapper for the actual codecs which I have put in unicodecodec.py (for now). Not all codecs are implemented yet, and not all tests pass.
I had to change the test2/test_codeccallbacks.py in order to test the codecs without involving unicode strings.
There are some serious problems to be solved:
1) how to make a global registry from appspace. I have used module variables (in _codecs.py) for now. There may be better ways.
2) UnicodeError, UnicodeEncodeError and UnicodeDecodeError exceptions are faked; they don't work as in CPython anyway.
3) In translating the design from _codecsmodule.c I have discovered a side effect. The Encodings package injects the functions exposed in _codecs.py into a class. This converts the function into an unbound method. In a comment in the Encodings package it is mentioned that this doesn't happen for C functions.
This is work in progress
Added: pypy/dist/pypy/lib/_codecs.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/lib/_codecs.py Sat Apr 30 00:00:11 2005
@@ -0,0 +1,351 @@
+"""
+
+ _codecs -- Provides access to the codec registry and the builtin
+ codecs.
+
+ This module should never be imported directly. The standard library
+ module "codecs" wraps this builtin module for use within Python.
+
+ The codec registry is accessible via:
+
+ register(search_function) -> None
+
+ lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
+
+ The builtin Unicode codecs use the following interface:
+
+ <encoding>_encode(Unicode_object[,errors='strict']) ->
+ (string object, bytes consumed)
+
+ <encoding>_decode(char_buffer_obj[,errors='strict']) ->
+ (Unicode object, bytes consumed)
+
+ <encoding>_encode() interfaces also accept non-Unicode object as
+ input. The objects are then converted to Unicode using
+ PyUnicode_FromObject() prior to applying the conversion.
+
+ These <encoding>s are available: utf_8, unicode_escape,
+ raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
+ mbcs (on win32).
+
+
+Written by Marc-Andre Lemburg (mal at lemburg.com).
+
+Copyright (c) Corporation for National Research Initiatives.
+
+"""
+from pypy.lib.unicodecodec import *
+
#/* --- Registry ----------------------------------------------------------- */
# Module-level registry state (CPython keeps this in the interpreter; here
# plain module globals are used -- see the checkin log about that tradeoff).
codec_search_path = []      # search functions, tried in registration order
codec_search_cache = {}     # encoding name -> codec tuple (read by codec_lookup)
codec_error_registry = {}   # error handler name -> handler callable
+
def codec_register(search_function):
    """register(search_function)

    Register a codec search function. Search functions are expected to take
    one argument, the encoding name in all lower case letters, and return
    a tuple of functions (encoder, decoder, stream_reader, stream_writer).

    Raises TypeError when search_function is not callable, matching
    CPython's _codecs.register; the checked-in version silently ignored
    non-callables, hiding caller bugs.
    """
    if not callable(search_function):
        raise TypeError("argument must be callable")
    codec_search_path.append(search_function)

register = codec_register
+
def codec_lookup(encoding):
    """lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)

    Looks up a codec tuple in the Python codec registry and returns
    a tuple of functions.

    Fixes over the checked-in version: successful lookups are now stored
    in codec_search_cache (it was read but never written, so every call
    re-ran all search functions), and an unknown encoding raises
    LookupError as in CPython instead of returning None (which made
    callers fail later with obscure TypeErrors).
    """
    result = codec_search_cache.get(encoding, None)
    if result is None:
        for search in codec_search_path:
            result = search(encoding)
            if result:
                # remember the answer so search functions run only once
                codec_search_cache[encoding] = result
                break
        else:
            raise LookupError("unknown encoding: %s" % encoding)
    return result

lookup = codec_lookup
+
def lookup_error(errors):
    """lookup_error(errors) -> handler

    Return the error handler for the specified error handling name
    or raise a LookupError, if no handler exists under this name.
    """
    handler = codec_error_registry.get(errors)
    if handler is None:
        raise LookupError("unknown error handler name %s"%errors)
    return handler
+
def register_error(errors, handler):
    """register_error(errors, handler)

    Register the specified error handler under the name errors.
    handler must be a callable that will be called with an exception
    instance describing the location of the encoding/decoding error
    and must return a (replacement, new position) tuple.
    """
    if not callable(handler):
        raise TypeError("handler must be callable")
    codec_error_registry[errors] = handler
+
def encode(v, encoding='defaultencoding', errors='strict'):
    """encode(obj, [encoding[,errors]]) -> object

    Encodes obj using the codec registered for encoding.  encoding
    defaults to the default encoding.  errors may be given to set a
    different error handling scheme; the default 'strict' makes
    encoding errors raise a ValueError.  Other possible values are
    'ignore', 'replace' and 'xmlcharrefreplace' as well as any other
    name registered with codecs.register_error that can handle
    ValueErrors.
    """
    encoder = lookup(encoding)[0]
    if not encoder:
        return None
    return encoder(v, errors)[0]
+
def decode(obj, encoding='defaultencoding', errors='strict'):
    """decode(obj, [encoding[,errors]]) -> object

    Decodes obj using the codec registered for encoding.  encoding
    defaults to the default encoding.  errors may be given to set a
    different error handling scheme; the default 'strict' makes
    decoding errors raise a ValueError.  Other possible values are
    'ignore' and 'replace' as well as any other name registered with
    codecs.register_error that is able to handle ValueErrors.
    """
    decoder = lookup(encoding)[1]
    if not decoder:
        return None
    return decoder(obj, errors)[0]
+
def latin_1_encode(inst, obj, errors='strict'):
    """Encode obj to Latin-1 (ISO 8859-1).

    Returns (encoded_string, length).
    """
    encoded = PyUnicode_EncodeLatin1(obj, len(obj), errors)
    return encoded, len(encoded)
# XXX MBCS codec might involve ctypes ?
def mbcs_decode():
    """Decode MBCS (Windows) data.

    XXX stub -- not implemented yet; falls through and returns None.
    NOTE(review): takes no arguments, unlike the other *_decode
    functions here (inst, data, errors) -- confirm the intended
    signature.
    """
    pass
+
def readbuffer_encode(inst, obj, errors='strict'):
    """Encode a read-buffer compatible object via its string form.

    Returns (string, length) like the other *_encode helpers.
    """
    data = str(obj)
    return data, len(data)
+
def escape_encode(inst, obj, errors='strict'):
    """Return obj's repr() with the surrounding quote characters stripped.

    Returns (string, length).
    """
    quoted = repr(obj)
    inner = quoted[1:-1]
    return inner, len(inner)
+# XXX
def utf_8_decode(inst, data, errors='strict'):
    """Decode UTF-8 data.

    XXX stub -- not implemented yet; falls through and returns None
    instead of the (unicode, length) tuple callers expect.
    """
    pass
# XXX not implemented yet
def raw_unicode_escape_decode(inst, data, errors='strict'):
    """Decode raw-unicode-escape data.

    XXX stub -- not implemented yet; falls through and returns None
    instead of (unicode, length).
    """
    pass
+
def utf_7_decode(inst, data, errors='strict'):
    """Decode UTF-7 data.

    Returns (unicode_string, length).

    Bug fix: the errors argument is now forwarded to the underlying
    decoder; the checked-in version hard-coded errors='strict', so a
    caller could never select 'ignore'/'replace' handling.
    NOTE(review): the second tuple element is the decoded length, not
    the number of input bytes consumed -- confirm against CPython.
    """
    unistr = PyUnicode_DecodeUTF7(data, errors=errors)
    return unistr, len(unistr)
+# XXX
def unicode_escape_encode(inst, obj, errors='strict'):
    """Encode via the unicode-escape codec.

    XXX stub -- not implemented yet; falls through and returns None.
    """
    pass
# XXX not implemented yet
def latin_1_decode(inst, data, errors='strict'):
    """Decode Latin-1 (ISO 8859-1) data.

    XXX stub -- not implemented yet; returns None instead of
    (unicode, length).
    """
    pass
# XXX not implemented yet
def utf_16_decode(inst, data, errors='strict'):
    """Decode UTF-16 data.

    XXX stub -- not implemented yet; returns None instead of
    (unicode, length).
    """
    pass
# XXX not implemented yet
def unicode_escape_decode(inst, data, errors='strict'):
    """Decode unicode-escape data.

    XXX stub -- not implemented yet; returns None instead of
    (unicode, length).
    """
    pass
+
def ascii_decode(inst, data, errors='strict'):
    """Decode an ASCII byte string.

    Returns (unicode_string, length).
    """
    decoded = PyUnicode_DecodeASCII(data, len(data), errors)
    return decoded, len(decoded)
+
def charmap_encode(obj, errors='strict', mapping='latin-1'):
    """Encode obj through a character mapping table.

    Returns (string, length).
    NOTE(review): unlike the other *_encode helpers this one takes no
    leading `inst` parameter; if it is injected as a method by the
    Encodings package (see the checkin log) the arguments would shift
    by one -- confirm the intended signature.
    """
    res = PyUnicode_EncodeCharmap(obj, len(obj), mapping, errors)
    return res, len(res)
+
def unicode_internal_encode(inst, obj, errors='strict'):
    """Encode to the internal unicode representation.

    Already-unicode input is returned unchanged.  Returns
    (unicode_string, length).

    Bug fix: the non-unicode branch referenced an undefined name
    `size` (NameError at runtime); it is now computed as len(obj).
    """
    if type(obj) == unicode:
        return obj, len(obj)
    else:
        size = len(obj)
        return PyUnicode_FromUnicode(obj, size), size
+# XXX
def utf_16_ex_decode(inst, data, errors='strict'):
    """Decode UTF-16 data with explicit byte-order handling.

    XXX stub -- not implemented yet; returns None (presumably should
    return (unicode, consumed, byteorder) as in CPython -- confirm).
    """
    pass
+# XXX Check if this is right
def escape_decode(data, errors='strict'):
    """Identity 'decode' for escaped data: returns (data, len(data)).

    XXX original comment: "Check if this is right" -- presumably a real
    implementation should process backslash escapes; confirm.
    NOTE(review): takes no leading `inst` parameter, unlike most of the
    other codec functions in this module -- confirm intended.
    """
    return data,len(data)
+
def charbuffer_encode(inst, obj, errors='strict'):
    """Encode a char-buffer compatible object via its string form.

    Returns (string, length).
    """
    text = str(obj)
    return text, len(text)
+# XXX
def charmap_decode(inst, data, errors='strict'):
    """Decode data through a character mapping table.

    XXX stub -- not implemented yet; returns None instead of
    (unicode, length).
    """
    pass
+
def utf_7_encode(inst, obj, errors='strict'):
    """Encode obj to UTF-7.

    Returns (encoded_string, length_of_unicode_input).
    """
    obj = PyUnicode_FromObject(obj)
    size = PyUnicode_GET_SIZE(obj)
    encoded = PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(obj),
                                   size,
                                   0,
                                   0,
                                   errors)
    return (encoded, size)
+
def mbcs_encode(inst, obj, errors='strict'):
    """Encode obj using the Windows MBCS codec.

    Returns (encoded_string, length_of_unicode_input).
    """
    size = PyUnicode_GET_SIZE(obj)
    encoded = PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(obj),
                                   size,
                                   errors)
    return (encoded, size)
+
+
def ascii_encode(inst, obj, errors='strict'):
    """Encode obj to 7-bit ASCII.

    Returns (encoded_string, length_of_unicode_input).
    """
    size = PyUnicode_GET_SIZE(obj)
    encoded = PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(obj),
                                    size,
                                    errors)
    return (encoded, size)
+
def utf_16_encode(inst, obj, errors='strict'):
    """Encode obj to UTF-16.

    Returns (encoded_string, length_of_encoded_string).
    """
    encoded = PyUnicode_EncodeUTF16(obj, len(obj), errors)
    return encoded, len(encoded)
+
def raw_unicode_escape_encode(inst, obj, errors='strict'):
    """Encode obj with the raw-unicode-escape codec.

    Returns (encoded_string, length).
    NOTE(review): errors is accepted but not forwarded to the helper --
    confirm whether PyUnicode_EncodeRawUnicodeEscape needs it.
    """
    encoded = PyUnicode_EncodeRawUnicodeEscape(obj, len(obj))
    return encoded, len(encoded)
+# XXX
def utf_8_encode(inst, obj, errors='strict'):
    """Encode to UTF-8.  XXX stub -- not implemented yet; returns None."""
    pass
# XXX not implemented yet
def utf_16_le_encode(inst, obj, errors='strict'):
    """Encode to UTF-16-LE.  XXX stub -- not implemented yet; returns None."""
    pass
# XXX not implemented yet
def utf_16_be_encode(inst, obj, errors='strict'):
    """Encode to UTF-16-BE.  XXX stub -- not implemented yet; returns None."""
    pass
+
def unicode_internal_decode(inst, unistr, errors='strict'):
    """Decode from the internal unicode representation.

    Already-unicode input is returned unchanged; anything else is
    passed through unicode().  Returns (unicode_string, length).
    """
    if type(unistr) == unicode:
        return unistr, len(unistr)
    return unicode(unistr), len(unistr)
+# XXX
def utf_16_le_decode(inst, data, errors='strict'):
    """Decode UTF-16-LE.  XXX stub -- not implemented yet; returns None."""
    pass
# XXX not implemented yet
def utf_16_be_decode(inst, data, errors='strict'):
    """Decode UTF-16-BE.  XXX stub -- not implemented yet; returns None."""
    pass
+
def strict_errors(exc):
    """Error handler for 'strict': re-raise the exception unchanged."""
    if not isinstance(exc, Exception):
        raise TypeError("codec must pass exception instance")
    raise exc
+
def ignore_errors(exc):
    """Error handler for 'ignore': drop the offending input.

    Returns (u'', exc.end), i.e. an empty replacement and the position
    just past the error.

    Fixed to use isinstance() instead of an exact type() comparison so
    that subclasses of the Unicode error types are handled too.
    """
    if isinstance(exc, (UnicodeEncodeError, UnicodeDecodeError,
                        UnicodeTranslateError)):
        return u'', exc.end
    else:
        raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
+# XXX
def replace_errors(exc):
    """Error handler for 'replace': substitute replacement characters.

    Encoding errors yield one '?' per unencodable character; decoding
    errors yield a single U+FFFD REPLACEMENT CHARACTER; translation
    errors yield one U+FFFD per character.  Resumes at exc.end.

    The checked-in version (marked XXX) just re-raised, i.e. behaved
    like 'strict'; this implements the documented 'replace' semantics.
    """
    if isinstance(exc, UnicodeEncodeError):
        return u'?' * (exc.end - exc.start), exc.end
    elif isinstance(exc, UnicodeDecodeError):
        return u'\ufffd', exc.end
    elif isinstance(exc, UnicodeTranslateError):
        return u'\ufffd' * (exc.end - exc.start), exc.end
    else:
        raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
+# XXX
def xmlcharrefreplace_errors(exc):
    """Error handler 'xmlcharrefreplace' (encoding only).

    Replaces each unencodable character with its decimal XML character
    reference '&#NNN;' and resumes at exc.end.

    The checked-in version (marked XXX) just re-raised, i.e. behaved
    like 'strict'; this implements the documented semantics.
    """
    if isinstance(exc, UnicodeEncodeError):
        res = []
        for c in exc.object[exc.start:exc.end]:
            res.append(u'&#%d;' % ord(c))
        return u''.join(res), exc.end
    else:
        raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
+
def backslashreplace_errors(exc):
    """Error handler 'backslashreplace' (encoding only).

    Replaces each unencodable character with a Python backslash escape
    (\\xNN, \\uNNNN or \\UNNNNNNNN) and resumes at exc.end.

    Bug fix: the backslash is now emitted once per character.  The
    checked-in version appended a single '\\' before the loop, so in a
    run of several unencodable characters only the first escape got
    its backslash (e.g. u'\\xe4u0100' instead of u'\\xe4\\u0100').
    """
    if isinstance(exc, UnicodeEncodeError):
        p = []
        for c in exc.object[exc.start:exc.end]:
            oc = ord(c)
            p.append(u'\\')
            if oc >= 0x00010000:
                p.append(u'U%.8x' % oc)
            elif oc >= 0x100:
                p.append(u'u%.4x' % oc)
            else:
                p.append(u'x%.2x' % oc)
        return u''.join(p), exc.end
    else:
        raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
+
+register_error("strict",strict_errors)
+register_error("ignore",ignore_errors)
+register_error("replace",replace_errors)
+register_error("xmlcharrefreplace",xmlcharrefreplace_errors)
+register_error("backslashreplace",backslashreplace_errors)
\ No newline at end of file
Added: pypy/dist/pypy/lib/codecs.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/lib/codecs.py Sat Apr 30 00:00:11 2005
@@ -0,0 +1,750 @@
+""" codecs -- Python Codec Registry, API and helpers.
+
+
+Written by Marc-Andre Lemburg (mal at lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+
+"""#"
+
+import __builtin__, sys
+
+### Registry and builtin stateless codec functions
+
+try:
+ import sys
+ if sys.path[0] != r'd:\projects\pypy_co':
+ sys.path.insert(0,r'd:\projects\pypy_co')
+ from pypy.lib import _codecs
+ reload(_codecs)
+ del _codecs
+ from pypy.lib._codecs import *
+except ImportError, why:
+ raise SystemError,\
+ 'Failed to load the builtin codecs: %s' % why
+
+__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
+ "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
+ "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
+ "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
+ "strict_errors", "ignore_errors", "replace_errors",
+ "xmlcharrefreplace_errors",
+ "register_error", "lookup_error"]
+
+### Constants
+
+#
+# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
+# and its possible byte string values
+# for UTF8/UTF16/UTF32 output and little/big endian machines
+#
+
+# UTF-8
+BOM_UTF8 = '\xef\xbb\xbf'
+
+# UTF-16, little endian
+BOM_LE = BOM_UTF16_LE = '\xff\xfe'
+
+# UTF-16, big endian
+BOM_BE = BOM_UTF16_BE = '\xfe\xff'
+
+# UTF-32, little endian
+BOM_UTF32_LE = '\xff\xfe\x00\x00'
+
+# UTF-32, big endian
+BOM_UTF32_BE = '\x00\x00\xfe\xff'
+
+if sys.byteorder == 'little':
+
+ # UTF-16, native endianness
+ BOM = BOM_UTF16 = BOM_UTF16_LE
+
+ # UTF-32, native endianness
+ BOM_UTF32 = BOM_UTF32_LE
+
+else:
+
+ # UTF-16, native endianness
+ BOM = BOM_UTF16 = BOM_UTF16_BE
+
+ # UTF-32, native endianness
+ BOM_UTF32 = BOM_UTF32_BE
+
+# Old broken names (don't use in new code)
+BOM32_LE = BOM_UTF16_LE
+BOM32_BE = BOM_UTF16_BE
+BOM64_LE = BOM_UTF32_LE
+BOM64_BE = BOM_UTF32_BE
+
+
+### Codec base classes (defining the API)
+
class Codec:
    """Defines the interface for stateless encoders/decoders.

    The .encode()/.decode() methods may use different error handling
    schemes by providing the errors argument.  These string values are
    predefined:

        'strict'  - raise a ValueError error (or a subclass)
        'ignore'  - ignore the character and continue with the next
        'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
        'xmlcharrefreplace' - replace with the appropriate XML
                    character reference (only for encoding)
        'backslashreplace'  - replace with backslashed escape
                    sequences (only for encoding)

    The set of allowed values can be extended via register_error.
    """

    def encode(self, input, errors='strict'):
        """Encode input and return (output object, length consumed).

        errors selects the error handling scheme; it defaults to
        'strict'.  Implementations must not store state on the Codec
        instance (use StreamCodec for codecs that keep state) and must
        accept zero-length input, returning an empty object of the
        output type in that situation.
        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):
        """Decode input and return (output object, length consumed).

        input must provide the bf_getreadbuf buffer slot; Python
        strings, buffer objects and memory mapped files are examples.
        errors selects the error handling scheme; it defaults to
        'strict'.  Implementations must not store state on the Codec
        instance and must accept zero-length input, returning an empty
        object of the output type in that situation.
        """
        raise NotImplementedError
+
+#
+# The StreamWriter and StreamReader class provide generic working
+# interfaces which can be used to implement new encoding submodules
+# very easily. See encodings/utf_8.py for an example on how this is
+# done.
+#
+
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):
        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling schemes
            by providing the errors keyword argument.  These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        # Removed a stray debug statement ("print type(data)") left in
        # the original checkin -- it spammed stdout on every write.
        self.stream.write(data)

    def writelines(self, list):
        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):
        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.
        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
+
+###
+
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream but not yet decoded
        self.bytebuffer = ""
        # characters decoded but not yet handed back to the caller
        self.charbuffer = u""
        # True when the last data returned ended with u"\r"; used by
        # readline() to drop the u"\n" of a CRLF pair split across reads
        self.atcr = False

    def decode(self, input, errors='strict'):
        # Subclasses must supply the actual stateless decode.
        raise NotImplementedError

    def read(self, size=-1, chars=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # read until we get the required number of characters (if available)
        while True:
            # can the request can be satisfied from the character buffer?
            if chars < 0:
                if self.charbuffer:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            newchars, decodedbytes = self.decode(data, self.errors)
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = u""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # 72 is just a guess at a typical line length; grown below
        # (up to 8000) when the line turns out to be longer
        readsize = size or 72
        line = u""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize)
            # drop the u"\n" of a CRLF pair that was split across reads
            if self.atcr and data.startswith(u"\n"):
                data = data[1:]
            if data:
                self.atcr = data.endswith(u"\r")
            line += data
            lines = line.splitlines(True)
            if lines:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = u"".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            elif not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        # NOTE(review): this base implementation leaves bytebuffer/
        # charbuffer/atcr untouched -- confirm whether subclasses are
        # expected to clear them.
        pass

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
+
+###
+
class StreamReaderWriter:
    """Wraps a stream so that it works in both read and write modes.

    The design is such that the factory functions returned by
    codecs.lookup() can be used directly to construct an instance.
    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """Create a StreamReaderWriter instance.

        stream must be a stream-like object.  Reader and Writer must
        be factory functions or classes providing the StreamReader
        resp. StreamWriter interface.  Error handling is done in the
        same way as defined for the StreamWriter/Readers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # --- reading side: delegate to the wrapped StreamReader ---

    def read(self, size=-1):
        return self.reader.read(size)

    def readline(self, size=None):
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        return self.reader.readlines(sizehint)

    def next(self):
        """Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    # --- writing side: delegate to the wrapped StreamWriter ---

    def write(self, data):
        return self.writer.write(data)

    def writelines(self, list):
        return self.writer.writelines(list)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """Inherit all other methods from the underlying stream."""
        return getattr(self.stream, name)
+
+###
+
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        # backend Reader decodes; frontend encode() re-encodes for the caller
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        # sizehint is ignored: everything is read, recoded and re-split
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        # NOTE(review): unlike read()/readline(), the line returned here
        # is NOT passed through self.encode() -- confirm intended.
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        # frontend decode() to the intermediate format; backend Writer encodes
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
+
+### Shortcuts
+
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually by
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Robustness fix: don't leak the freshly opened file when the
        # codec lookup or wrapping fails (e.g. unknown encoding).
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
+
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    """Return a wrapped version of file which provides transparent
    encoding translation.

    Strings written to the wrapped file are interpreted according to
    data_encoding and then written to the original file as string
    using file_encoding; reads go the other way round.  The
    intermediate encoding will usually be Unicode but depends on the
    specified codecs.  file_encoding defaults to data_encoding.

    errors defines the error handling; the default 'strict' causes
    ValueErrors to be raised on encoding errors.

    The returned wrapper exposes .data_encoding and .file_encoding
    reflecting the parameters of the same name, for introspection.
    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file,
                       data_info[0], data_info[1],
                       file_info[2], file_info[3],
                       errors)
    # Attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
+
+### Helpers for codec lookup
+
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.
        NOTE(review): lookup() as checked in returns None for unknown
        encodings rather than raising -- confirm.

    """
    return lookup(encoding)[0]

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[1]

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[2]

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[3]
+
+### Helpers for charmap-based codecs
+
def make_identity_dict(rng):
    """make_identity_dict(rng) -> dict

    Return a dictionary mapping each element of the rng sequence to
    itself.
    """
    mapping = {}
    for element in rng:
        mapping[element] = element
    return mapping
+
def make_encoding_map(decoding_map):
    """Create an encoding map from a decoding map.

    If a target value occurs multiple times in the decoding map, it is
    mapped to None (undefined mapping) so the charmap codec raises an
    exception when it meets such a character during translation.  One
    example where this happens is cp875.py, which decodes multiple
    characters to U+001A.
    """
    encoding_map = {}
    for key, value in decoding_map.items():
        if value in encoding_map:
            # duplicate target: poison the entry
            encoding_map[value] = None
        else:
            encoding_map[value] = key
    return encoding_map
+
### error handlers

# Convenience aliases for the error handlers pre-registered by _codecs.
strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")

# Tell modulefinder that using codecs probably needs the encodings
# package.  Bug fix: the checkin had `_false = 1`, which made the
# import actually execute at module import time; CPython's codecs.py
# uses 0 so modulefinder sees the dependency but it never runs.
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
Added: pypy/dist/pypy/lib/test2/test_codeccallbacks.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/lib/test2/test_codeccallbacks.py Sat Apr 30 00:00:11 2005
@@ -0,0 +1,712 @@
+import test.test_support, unittest
+import sys, codecs, htmlentitydefs, unicodedata
+
class PosReturn:
    # Error handler with a configurable resume position (see self.pos);
    # used by the decode/encode helper tests.

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        # Return the fixed replacement u"<?>" plus the configured
        # position; once we would stop advancing, jump to the end of
        # the input on the next call to avoid an endless loop.
        requested = self.pos
        effective = requested
        if effective < 0:
            effective += len(exc.object)
        if effective <= exc.start:
            self.pos = len(exc.object)
        return (u"<?>", requested)
+
+class CodecCallbackTest(unittest.TestCase):
+
+ def test_xmlcharrefreplace(self):
+ # replace unencodable characters which numeric character entities.
+ # For ascii, latin-1 and charmaps this is completely implemented
+ # in C and should be reasonably fast.
+ s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
+ self.assertEqual(
+ s.encode("ascii", "xmlcharrefreplace"),
+ "スパモ änd eggs"
+ )
+ self.assertEqual(
+ s.encode("latin-1", "xmlcharrefreplace"),
+ "スパモ \xe4nd eggs"
+ )
+
+ def test_xmlcharnamereplace(self):
+ # This time use a named character entity for unencodable
+ # characters, if one is available.
+
+ def xmlcharnamereplace(exc):
+ if not isinstance(exc, UnicodeEncodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ l = []
+ for c in exc.object[exc.start:exc.end]:
+ try:
+ l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)])
+ except KeyError:
+ l.append(u"&#%d;" % ord(c))
+ return (u"".join(l), exc.end)
+
+ codecs.register_error(
+ "test.xmlcharnamereplace", xmlcharnamereplace)
+
+ sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
+ sout = "«ℜ» = ⟨ሴ€⟩"
+ self.assertEqual(codecs.encode(sin,"ascii", "test.xmlcharnamereplace"), sout)
+ sout = "\xabℜ\xbb = ⟨ሴ€⟩"
+ self.assertEqual(codecs.encode(sin,"latin-1", "test.xmlcharnamereplace"), sout)
+ sout = "\xabℜ\xbb = ⟨ሴ\xa4⟩"
+ self.assertEqual(codecs.encode(sin,"iso-8859-15", "test.xmlcharnamereplace"), sout)
+
    def test_uninamereplace(self):
        # We're using the names from the unicode database this time,
        # and we're doing "syntax highlighting" here, i.e. we include
        # the replaced text in ANSI escape sequences. For this it is
        # useful that the error handler is not called for every single
        # unencodable character, but for a complete sequence of
        # unencodable characters, otherwise we would output many
        # unneccessary escape sequences.

        def uninamereplace(exc):
            # Replace a whole run of unencodable characters with their
            # Unicode names, wrapped in a single ANSI bold sequence.
            if not isinstance(exc, UnicodeEncodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = []
            for c in exc.object[exc.start:exc.end]:
                l.append(unicodedata.name(c, u"0x%x" % ord(c)))
            return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)

        codecs.register_error(
            "test.uninamereplace", uninamereplace)

        sin = u"\xac\u1234\u20ac\u8000"
        sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(codecs.encode(sin,"ascii", "test.uninamereplace"), sout)

        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(codecs.encode(sin,"latin-1", "test.uninamereplace"), sout)

        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(codecs.encode(sin,"iso-8859-15", "test.uninamereplace"), sout)
+
    def test_backslashescape(self):
        # Does the same as the "unicode-escape" encoding, but with different
        # base encodings.  The extra character beyond the BMP is only
        # appended on wide (UCS-4) builds.
        sin = u"a\xac\u1234\u20ac\u8000"
        if sys.maxunicode > 0xffff:
            sin += unichr(sys.maxunicode)
        sout = "a\\xac\\u1234\\u20ac\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(codecs.encode(sin,"ascii", "backslashreplace"), sout)

        # \xac is directly encodable in latin-1
        sout = "a\xac\\u1234\\u20ac\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(codecs.encode(sin,"latin-1", "backslashreplace"), sout)

        # iso-8859-15 additionally encodes the euro sign as \xa4
        sout = "a\xac\\u1234\xa4\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(codecs.encode(sin,"iso-8859-15", "backslashreplace"), sout)
+
    def test_relaxedutf8(self):
        # This is the test for a decoding callback handler,
        # that relaxes the UTF-8 minimal encoding restriction.
        # A null byte that is encoded as "\xc0\x80" will be
        # decoded as a null byte. All other illegal sequences
        # will be handled strictly.
        def relaxedutf8(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
                return (u"\x00", exc.start+2) # retry after two bytes
            else:
                raise exc

        codecs.register_error(
            "test.relaxedutf8", relaxedutf8)

        # every \xc0\x80 pair decodes to a null byte
        sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
        sout = u"a\x00b\x00c\xfc\x00\x00"
        self.assertEqual(codecs.decode(sin,"utf-8", "test.relaxedutf8"), sout)
        # \xc0\x81 is not the special-cased sequence, so it stays an error
        sin = "\xc0\x80\xc0\x81"
        self.assertRaises(UnicodeError, codecs.decode,sin, "utf-8", "test.relaxedutf8")
+
+ def test_charmapencode(self):
+ # For charmap encodings the replacement string will be
+ # mapped through the encoding again. This means, that
+ # to be able to use e.g. the "replace" handler, the
+ # charmap has to have a mapping for "?".
+ charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
+ sin = u"abc"
+ sout = "AABBCC"
+ self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
+
+ sin = u"abcA"
+ self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
+
+ charmap[ord("?")] = "XYZ"
+ sin = u"abcDEF"
+ sout = "AABBCCXYZXYZXYZ"
+ self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
+
+ charmap[ord("?")] = u"XYZ"
+ self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
+
+ charmap[ord("?")] = u"XYZ"
+ self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
+
    def test_callbacks(self):
        # Exercise custom decode/encode handlers that splice in a
        # bracketed list of the offending byte/char ordinals.
        def handler1(exc):
            if not isinstance(exc, UnicodeEncodeError) \
               and not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
            return (u"[%s]" % u"".join(l), exc.end)

        codecs.register_error("test.handler1", handler1)

        def handler2(exc):
            # decode-only variant that additionally skips one extra char
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
            return (u"[%s]" % u"".join(l), exc.end+1) # skip one character

        codecs.register_error("test.handler2", handler2)

        s = "\x00\x81\x7f\x80\xff"

        self.assertEqual(
            codecs.decode(s,"ascii", "test.handler1"),
            u"\x00[<129>]\x7f[<128>][<255>]"
        )
        self.assertEqual(
            codecs.decode(s,"ascii", "test.handler2"),
            u"\x00[<129>][<128>]"
        )

        # "\u3xxx" in a byte string is a literal backslash-u sequence
        # (no \u escape processing in str literals), so the decoder sees
        # a malformed escape and calls the handler.
        self.assertEqual(
            codecs.decode("\\u3042\u3xxx","unicode-escape", "test.handler1"),
            u"\u3042[<92><117><51><120>]xx"
        )

        self.assertEqual(
            codecs.decode("\\u3042\u3xx","unicode-escape", "test.handler1"),
            u"\u3042[<92><117><51><120><120>]"
        )

        self.assertEqual(
            codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
            u"z[<98>][<99>]"
        )

        self.assertEqual(
            codecs.encode(u"g\xfc\xdfrk","ascii", "test.handler1"),
            u"g[<252><223>]rk"
        )

        self.assertEqual(
            codecs.encode(u"g\xfc\xdf","ascii", "test.handler1"),
            u"g[<252><223>]"
        )
+
    def test_longstrings(self):
        # test long strings to check for memory overflow problems
        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
        # register the handlers under different names,
        # to prevent the codec from recognizing the name
        for err in errors:
            codecs.register_error("test." + err, codecs.lookup_error(err))
        l = 1000
        errors += [ "test." + err for err in errors ]
        # Encode failures are expected for some (string, encoding,
        # handler) combinations; we only care that nothing crashes.
        for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
                for err in errors:
                    try:
                        codecs.encode(uni,enc, err)
                    except UnicodeError:
                        pass
+
+ def check_exceptionobjectargs(self, exctype, args, msg):
+ # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
+ # check with one missing argument
+ self.assertRaises(TypeError, exctype, *args[:-1])
+ # check with one argument too much
+ self.assertRaises(TypeError, exctype, *(args + ["too much"]))
+ # check with one argument of the wrong type
+ wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
+ for i in xrange(len(args)):
+ for wrongarg in wrongargs:
+ if type(wrongarg) is type(args[i]):
+ continue
+ # build argument array
+ callargs = []
+ for j in xrange(len(args)):
+ if i==j:
+ callargs.append(wrongarg)
+ else:
+ callargs.append(args[i])
+ self.assertRaises(TypeError, exctype, *callargs)
+
+ # check with the correct number and type of arguments
+ exc = exctype(*args)
+ self.assertEquals(str(exc), msg)
+
    def test_unicodeencodeerror(self):
        # Construction and str() of UnicodeEncodeError for single
        # characters, ranges, and (on wide builds) non-BMP characters.
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"g\xfcrk", 1, 2, "ouch"],
            "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"g\xfcrk", 1, 4, "ouch"],
            "'ascii' codec can't encode characters in position 1-3: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\xfcx", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\u0100x", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\uffffx", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
        )
        if sys.maxunicode > 0xffff:
            self.check_exceptionobjectargs(
                UnicodeEncodeError,
                ["ascii", u"\U00010000x", 0, 1, "ouch"],
                "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
            )
+
    def test_unicodedecodeerror(self):
        # Construction and str() of UnicodeDecodeError for a single
        # byte and for a byte range.
        self.check_exceptionobjectargs(
            UnicodeDecodeError,
            ["ascii", "g\xfcrk", 1, 2, "ouch"],
            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeDecodeError,
            ["ascii", "g\xfcrk", 1, 3, "ouch"],
            "'ascii' codec can't decode bytes in position 1-2: ouch"
        )
+
    def test_unicodetranslateerror(self):
        # Construction and str() of UnicodeTranslateError (no encoding
        # argument, otherwise analogous to the encode/decode variants).
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\xfcrk", 1, 2, "ouch"],
            "can't translate character u'\\xfc' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\u0100rk", 1, 2, "ouch"],
            "can't translate character u'\\u0100' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\uffffrk", 1, 2, "ouch"],
            "can't translate character u'\\uffff' in position 1: ouch"
        )
        if sys.maxunicode > 0xffff:
            self.check_exceptionobjectargs(
                UnicodeTranslateError,
                [u"g\U00010000rk", 1, 2, "ouch"],
                "can't translate character u'\\U00010000' in position 1: ouch"
            )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\xfcrk", 1, 3, "ouch"],
            "can't translate characters in position 1-2: ouch"
        )
+
    def test_badandgoodstrictexceptions(self):
        # The "strict" handler only ever re-raises; check its argument
        # validation and the pass-through of a genuine unicode error.
        # "strict" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.strict_errors,
            42
        )
        # "strict" complains about the wrong exception type
        self.assertRaises(
            Exception,
            codecs.strict_errors,
            Exception("ouch")
        )

        # If the correct exception is passed in, "strict" raises it
        self.assertRaises(
            UnicodeEncodeError,
            codecs.strict_errors,
            UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
        )
+
    def test_badandgoodignoreexceptions(self):
        # "ignore" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           codecs.ignore_errors,
           42
        )
        # "ignore" complains about the wrong exception type
        self.assertRaises(
           TypeError,
           codecs.ignore_errors,
           UnicodeError("ouch")
        )
        # If the correct exception is passed in, "ignore" returns an empty replacement
        # and resumes directly after the offending range.
        self.assertEquals(
            codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"", 1)
        )
        self.assertEquals(
            codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
            (u"", 1)
        )
        self.assertEquals(
            codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
            (u"", 1)
        )
+
    def test_badandgoodreplaceexceptions(self):
        # "replace" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           codecs.replace_errors,
           42
        )
        # "replace" complains about the wrong exception type
        self.assertRaises(
           TypeError,
           codecs.replace_errors,
           UnicodeError("ouch")
        )
        # With the correct exception, "replace" returns u"?" for encoding
        # and u"\ufffd" (REPLACEMENT CHARACTER) for decoding/translating
        self.assertEquals(
            codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"?", 1)
        )
        self.assertEquals(
            codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
            (u"\ufffd", 1)
        )
        self.assertEquals(
            codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
            (u"\ufffd", 1)
        )
+
    def test_badandgoodxmlcharrefreplaceexceptions(self):
        # "xmlcharrefreplace" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           codecs.xmlcharrefreplace_errors,
           42
        )
        # "xmlcharrefreplace" complains about the wrong exception types
        self.assertRaises(
           TypeError,
           codecs.xmlcharrefreplace_errors,
           UnicodeError("ouch")
        )
        # "xmlcharrefreplace" can only be used for encoding
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
        )
        # Use the correct exception: the replacement is the numeric
        # character reference for the unencodable code point
        self.assertEquals(
            codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"&#%d;" % 0x3042, 1)
        )
+
    def test_badandgoodbackslashreplaceexceptions(self):
        # "backslashreplace" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           codecs.backslashreplace_errors,
           42
        )
        # "backslashreplace" complains about the wrong exception types
        self.assertRaises(
           TypeError,
           codecs.backslashreplace_errors,
           UnicodeError("ouch")
        )
        # "backslashreplace" can only be used for encoding
        self.assertRaises(
            TypeError,
            codecs.backslashreplace_errors,
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.backslashreplace_errors,
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
        )
        # Use the correct exception: the replacement uses the shortest
        # \xNN / \uNNNN / \UNNNNNNNN escape that fits the code point
        self.assertEquals(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"\\u3042", 1)
        )
        self.assertEquals(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")),
            (u"\\x00", 1)
        )
        self.assertEquals(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")),
            (u"\\xff", 1)
        )
        self.assertEquals(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")),
            (u"\\u0100", 1)
        )
        self.assertEquals(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")),
            (u"\\uffff", 1)
        )
        if sys.maxunicode>0xffff:
            self.assertEquals(
                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")),
                (u"\\U00010000", 1)
            )
            self.assertEquals(
                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")),
                (u"\\U0010ffff", 1)
            )
+
+ def test_badhandlerresults(self):
+ results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
+ encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
+
+ for res in results:
+ codecs.register_error("test.badhandler", lambda: res)
+ for enc in encs:
+ self.assertRaises(
+ TypeError,
+ codecs.encode,
+ u"\u3042",
+ enc,
+ "test.badhandler"
+ )
+ for (enc, bytes) in (
+ ("ascii", "\xff"),
+ ("utf-8", "\xff"),
+ ("utf-7", "+x-")
+ ):
+ self.assertRaises(
+ TypeError,
+ codecs.decode,
+ bytes,
+ enc,
+ "test.badhandler"
+ )
+
+ def test_lookup(self):
+ self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
+ self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore"))
+ self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
+ self.assertEquals(
+ codecs.xmlcharrefreplace_errors,
+ codecs.lookup_error("xmlcharrefreplace")
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors,
+ codecs.lookup_error("backslashreplace")
+ )
+
    def test_unencodablereplacement(self):
        # A handler whose replacement string is itself unencodable must
        # make the codec fail with UnicodeEncodeError instead of
        # looping or emitting garbage.
        def unencrepl(exc):
            if isinstance(exc, UnicodeEncodeError):
                return (u"\u4242", exc.end)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.unencreplhandler", unencrepl)
        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
            self.assertRaises(
                UnicodeEncodeError,
                codecs.encode,
                u"\u4242",
                enc,
                "test.unencreplhandler"
            )
+
    def test_badregistercall(self):
        # enhance coverage of:
        # Modules/_codecsmodule.c::register_error()
        # Python/codecs.c::PyCodec_RegisterError()
        # missing handler argument, then a non-callable handler
        self.assertRaises(TypeError, codecs.register_error, 42)
        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
+
    def test_unknownhandler(self):
        # enhance coverage of:
        # Modules/_codecsmodule.c::lookup_error()
        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
+
    def test_xmlcharrefvalues(self):
        # enhance coverage of:
        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
        # and inline implementations
        # code points spanning 1..5 decimal digits (plus 6-7 digits on
        # wide builds) to exercise every digit-count branch
        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000)
        if sys.maxunicode>=100000:
            v += (100000, 500000, 1000000)
        s = u"".join([unichr(x) for x in v])
        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
        for enc in ("ascii", "iso-8859-15"):
            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
                codecs.encode(s,enc, err)
+
    def test_decodehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
        # and callers
        self.assertRaises(LookupError, codecs.decode,"\xff", "ascii", "test.unknown")

        def baddecodereturn1(exc):
            # not a tuple at all -> TypeError
            return 42
        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
        self.assertRaises(TypeError, codecs.decode, "\xff", "ascii", "test.baddecodereturn1")
        self.assertRaises(TypeError, codecs.decode, "\\", "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, codecs.decode, "\\x0", "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, codecs.decode, "\\x0y", "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, codecs.decode, "\\Uffffeeee", "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, codecs.decode, "\\uyyyy", "raw-unicode-escape", "test.baddecodereturn1")

        def baddecodereturn2(exc):
            # tuple of the right length but a non-int position -> TypeError
            return (u"?", None)
        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
        self.assertRaises(TypeError, codecs.decode, "\xff", "ascii", "test.baddecodereturn2")

        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        # Valid negative position
        handler.pos = -1
        self.assertEquals(codecs.decode( "\xff0","ascii", "test.posreturn"), u"<?>0")

        # Valid negative position
        handler.pos = -2
        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?><?>")

        # Negative position out of bounds
        handler.pos = -3
        self.assertRaises(IndexError, codecs.decode,"\xff0", "ascii", "test.posreturn")

        # Valid positive position
        handler.pos = 1
        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?>0")

        # Largest valid positive position (one beyond end of input)
        handler.pos = 2
        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?>")

        # Invalid positive position
        handler.pos = 3
        self.assertRaises(IndexError, codecs.decode,"\xff0", "ascii", "test.posreturn")

        # Restart at the "0"
        handler.pos = 6
        self.assertEquals(codecs.decode("\\uyyyy0","raw-unicode-escape", "test.posreturn"), u"<?>0")

        # Errors raised or propagated by the charmap mapping itself
        class D(dict):
            def __getitem__(self, key):
                raise ValueError
        self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None})
        self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D())
        self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: sys.maxunicode+1})
+
+ def test_encodehelper(self):
+ # enhance coverage of:
+ # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
+ # and callers
+ self.assertRaises(LookupError, codecs.decode,u"\xff", "ascii", "test.unknown")
+
+ def badencodereturn1(exc):
+ return 42
+ codecs.register_error("test.badencodereturn1", badencodereturn1)
+ self.assertRaises(TypeError, codecs.decode, u"\xff", "ascii", "test.badencodereturn1")
+
+ def badencodereturn2(exc):
+ return (u"?", None)
+ codecs.register_error("test.badencodereturn2", badencodereturn2)
+ self.assertRaises(TypeError, codecs.decode,u"\xff", "ascii", "test.badencodereturn2")
+
+ handler = PosReturn()
+ codecs.register_error("test.posreturn", handler.handle)
+
+ # Valid negative position
+ handler.pos = -1
+ self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
+
+ # Valid negative position
+ handler.pos = -2
+ self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
+
+ # Negative position out of bounds
+ handler.pos = -3
+ self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
+
+ # Valid positive position
+ handler.pos = 1
+ self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
+
+ # Largest valid positive position (one beyond end of input
+ handler.pos = 2
+ self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>")
+
+ # Invalid positive position
+ handler.pos = 3
+ self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
+
+ handler.pos = 0
+
+ class D(dict):
+ def __getitem__(self, key):
+ raise ValueError
+ for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
+ self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
+ self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
+ self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
+
    def test_translatehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
        # and callers
        # (Unfortunately the errors argument is not directly accessible
        # from Python, so we can't test that much)
        class D(dict):
            def __getitem__(self, key):
                raise ValueError
        self.assertRaises(ValueError, u"\xff".translate, D())
        self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1})
        self.assertRaises(TypeError, u"\xff".translate, {0xff: ()})
+
+ def test_bug828737(self):
+ charmap = {
+ ord("&"): u"&",
+ ord("<"): u"<",
+ ord(">"): u">",
+ ord('"'): u""",
+ }
+
+ for n in (1, 10, 100, 1000):
+ text = u'abc<def>ghi'*n
+ text.translate(charmap)
+
def test_main():
    # Entry point used by CPython's regrtest-style test driver.
    test.test_support.run_unittest(CodecCallbackTest)

if __name__ == "__main__":
    test_main()
Added: pypy/dist/pypy/lib/unicodecodec.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/lib/unicodecodec.py Sat Apr 30 00:00:11 2005
@@ -0,0 +1,931 @@
## Flags indicating whether a UTF-7 character is special, i.e. cannot
## be directly encoded (indexed by the character's ordinal, 0-127):
## 0 - not special
## 1 - special
## 2 - whitespace (optional)
## 3 - RFC2152 Set O (optional)

utf7_special = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
]

# Lazily-filled cache of shared one-character unicode objects for the
# Latin-1 range (see PyUnicode_FromUnicode).  [None] * 256 replaces
# the original 256-iteration append loop.
unicode_latin1 = [None] * 256
+
def PyUnicode_Check(op):
    # The C macro PyUnicode_Check() accepts subclasses of unicode as
    # well; the original "type(op) == unicode" wrongly rejected them
    # (that exact-type test is what PyUnicode_CheckExact is for).
    return isinstance(op, unicode)
def PyUnicode_CheckExact(op):
    # True only when op is exactly a unicode object, not a subclass.
    exact = type(op) == unicode
    return exact
+
+
def PyUnicode_GET_SIZE(op):
    # Length in characters of op coerced to unicode.
    return len(unicode(op))
def PyUnicode_GET_DATA_SIZE(op):
    # "Byte" size of the data.  NOTE(review): len(u' ') is always 1,
    # so this equals GET_SIZE; presumably a stand-in for
    # sizeof(Py_UNICODE) -- confirm against the C macro.
    return len(unicode(op)) * len(u' ')
def PyUnicode_AS_UNICODE(op):
    # Coerce op to a unicode object (stands in for the C macro that
    # yields the raw Py_UNICODE buffer).
    # bug fix: the original fell off the end and returned None, which
    # broke callers such as PyUnicode_AsASCIIString().
    return unicode(op)
def PyUnicode_AS_DATA(op):
    # bug fix: the original built the buffer but never returned it.
    return buffer(unicode(op)) #XXX This is a read only buffer
+
def SPECIAL(c, encodeO, encodeWS):
    # True when character c cannot appear directly in UTF-7 output.
    # encodeWS additionally treats optional whitespace (flag 2) as
    # special; encodeO does the same for RFC2152 Set O (flag 3).
    code = ord(c)
    if code > 127 or utf7_special[code] == 1:
        return True
    if encodeWS and utf7_special[code] == 2:
        return True
    return bool(encodeO and utf7_special[code] == 3)
def B64(n):
    # Map the low six bits of n to the modified-base64 alphabet.
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    return alphabet[n & 0x3f]
def B64CHAR(c):
    # True when c belongs to the modified-base64 alphabet.
    # bug fix: the original called a non-existent global isalnum();
    # the string method is what the C isalnum() translates to.
    return c.isalnum() or c == '+' or c == '/'
def UB64(c):
    # Inverse of B64: map a base64 character back to its 6-bit value.
    if c == '+':
        return 62
    if c == '/':
        return 63
    if c >= 'a':            # 'a'..'z' -> 26..51
        return ord(c) - 71
    if c >= 'A':            # 'A'..'Z' -> 0..25
        return ord(c) - 65
    return ord(c) + 4       # '0'..'9' -> 52..61
+
def ENCODE(out, ch, bits):
    """Emit complete 6-bit groups from the shift buffer *ch* into *out*.

    Returns (out, ch, bits) with *out* still being the list: the
    original returned ''.join(out), but every caller keeps calling
    out.append() on the result afterwards, which fails on a string;
    the final join happens once in PyUnicode_EncodeUTF7.
    """
    while bits >= 6:
        out.append(B64(ch >> (bits - 6)))
        bits -= 6
    return out, ch, bits
+
def DECODE(out, ch, bits, surrogate):
    # Drain complete 16-bit units from the UTF-7 shift buffer into out.
    # NOTE(review): callers accumulate ch as an int, so ord(ch) below
    # raises TypeError; outCh (a unicode string) is compared against
    # int bounds in the surrogate check; and the ''.join(out) return
    # value breaks callers that continue to out.append() -- compare
    # with the C original in Objects/unicodeobject.c before relying on
    # this path.
    while (bits >= 16):
        outCh = unicode (chr((ord(ch) >> (bits-16)) & 0xffff))
        bits -= 16
        if (surrogate):
            ## We have already generated an error for the high surrogate
            ## so let's not bother seeing if the low surrogate is correct or not
            surrogate = 0
        elif (0xDC00 <= outCh and outCh <= 0xDFFF):
            ## This is a surrogate pair. Unfortunately we can't represent
            ## it in a 16-bit character
            surrogate = 1
            raise UnicodeDecodeError,"code pairs are not supported"
        else:
            out.append( outCh )
    return ''.join(out),ch,bits,surrogate
+
def PyUnicode_DecodeUTF7(s, size, errors):
    # Decode the UTF-7 string s[:size]; translation of the C codec in
    # Objects/unicodeobject.c.  NOTE(review): indentation below is
    # reconstructed from the C original (the archived diff flattened
    # it); `sizeof` is an undefined name, and the "unterminated shift
    # sequence" branch still contains untranslated C pointer
    # arithmetic -- confirm against _codecsmodule.c before trusting
    # the error paths.
    starts = s
    errmsg = ""
    inShift = 0
    bitsleft = 0
    charsleft = 0
    surrogate = 0
    p = []
    errorHandler = None
    exc = None

    if (size == 0):
        return unicode('')
    i = 0
    while i < size:

        ch = s[i]
        if (inShift):
            if ((ch == '-') or not B64CHAR(ch)):
                inShift = 0
                i += 1
                p, charsleft, bitsleft, surrogate =  DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6):
##                    /* The shift sequence has a partial character in it. If
##                       bitsleft < 6 then we could just classify it as padding
##                       but that is not the case here */
                    raise UnicodeDecodeError, "partial character in shift sequence"
##                /* According to RFC2152 the remaining bits should be zero. We
##                   choose to signal an error/insert a replacement character
##                   here so indicate the potential of a misencoded character. */

##                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft and (charsleft << (sizeof(charsleft) * 8 - bitsleft))):
                    raise UnicodeDecodeError, "non-zero padding bits in shift sequence"
                if (ch == '-') :
                    if ((i < size) and (s[i] == '-')) :
                        p.append( '-')
                        inShift = 1

                elif SPECIAL(ch,0,0) :
                    raise UnicodeDecodeError,"unexpected special character"

                else:
                    p.append( ch )
            else:
                charsleft = (charsleft << 6) | UB64(ch)
                bitsleft += 6
                i+=1
##                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
        elif ( ch == '+' ):
            startinpos = i
            i+=1
            if (i<size and s[i] == '-'):
                i+=1
                p.append( '+')
            else:
                inShift = 1
                bitsleft = 0

        elif (SPECIAL(ch,0,0)):
            i+=1
            raise UnicodeDecodeError,"unexpected special character"
        else:
            p.append( ch )
            i+=1

    if (inShift) :
        # NOTE(review): leftover C pointer arithmetic; p is a list here
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        raise UnicodeDecodeError, "unterminated shift sequence"

    return unicode(''.join(p))
+
def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors):
    # Encode s as UTF-7; translation of the C codec.  NOTE(review):
    # indentation is reconstructed from the C original; i is only
    # advanced inside the shifted branch although it is meant to track
    # the position of ch in s for the ch2 lookahead; charsleft is
    # assigned a character but then used in integer shift/or
    # operations -- confirm against the C source before relying on
    # shifted sequences.

#    /* It might be possible to tighten this worst case */
    inShift = 0
    i = 0
    bitsleft = 0
    charsleft = 0
    out = []
    for ch in s:
        if (not inShift) :
            if (ch == '+'):
                # '+' is escaped as the literal sequence "+-"
                out.append( '+')
                out.append( '-')
            elif (SPECIAL(ch, encodeSetO, encodeWhiteSpace)):
                charsleft = ch
                bitsleft = 16
                out.append('+')
                out, charsleft, bitsleft =  ENCODE(out, charsleft, bitsleft)
                inShift = bitsleft > 0
            else:
                out.append(ch)
        else:
            if (not SPECIAL(ch, encodeSetO, encodeWhiteSpace)):
                out.append(B64(charsleft << (6-bitsleft)))
                charsleft = 0
                bitsleft = 0
##                /* Characters not in the BASE64 set implicitly unshift the sequence
##                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) or ch == '-'):
                    out.append('-')
                inShift = 0
                out.append(ch)
            else:
                bitsleft += 16
                charsleft = (charsleft << 16) | ch
                out, charsleft, bitsleft =  ENCODE(out, charsleft, bitsleft)

##                /* If the next character is special then we dont' need to terminate
##                   the shift sequence. If the next character is not a BASE64 character
##                   or '-' then the shift sequence will be terminated implicitly and we
##                   don't have to insert a '-'. */

                if (bitsleft == 0):
                    if (i + 1 < size):
                        ch2 = s[i+1]

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)):
                            pass
                        elif (B64CHAR(ch2) or ch2 == '-'):
                            out.append( '-')
                            inShift = 0
                        else:
                            inShift = 0
                    else:
                        out.append( '-')
                        inShift = 0
                i+=1

    if (bitsleft):
        # flush any pending bits and close the shift sequence
        out.append(B64(charsleft << (6-bitsleft) ))
        out.append( '-')

    return ''.join(out)
+
+def PyUnicode_FromOrdinal(ordinal):
+
+ if (ordinal < 0 or ordinal > 0x10ffff):
+ raise ValueError, "unichr() arg not in range(0x110000) (wide Python build)"
+
+## if (ordinal < 0 or ordinal > 0xffff):
+## raise ValueError, "unichr() arg not in range(0x1000) (narrow Python build)"
+
+ s = unichr(ordinal)
+ return s,1
+
def PyUnicode_FromObject(obj):
    # Coerce an arbitrary object to a true (non-subclass) unicode object.

##    /* XXX Perhaps we should make this API an alias of
##       PyObject_Unicode() instead ?! */
    if (PyUnicode_CheckExact(obj)):
        return obj

    if (PyUnicode_Check(obj)):
##        /* For a Unicode subtype that's not a Unicode object,
##           return a true Unicode object with the same data. */
        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),PyUnicode_GET_SIZE(obj))
    # anything else: decode its str() form with the default encoding
    return PyUnicode_FromEncodedObject(obj, None, "strict")
+
unicode_empty=u''  # shared singleton for the empty unicode string

def PyUnicode_FromUnicode(u, size):

##    /* If the Unicode data is known at construction time, we can apply
##       some optimizations which share commonly used objects. */
    if (u):

##        /* Optimization for empty strings */
        if (size == 0 and unicode_empty != None) :
            return unicode_empty

        ## /* Single character Unicode objects in the Latin-1 range are
        ##    shared when using this constructor */
        if (size == 1 and ord(u) < 256) :
            result = unicode_latin1[ord(u)]
            if (not result):
                result = unicode(u)
                unicode_latin1[ord(u)] = result
            if (not result):
                return None
            return result
        return unicode(u)
    # NOTE(review): when u is false (None or empty) control falls off
    # the end and returns None; the C _PyUnicode_New allocates a fresh
    # object here -- confirm intended behaviour.
+
def PyUnicode_Decode(s,size,encoding,errors):
    # Decode s[:size] with *encoding*; mirrors C PyUnicode_Decode.

    if (encoding == None):
        encoding = PyUnicode_GetDefaultEncoding()

##    /* Shortcuts for common default encodings */
    # NOTE(review): `encodings` here is presumably a module-level dict
    # of shortcut decoders, not the stdlib package -- confirm.
    decoder = encodings.get(encoding,None)
    if decoder:
        return decoder(s,encoding,errors)
##    /* Decode via the codec registry */
    buf = buffer(s)
    result = PyCodec_Decode(buf, encoding, errors);
    if (not PyUnicode_Check(result)):
        raise UnicodeDecodeError, "decoder did not return an unicode object (type=%.400s)"%type(result)
    return result
+
def PyUnicode_FromEncodedObject(obj, encoding,errors):
    # Coerce obj to a byte string via str(), then decode it with the
    # given encoding and error policy.
    s = str(obj)
    v = PyUnicode_Decode(s, len(s), encoding, errors)
    return v
+
def PyUnicode_DecodeASCII(s, size, errors):
    """Decode the ASCII string s[:size] to a unicode object.

    Only strict behaviour is implemented so far: any byte >= 128
    raises UnicodeDecodeError (*errors* is accepted for interface
    compatibility but not yet dispatched to a handler).
    """
#   /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 and ord(s) < 128) :
        return PyUnicode_FromUnicode(unicode(s), 1)
    if (size == 0):
        return unicode('')
    p = []
    for pos, c in enumerate(s):
        if ord(c) < 128:
            p.append(c)
        else:
            # bug fix: the original built the exception without raising
            # it (so bad bytes were silently dropped) and passed wrong
            # constructor arguments; s.index(c) also misreported the
            # position for duplicate bytes.
            raise UnicodeDecodeError("ascii", s, pos, pos + 1,
                                     "ordinal not in range(128)")
    return unicode(''.join(p))
+
def PyUnicode_EncodeASCII(p,size,errors):
    # ASCII encoding is Latin-1 restricted to the first 128 code
    # points; delegate to the generic ucs1 encoder with limit 128.
    return unicode_encode_ucs1(p, size, errors, 128)
+
def PyUnicode_AsASCIIString(unistr):
    """Encode a unicode object to an ASCII string with strict errors.

    Raises TypeError when *unistr* is not a unicode object.
    """
    if not type(unistr) == unicode:
        # bug fix: the original raised an undefined name
        # (BadArgumnentError); TypeError is what CPython raises for a
        # bad argument here.
        raise TypeError("expected a unicode object")
    # bug fix: len(unistr), not len(unicode) -- the original measured
    # the length of the type's name.
    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unistr),
                                 len(unistr),
                                 None)
+
+##def PyUnicode_DecodeUTF16Stateful(s,size,errors,byteorder,consumed):
+##
+## bo = 0; /* assume native ordering by default */
+## errmsg = "";
+## /* Offsets from q for retrieving byte pairs in the right order. */
+###ifdef BYTEORDER_IS_LITTLE_ENDIAN
+## int ihi = 1, ilo = 0;
+###else
+## int ihi = 0, ilo = 1;
+###endif
+## PyObject *errorHandler = NULL;
+## PyObject *exc = NULL;
+##
+## /* Note: size will always be longer than the resulting Unicode
+## character count */
+## unicode = _PyUnicode_New(size);
+## if (!unicode)
+## return NULL;
+## if (size == 0)
+## return (PyObject *)unicode;
+##
+## /* Unpack UTF-16 encoded data */
+## p = unicode->str;
+## q = (unsigned char *)s;
+## e = q + size;
+##
+## if (byteorder)
+## bo = *byteorder;
+##
+## /* Check for BOM marks (U+FEFF) in the input and adjust current
+## byte order setting accordingly. In native mode, the leading BOM
+## mark is skipped, in all other modes, it is copied to the output
+## stream as-is (giving a ZWNBSP character). */
+## if (bo == 0) {
+## if (size >= 2) {
+## const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
+###ifdef BYTEORDER_IS_LITTLE_ENDIAN
+## if (bom == 0xFEFF) {
+## q += 2;
+## bo = -1;
+## }
+## else if (bom == 0xFFFE) {
+## q += 2;
+## bo = 1;
+## }
+###else
+## if (bom == 0xFEFF) {
+## q += 2;
+## bo = 1;
+## }
+## else if (bom == 0xFFFE) {
+## q += 2;
+## bo = -1;
+## }
+###endif
+## }
+## }
+##
+## if (bo == -1) {
+## /* force LE */
+## ihi = 1;
+## ilo = 0;
+## }
+## else if (bo == 1) {
+## /* force BE */
+## ihi = 0;
+## ilo = 1;
+## }
+##
+## while (q < e) {
+## Py_UNICODE ch;
+## /* remaining bytes at the end? (size should be even) */
+## if (e-q<2) {
+## if (consumed)
+## break;
+## errmsg = "truncated data";
+## startinpos = ((const char *)q)-starts;
+## endinpos = ((const char *)e)-starts;
+## goto utf16Error;
+## /* The remaining input chars are ignored if the callback
+## chooses to skip the input */
+## }
+## ch = (q[ihi] << 8) | q[ilo];
+##
+## q += 2;
+##
+## if (ch < 0xD800 || ch > 0xDFFF) {
+## *p++ = ch;
+## continue;
+## }
+##
+## /* UTF-16 code pair: */
+## if (q >= e) {
+## errmsg = "unexpected end of data";
+## startinpos = (((const char *)q)-2)-starts;
+## endinpos = ((const char *)e)-starts;
+## goto utf16Error;
+## }
+## if (0xD800 <= ch && ch <= 0xDBFF) {
+## Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
+## q += 2;
+## if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+###ifndef Py_UNICODE_WIDE
+## *p++ = ch;
+## *p++ = ch2;
+###else
+## *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+###endif
+## continue;
+## }
+## else {
+## errmsg = "illegal UTF-16 surrogate";
+## startinpos = (((const char *)q)-4)-starts;
+## endinpos = startinpos+2;
+## goto utf16Error;
+## }
+##
+## }
+## errmsg = "illegal encoding";
+## startinpos = (((const char *)q)-2)-starts;
+## endinpos = startinpos+2;
+## /* Fall through to report the error */
+##
+## utf16Error:
+## outpos = p-PyUnicode_AS_UNICODE(unicode);
+## if (unicode_decode_call_errorhandler(
+## errors, &errorHandler,
+## "utf16", errmsg,
+## starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
+## (PyObject **)&unicode, &outpos, &p))
+## goto onError;
+## }
+##
+## if (byteorder)
+## *byteorder = bo;
+##
+## if (consumed)
+## *consumed = (const char *)q-starts;
+##
+## /* Adjust length */
+## if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+## goto onError;
+##
+## Py_XDECREF(errorHandler);
+## Py_XDECREF(exc);
+## return (PyObject *)unicode;
+##
+##onError:
+## Py_DECREF(unicode);
+## Py_XDECREF(errorHandler);
+## Py_XDECREF(exc);
+## return NULL;
+##}
+
def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
    """Encode the first *size* characters of s as a UTF-16 byte string.

    byteorder may be 0 (native order, a BOM is prepended), -1 or
    'little' (little endian), 1 or 'big' (big endian).  Ordinals at or
    above 0x10000 are split into surrogate pairs.
    """
    import sys

    def STORECHAR(ch, order):
        # Return the two bytes of ch in the requested order.
        hi = chr((ch >> 8) & 0xff)
        lo = chr(ch & 0xff)
        if order == 'little':
            return [lo, hi]
        else:
            return [hi, lo]

    # BUGFIX: resolve the requested byte order up front.  The original
    # only recognised the integer codes 0/-1/1, so the string values
    # 'little'/'big' -- including the default argument -- silently fell
    # through to the machine's native order.
    if byteorder == 0:
        bom = sys.byteorder
    elif byteorder == -1:
        bom = 'little'
    elif byteorder == 1:
        bom = 'big'
    else:
        bom = byteorder

    p = []
    if byteorder == 0:
        # Native order: prepend the byte order mark U+FEFF.
        p.extend(STORECHAR(0xFEFF, bom))

    if size == 0:
        # BUGFIX: return the BOM (if one was written) instead of
        # discarding it with a bare "".
        return ''.join(p)

    for c in s:
        ch = ord(c)
        ch2 = 0
        if ch >= 0x10000:
            # Split into a UTF-16 surrogate pair.
            ch2 = 0xDC00 | ((ch - 0x10000) & 0x3FF)
            ch = 0xD800 | ((ch - 0x10000) >> 10)

        p.extend(STORECHAR(ch, bom))
        if ch2:
            p.extend(STORECHAR(ch2, bom))

    return ''.join(p)
+
+
def PyUnicode_DecodeMBCS(s, size, errors):
    """Decode s using the Windows MBCS (CP_ACP) codec.

    Not implemented yet -- always returns None.  The C original (based
    on MultiByteToWideChar) is kept below as commented-out reference.
    """
    pass
+##{
+## PyUnicodeObject *v;
+## Py_UNICODE *p;
+##
+## /* First get the size of the result */
+## DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+## if (size > 0 && usize==0)
+## return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+##
+## v = _PyUnicode_New(usize);
+## if (v == NULL)
+## return NULL;
+## if (usize == 0)
+## return (PyObject *)v;
+## p = PyUnicode_AS_UNICODE(v);
+## if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+## Py_DECREF(v);
+## return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+## }
+##
+## return (PyObject *)v;
+##}
+
+##def PyUnicode_EncodeMBCS(p, size, errors):
+##
+#### /* If there are no characters, bail now! */
+## if (size==0)
+## return ""
+## from ctypes import *
+## WideCharToMultiByte = windll.kernel32.WideCharToMultiByte
+#### /* First get the size of the result */
+## mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, s, 0, None, None);
+## if (mbcssize==0)
+## raise UnicodeEncodeError, "Windows cannot decode the string %s" %p
+### More error handling required (check windows errors and such)
+##
+### /* Do the conversion */
+#### s = ' '*mbcssize
+#### if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)):
+#### raise UnicodeEncodeError, "Windows cannot decode the string %s" %p
+## return s
def PyUnicode_DecodeUTF8(s, size, errors):
    """Decode the first *size* bytes of s as UTF-8.

    Delegates to the stateful variant with no 'consumed' out-parameter.
    """
    # BUGFIX: 'NULL' is not a Python name; the C NULL maps to None.
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, None)
+
def PyUnicode_DecodeUTF8Stateful(s, size, errors, consumed):
    """Stateful UTF-8 decoder (consumed reports bytes actually decoded).

    Not implemented yet -- always returns None.  The C original is kept
    below as commented-out reference.
    """
    pass
+##{
+## const char *starts = s;
+## int n;
+## int startinpos;
+## int endinpos;
+## int outpos;
+## const char *e;
+## PyUnicodeObject *unicode;
+## Py_UNICODE *p;
+## const char *errmsg = "";
+## PyObject *errorHandler = NULL;
+## PyObject *exc = NULL;
+##
+## /* Note: size will always be longer than the resulting Unicode
+## character count */
+## unicode = _PyUnicode_New(size);
+## if (!unicode)
+## return NULL;
+## if (size == 0) {
+## if (consumed)
+## *consumed = 0;
+## return (PyObject *)unicode;
+## }
+##
+## /* Unpack UTF-8 encoded data */
+## p = unicode->str;
+## e = s + size;
+##
+## while (s < e) {
+## Py_UCS4 ch = (unsigned char)*s;
+##
+## if (ch < 0x80) {
+## *p++ = (Py_UNICODE)ch;
+## s++;
+## continue;
+## }
+##
+## n = utf8_code_length[ch];
+##
+## if (s + n > e) {
+## if (consumed)
+## break;
+## else {
+## errmsg = "unexpected end of data";
+## startinpos = s-starts;
+## endinpos = size;
+## goto utf8Error;
+## }
+## }
+##
+## switch (n) {
+##
+## case 0:
+## errmsg = "unexpected code byte";
+## startinpos = s-starts;
+## endinpos = startinpos+1;
+## goto utf8Error;
+##
+## case 1:
+## errmsg = "internal error";
+## startinpos = s-starts;
+## endinpos = startinpos+1;
+## goto utf8Error;
+##
+## case 2:
+## if ((s[1] & 0xc0) != 0x80) {
+## errmsg = "invalid data";
+## startinpos = s-starts;
+## endinpos = startinpos+2;
+## goto utf8Error;
+## }
+## ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+## if (ch < 0x80) {
+## startinpos = s-starts;
+## endinpos = startinpos+2;
+## errmsg = "illegal encoding";
+## goto utf8Error;
+## }
+## else
+## *p++ = (Py_UNICODE)ch;
+## break;
+##
+## case 3:
+## if ((s[1] & 0xc0) != 0x80 ||
+## (s[2] & 0xc0) != 0x80) {
+## errmsg = "invalid data";
+## startinpos = s-starts;
+## endinpos = startinpos+3;
+## goto utf8Error;
+## }
+## ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+## if (ch < 0x0800) {
+## /* Note: UTF-8 encodings of surrogates are considered
+## legal UTF-8 sequences;
+##
+## XXX For wide builds (UCS-4) we should probably try
+## to recombine the surrogates into a single code
+## unit.
+## */
+## errmsg = "illegal encoding";
+## startinpos = s-starts;
+## endinpos = startinpos+3;
+## goto utf8Error;
+## }
+## else
+## *p++ = (Py_UNICODE)ch;
+## break;
+##
+## case 4:
+## if ((s[1] & 0xc0) != 0x80 ||
+## (s[2] & 0xc0) != 0x80 ||
+## (s[3] & 0xc0) != 0x80) {
+## errmsg = "invalid data";
+## startinpos = s-starts;
+## endinpos = startinpos+4;
+## goto utf8Error;
+## }
+## ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+## ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+## /* validate and convert to UTF-16 */
+## if ((ch < 0x10000) /* minimum value allowed for 4
+## byte encoding */
+## || (ch > 0x10ffff)) /* maximum value allowed for
+## UTF-16 */
+## {
+## errmsg = "illegal encoding";
+## startinpos = s-starts;
+## endinpos = startinpos+4;
+## goto utf8Error;
+## }
+###ifdef Py_UNICODE_WIDE
+## *p++ = (Py_UNICODE)ch;
+###else
+## /* compute and append the two surrogates: */
+##
+## /* translate from 10000..10FFFF to 0..FFFF */
+## ch -= 0x10000;
+##
+## /* high surrogate = top 10 bits added to D800 */
+## *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
+##
+## /* low surrogate = bottom 10 bits added to DC00 */
+## *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
+###endif
+## break;
+##
+## default:
+## /* Other sizes are only needed for UCS-4 */
+## errmsg = "unsupported Unicode code range";
+## startinpos = s-starts;
+## endinpos = startinpos+n;
+## goto utf8Error;
+## }
+## s += n;
+## continue;
+##
+## utf8Error:
+## outpos = p-PyUnicode_AS_UNICODE(unicode);
+## if (unicode_decode_call_errorhandler(
+## errors, &errorHandler,
+## "utf8", errmsg,
+## starts, size, &startinpos, &endinpos, &exc, &s,
+## (PyObject **)&unicode, &outpos, &p))
+## goto onError;
+## }
+## if (consumed)
+## *consumed = s-starts;
+##
+## /* Adjust length */
+## if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+## goto onError;
+##
+## Py_XDECREF(errorHandler);
+## Py_XDECREF(exc);
+## return (PyObject *)unicode;
+##
+##onError:
+## Py_XDECREF(errorHandler);
+## Py_XDECREF(exc);
+## Py_DECREF(unicode);
+## return NULL;
+##}
+##
+##/* Allocation strategy: if the string is short, convert into a stack buffer
+## and allocate exactly as much space needed at the end. Else allocate the
+## maximum possible needed (4 result bytes per Unicode character), and return
+## the excess memory at the end.
+##*/
+
def _encode_ucs4(ch):
    # Four-byte UTF-8 sequence for an ordinal >= 0x10000.
    return [chr(0xf0 | (ch >> 18)),
            chr(0x80 | ((ch >> 12) & 0x3f)),
            chr(0x80 | ((ch >> 6) & 0x3f)),
            chr(0x80 | (ch & 0x3f))]

def PyUnicode_EncodeUTF8(s, size, errors):
    """Encode the first *size* characters of s as a UTF-8 byte string.

    A high/low surrogate pair is combined into a single code point;
    ordinals at or above 0x10000 (wide builds) get a four-byte
    sequence.  Isolated surrogates are encoded as-is in three bytes.
    """
    # BUGFIX: the original asserted against the undefined C name NULL.
    assert s is not None
    assert size >= 0
    p = []
    i = 0
    while i < size:
        # BUGFIX: work on the ordinal throughout.  The original mixed
        # one-character strings with integer bit operations, raising
        # TypeError for any non-ASCII input, and its surrogate test
        # compared ints against strings so pairs were never combined.
        ch = ord(s[i])
        i += 1
        if ch < 0x80:
            # Encode ASCII
            p.append(chr(ch))
        elif ch < 0x0800:
            # Two-byte sequence
            p.append(chr(0xc0 | (ch >> 6)))
            p.append(chr(0x80 | (ch & 0x3f)))
        elif ch < 0x10000:
            # Special case: a high surrogate followed by a low
            # surrogate combines into a single UCS4 code point.
            if 0xD800 <= ch <= 0xDBFF and i != size:
                ch2 = ord(s[i])
                if 0xDC00 <= ch2 <= 0xDFFF:
                    ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
                    i += 1
                    p.extend(_encode_ucs4(ch))
                    continue
            # Three-byte sequence; isolated surrogates fall through.
            p.append(chr(0xe0 | (ch >> 12)))
            p.append(chr(0x80 | ((ch >> 6) & 0x3f)))
            p.append(chr(0x80 | (ch & 0x3f)))
        else:
            # BUGFIX: ordinals >= 0x10000 need a four-byte sequence;
            # the original emitted a truncated three-byte one.
            p.extend(_encode_ucs4(ch))
    return ''.join(p)
+
def encodeUCS4(ch):
    """Return the four-byte UTF-8 encoding of the UCS4 ordinal *ch*
    as a list of one-character strings."""
    return [chr(0xf0 | (ch >> 18)),
            chr(0x80 | ((ch >> 12) & 0x3f)),
            chr(0x80 | ((ch >> 6) & 0x3f)),
            chr(0x80 | (ch & 0x3f))]
+
+#/* --- Latin-1 Codec ------------------------------------------------------ */
+
def PyUnicode_DecodeLatin1(s, size, errors):
    """Decode s as Latin-1 (first 256 Unicode ordinals).

    Not implemented yet -- always returns None.  The C original is kept
    below as commented-out reference.
    """
    pass
+##{
+## PyUnicodeObject *v;
+## Py_UNICODE *p;
+##
+## /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
+## if (size == 1) {
+## Py_UNICODE r = *(unsigned char*)s;
+## return PyUnicode_FromUnicode(&r, 1);
+## }
+##
+## v = _PyUnicode_New(size);
+## if (v == NULL)
+## goto onError;
+## if (size == 0)
+## return (PyObject *)v;
+## p = PyUnicode_AS_UNICODE(v);
+## while (size-- > 0)
+## *p++ = (unsigned char)*s++;
+## return (PyObject *)v;
+##
+## onE rror:
+## Py_XDECREF(v);
+## return NULL;
+##}
+from pypy.lib._codecs import lookup_error
+
def unicode_encode_ucs1(p, size, errors, limit):
    """Encode p into a single-byte string.

    limit is 256 for latin-1 and 128 for ascii; characters at or above
    it are reported (one run at a time) to the error handler selected
    by *errors*.
    """
    if limit == 256:
        reason = "ordinal not in range(256)"
        encoding = "latin-1"
    else:
        reason = "ordinal not in range(128)"
        encoding = "ascii"

    if (size == 0):
        return ''
    res = []
    pos = 0
    while pos < size:
        ch = p[pos]
        if ord(ch) < limit:
            res.append(ch)
            pos += 1
            continue
        # BUGFIX: the original located the error with p.index(ch),
        # which finds the FIRST occurrence of that character (wrong for
        # repeats), and invoked the handler once per bad character
        # instead of once per contiguous run.
        collstart = pos
        collend = pos
        # Extend collend over the whole run of unencodable characters.
        while collend < size and ord(p[collend]) >= limit:
            collend += 1
        handler = lookup_error(errors)
        x = handler(UnicodeEncodeError(encoding, p, collstart, collend, reason))
        # NOTE(review): x is appended as-is, as in the original;
        # standard codec error handlers return a (replacement, newpos)
        # tuple -- confirm against pypy.lib._codecs.lookup_error.
        res.append(x)
        pos = collend
    return ''.join(res)
+
def PyUnicode_EncodeLatin1(p, size, errors):
    """Encode p as Latin-1 via the generic UCS1 encoder (limit 256)."""
    latin1_limit = 256
    return unicode_encode_ucs1(p, size, errors, latin1_limit)
+
def PyUnicode_EncodeRawUnicodeEscape(s, size):
    """Encode s in raw-unicode-escape form.

    Ordinals >= 0x10000 become '\\Uxxxxxxxx', ordinals >= 256 become
    '\\uxxxx', everything else is copied as-is.
    """
    if (size == 0):
        return u''

    p = []
    for ch in s:
        code = ord(ch)
        # BUGFIX: the original used hex(), which yields '0x2f...' --
        # neither zero-padded nor in \uxxxx/\Uxxxxxxxx digit form.
        if code >= 0x10000:
            # Map 32-bit characters to '\Uxxxxxxxx'
            p.append('\\U%08x' % code)
        elif code >= 256:
            # Map 16-bit characters to '\uxxxx'
            p.append('\\u%04x' % code)
        else:
            # Copy everything else as-is
            p.append(ch)

    # BUGFIX: dropped the trailing '\0' -- a C string terminator that
    # was being appended into the Python result.
    return ''.join(p)
+
def charmapencode_output(c, mapping, outobj, outpos):
    # Incomplete helper stub: looks up c in mapping (raising KeyError
    # for an unmapped character) but discards the result and returns
    # None.  outobj/outpos are unused -- presumably output-buffer
    # plumbing carried over from the C original; TODO confirm the
    # intended contract before using this.
    rep = mapping[c]
+
def PyUnicode_EncodeCharmap(p, size, mapping='latin-1', errors='strict'):
    """Encode p using a character mapping table.

    mapping maps ordinals to replacement strings; the default
    'latin-1' delegates to the Latin-1 codec.  Characters missing from
    the mapping are reported to the error handler for *errors*.
    """
    # Default to Latin-1
    if mapping == 'latin-1':
        return PyUnicode_EncodeLatin1(p, size, errors)
    if (size == 0):
        return ''
    # BUGFIX: removed a leftover debug 'print' statement.
    inpos = 0
    res = []
    while (inpos < size):
        # Try to encode the current character.
        try:
            res.append(mapping[ord(p[inpos])])
        except KeyError:
            handler = lookup_error(errors)
            handler(UnicodeEncodeError("charmap", p, inpos, inpos + 1,
                                       "character maps to <undefined>"))
            # NOTE(review): the handler's replacement is discarded, as
            # in the original -- confirm whether it should be appended.
        # BUGFIX: advance unconditionally.  The original advanced only
        # on success, so a non-raising handler (e.g. 'ignore') spun
        # forever on the same position.
        inpos += 1

    return ''.join(res)
+
+
# Shortcut table used by PyUnicode_Decode: maps common encoding names
# straight to their decoder functions, bypassing the codec registry.
encodings = { "utf-8" : PyUnicode_DecodeUTF8,
              "latin-1" : PyUnicode_DecodeLatin1,
              "mbcs" : PyUnicode_DecodeMBCS,
              "ascii" : PyUnicode_DecodeASCII,
              }
More information about the Pypy-commit
mailing list