[Python-3000] Adaptation vs. Generic Functions

Sun Apr 9 23:22:28 CEST 2006

"""This is my attempt to rewrite pprint.py to be based on protocols. Or
overloading or specialization or whatever they end up being called ;-). My
first attempt only touched about 20 lines of the original pprint and probably
had 90% of the usability gain that this one does, but for some reason I decided
that wasn't enough work, and I went back and tried to make it completely
extensible.

This took far too much time, as touching one part of the code led to another
part of the code until I had ripped the whole thing apart. Then I had to figure
out how to put it back together sensibly. My first,
simplest-thing-that-could-possibly-work version was slow: about 20x slower
than the original pprint.py for pformat, based on pprint._perfcheck. This led
to another round of reorganization and optimization. The current version is
slightly faster than the original for _perfcheck, but slightly slower for
saferepr.

Let me add this cautionary note -- this is primarily intended for
illustrative purposes. There's a good chance it has bugs. It does, however,
pass test_pprint except for test_subclassing, which it can't really be
expected to pass.

Rather than having you scroll all through this long message/file, I'll instead
place all the examples up here at the top. Assuming that you save this as
pprint2.py, you should be able to run the examples just by executing the file.

"""

if __name__ == "__main__":
    import pprint2

    # Here's that same old object that I stole Fred Lundh's librarybook.

    data = (
    "this is a string", [1, 2, 3, 4], ("more tuples",
    1.0, 2.3, 4.5), "this is yet another string"
    )

    # [1] You can use pprint just as before.

    pprint2.pprint(data)
    print
    # ('this is a string',
    #  [1, 2, 3, 4],
    #  ('more tuples', 1.0, 2.3, 4.5),
    #  'this is yet another string')

    # [2] However, the core object is actually pprinter, which by 
default returns
    # a string instead of printing to stdout.

    print pprint2.pprinter(data)
    print
    # Same as for [1]

    # [3] Let's extend pprinter so that integers are printed in hex. We use
    # register_simple because we don't care about all of the details 
regarding
    # indentation and recursion. register_simple wants a function that takes
    # a single object and returns a string.

    mypprinter = pprint2.PrettyPrinter(pprint2.pprinter)
    print mypprinter(data)
    print
    # Still the same as for [1]

    @mypprinter.register_simple(int)
    @mypprinter.register_simple(long)
    def format_int_as_hex(obj):
        return hex(obj)

    print mypprinter(data)
    print
    # ('this is a string',
    #  [0x1, 0x2, 0x3, 0x4],
    #  ('more tuples', 1.0, 2.3, 4.5),
    #  'this is yet another string')

    # Pretty cool! Note that pprinter itself is unchanged.

    print pprint2.pprinter(data)
    print
    # Still the same as for [1].

    # Ok, just to finish things off, let's override something using the full
    # interface. This interface is not fully documented or even fully
    # understood (I copied it from pprint.py, but I'm sure it warped in
    # transition.
    #
    # This example is pretty useless, but at least you get to see full
    # overriding. Note that this function must return an iterable.
    # For real examples, see the registrations at the bottom of the program.
    # (Although be warned, they can be messy!)

    @mypprinter.register(list)
    def format_list(obj, context, indent, allowance, level):
        yield ("<list @%s [indent=%s, allowance=%s, level=%s]>" %
               (id(obj), indent, allowance, level))

    print mypprinter(data)
    print
    # ('this is a string',
    #  <list @12020992 [indent=1, allowance=1, level=3]>,
    #  ('more tuples', 1.0, 2.3, 4.5),
    #  'this is yet another string')

    # I'm not sure why level is 3 instead of 2, that my be a buglet. I'm not
    # going to track it down right now.

    # OK, that's it for now. Back to my normally scheduled life!

"""
I include Registry and Protocol here since the Protocol implementation keeps
changing. Tracking down a compatible implementation would probably be hard.

I use Registries instead of dicts for my Protocol class. So far, I like this
approach. It makes runtime extension of Protocols simple while exiling the
increased complexity to the Registry class.

While I like the semantics, the implementation is questionable. As currently
implemented, lookups are potentially slow while setting and deleting are fast.
This is exactly backwards! This could be reimplemented with the same semantics
using some sort of view observer magic as suggested by Nick Coghlan.
"""

# Unique sentinels: compared by identity only, so they can never collide with
# a value a caller might legitimately store or pass as a default.
_missing, _unknown = object(), object()

class Registry(dict):
    """A dict-like object that forwards item requests to its parents if needed.

    Keys stored directly in this registry win. Otherwise each parent is
    consulted in the order given to the constructor, and the first hit is
    returned. Parents that are themselves Registry instances are searched
    recursively; any other parent only needs a ``get(key, default)`` method.

    Only lookups (``in``, ``[]`` and ``get``) consult the parents; all other
    dict operations act on this registry's own items.
    """
    def __init__(self, *parents):
        dict.__init__(self)
        # Kept as a mutable list so parents can be added or removed later.
        self.parents = list(parents)

    def __contains__(self, key):
        """Return True if key is found in this registry or any parent."""
        return self.get(key, _missing) is not _missing

    def __getitem__(self, key):
        """Return the value for key, searching parents; raise KeyError if absent."""
        x = self._get(key, set())
        if x is _missing:
            # Report the missing key, as a plain dict does.
            raise KeyError(key)
        return x

    def _get(self, key, visited):
        # Get a value from self or parents. Return _missing on failure.
        # visited contains the ids of the registries searched so far; it
        # guards against cycles in the parent graph and avoids searching a
        # shared ancestor twice.
        myid = _id(self)
        if myid in visited:
            return _missing
        visited.add(myid)
        x = dict.get(self, key, _missing)
        if x is not _missing:
            return x
        for p in self.parents:
            if isinstance(p, Registry):
                x = p._get(key, visited)
            else:
                x = p.get(key, _missing)
            if x is not _missing:
                return x
        return _missing

    def get(self, key, default=None):
        """Return the value for key, searching parents, or default if absent."""
        x = self._get(key, set())
        if x is _missing:
            return default
        return x

"""
Yet another Protocol implementation. This one is pretty similar to most of the
other recent ones except that it uses Registries.
"""

class Protocol(object):
    """A callable dispatch table whose registry chains to its parents'."""

    def __init__(self, *parents):
        # Each parent contributes its registry, so lookups that miss here
        # fall through to the parents in the order given.
        parent_registries = [parent.registry for parent in parents]
        self.registry = Registry(*parent_registries)

    def register(self, *args):
        """Return a decorator that registers its function under the given key.

        A single argument is used as the key itself; several arguments are
        used together as a tuple key. The decorator returns the adapter it
        registered.
        """
        key = args
        if len(key) == 1:
            key = key[0]

        def helper(adapter):
            # NOTE(review): null_adapter is not defined anywhere in the
            # visible source, so registering None would raise NameError --
            # confirm whether a definition exists elsewhere.
            if adapter is None:
                adapter = null_adapter
            self.registry[key] = adapter
            return adapter

        return helper

    def signatures(self, arg):
        """Return the candidate registry keys for a call, best match first."""
        # Default behaviour dispatches on the type of the first argument.
        argtype = _type(arg)
        return argtype.__mro__

    def default_adapter(self, *args):
        """Fallback used when no adapter is registered: always raises TypeError."""
        argnames = _commajoin(x.__class__.__name__ for x in args)
        protocolname = self.__class__.__name__
        raise TypeError("Can't adapt <%s> to %s" % (argnames, protocolname))

    def find_adapter(self, *args):
        """Return the first registered adapter matching args, else default_adapter."""
        lookup = self.registry.get
        for key in self.signatures(*args):
            found = lookup(key, _missing)
            if found is _missing:
                continue
            return found
        return self.default_adapter

    def __call__(self, *args):
        """Adapt the supplied arguments by calling the adapter found for them."""
        adapter = self.find_adapter(*args)
        return adapter(*args)

"""
Here we have the actual pprint2 stuff.

The first order of business is just defining some marker and helper classes.
Nothing exciting.

"""

import sys

class Marker(object):
    """Wrap a value so that dispatch happens on the wrapper's class."""
    def __init__(self, value):
        self.value = value


# These marker classes are used to tell PrettyPrinter to use the formatter
# for a long object (long in the sense that len(pformat(obj)) is long).
class LongList(Marker):
    """Marks a list to be formatted across several lines."""


class LongTuple(Marker):
    """Marks a tuple to be formatted across several lines."""


class LongSequence(Marker):
    """Marks the items of a list or tuple, without the enclosing brackets."""


class LongDict(Marker):
    """Marks a dict to be formatted across several lines."""


# Maps a container type to the marker class that selects its long formatter.
long_types = {list: LongList, tuple: LongTuple, dict: LongDict}


# More marker classes. Recursive is used, surprise, for recursive objects and
# Generic is used to format objects that are not otherwise formattable.
class Recursive(Marker):
    """Marks an object that was reached again while it was being formatted."""


# Generic is a little different from the other marker classes as it doesn't
# wrap its arguments.
class Generic(object):
    """Registry key for the fallback formatter; never instantiated as a wrapper."""

# This is used to pass context information down to the registered formatters.
class Context(object):
    """Mutable state for one formatting run, handed to every formatter."""
    __slots__ = ['locked', 'width', 'maxlevels', 'indent_per_level',
                'readable', 'recursive', 'adapter_cache',
                'long_adapter_cache', 'formatter']

    def __init__(self,  width, maxlevels, indent_per_level, formatter):
        # Caller-supplied layout settings and the callback that formatters
        # use to format nested objects.
        self.formatter = formatter
        self.indent_per_level = indent_per_level
        self.maxlevels = maxlevels
        self.width = width

        # Result flags: formatters flip these as they discover unreadable
        # or recursive content.
        self.readable = True
        self.recursive = False

        # ids of the containers currently being formatted (recursion guard).
        self.locked = set()

        # Per-run caches of adapters, keyed by object type.
        self.adapter_cache = {}
        self.long_adapter_cache = {}

# Some microoptimizations stolen from the original: module-level aliases for
# frequently used builtins and bound str.join methods.
_id, _len, _type = id, len, type
_commajoin, _join = ', '.join, ''.join

"""

Now we have reached the core of the implementation.

"""

class PrettyPrinter(Protocol):
    """PrettyPrinter is a Protocol that accepts objects and either returns a
       pretty representation of the object or writes the representation to a
       stream if that is supplied.
    """

    # Override signatures so that only superclasses with the same repr are
    # returned. This centralizes a bunch of logic from pprint.py in one place
    # and simplifies things a lot farther down. Also, always yield Generic, so
    # that the default behaviour can be overridden.
    def signatures(self, *args):
        """Yield the registry keys to try for args[0], most specific first.

        Walks the MRO of type(args[0]) and stops at the first class whose
        __repr__ differs from the object's own, then yields Generic. Any
        arguments after the first are ignored.
        """
        objtype = _type(args[0])
        own_repr = objtype.__repr__
        for cls in objtype.__mro__:
            if cls.__repr__ is not own_repr:
                break
            yield cls
        yield Generic

    # Here to ease changing formatting of simple types, like ints for example.
    def register_simple(self, arg):
        """Register a simple converter that takes an obj and returns a string.

        Returns a decorator. The decorated function is wrapped so that it
        fits the full formatter interface (it yields its result as a single
        chunk) and is stored under the key ``arg``. The decorator returns the
        original, unwrapped function.
        """
        def helper(adapter):
            # NOTE(review): null_adapter is not defined anywhere in the
            # visible source, so passing None would raise NameError --
            # confirm whether a definition exists elsewhere.
            if adapter is None:
                adapter = null_adapter
            def func(obj, context, indent, allowance, level):
                yield adapter(obj)
            self.registry[arg] = func
            return adapter
        return helper

    # pprint makes a distinction between objects with long representations,
    # which it splits between lines, and others. This finds the correct
    # formatter for a given type.
    def _find_long_adapter(self, objtype, obj, context):
        """Return the long-form adapter for obj, or None if there is none.

        The result (including None) is cached in context.long_adapter_cache
        under objtype.
        """
        cache = context.long_adapter_cache
        try:
            return cache[objtype]
        except KeyError:
            pass
        adapter = None
        for sig in self.signatures(obj, context):
            longtype = long_types.get(sig, None)
            if longtype:
                try:
                    adapter = self.registry[longtype]
                except KeyError:
                    # A marker class exists but nothing is registered for it
                    # in this printer; keep looking along the signatures.
                    continue
                break
        cache[objtype] = adapter
        return adapter

    # This is the core of the algorithm. It's really quite simple if you look
    # past two complicating factors. First, there's a bunch of caching going
    # on, that's essential for performance reasons. Second, everything is
    # implemented in terms of generators. This allows the adapters to not
    # worry about the whole stream issue.
    def _format(self, obj, context, indent=0, allowance=0, level=0):
        """Yield the chunks of obj's representation.

        The adapter for type(obj) is run first. If the text it produces
        outgrows the space left on the line and a long-form adapter is
        available, the text so far is thrown away and the long-form adapter
        is used instead. Adapters are called with level + 1.
        """
        max_length = context.width - indent - allowance - 1
        objtype = type(obj)
        try:
            adapter = context.adapter_cache[objtype]
        except KeyError:
            adapter = context.adapter_cache[objtype] = self.find_adapter(obj)
        chunkmaker = adapter(obj, context, indent, allowance, level+1)
        rep = ''
        for chunk in chunkmaker:
            rep += chunk
            if (len(rep) > max_length):
                # If rep gets longer than max_length, we try to use a long
                # adapter instead. If that works, we clear rep and break out.
                # Otherwise, we keep rep and still break out, but still with
                # the old chunkmaker.
                longadapter = self._find_long_adapter(objtype, obj, context)
                if longadapter:
                    rep = ''
                    # The abandoned generator may have added obj to
                    # context.locked and will never reach its cleanup; drop
                    # the entry so the long adapter does not see recursion.
                    context.locked.discard(_id(obj))
                    chunkmaker = longadapter(Marker(obj),
                                           context, indent, allowance,
                                           level+1)
                break
        # Yield rep and any remaining chunks.
        yield rep
        for chunk in chunkmaker:
            yield chunk

    # This just does some error checking, sets up the stream, and hands things
    # off to _format.
    def __call__(self, obj, stream=None, indent=1, width=80, depth=None):
        """Format obj; return the text, or write it to stream if one is given.

        indent is the indentation added per nesting level, width the target
        line width and depth the maximum nesting level shown (None for no
        limit). Raises ValueError if indent is negative, or if width or a
        supplied depth is not positive.
        """
        indent = int(indent)
        width = int(width)
        if indent < 0:
            raise ValueError("indent must be >= 0")
        if not (depth is None or depth > 0):
            raise ValueError("depth must be > 0")
        if width <= 0:
            raise ValueError("width must be > 0")
        context = Context(width, depth, indent, self._format)
        if stream is None:
            return _join(self._format(obj, context))
        else:
            for chunk in self._format(obj, context):
                stream.write(chunk)

    # recreate the pprint interface.

    def isrecursive(self, obj):
        """Return True if formatting obj runs into a recursive reference."""
        context = Context(80, None, 1, self._format)
        # The output is discarded; only the flags left on context matter.
        for chunk in self._format(obj, context):
            pass
        return context.recursive

    def isreadable(self, obj):
        """Return True if no formatter flagged obj as unreadable or recursive."""
        context = Context(80, None, 1, self._format)
        # The output is discarded; only the flags left on context matter.
        for chunk in self._format(obj, context):
            pass
        return context.readable and not context.recursive

"""
Now that we have the Protocol object, we define two instances. One (saferepr)
that doesn't do anything special with long lines, and one (pprinter) that
does. pprinter extends saferepr, so any changes to saferepr will be picked up
automagically by pprinter.

With those two in place, it's a simple matter to recreate the pprint.py module
interface. Then we're done except for actually creating and registering all of
the functions.
"""

# saferepr has no parents; pprinter chains to saferepr's registry, so every
# formatter registered on saferepr is also found through pprinter.
saferepr = PrettyPrinter()
pprinter = PrettyPrinter(saferepr)

# Recreate the module interface.
# pformat is the pprinter instance itself: called without a stream it returns
# the formatted string.
pformat = pprinter
# Both predicates are bound to saferepr, not pprinter.
isreadable = saferepr.isreadable
isrecursive = saferepr.isrecursive
def pprint(obj, indent=1, width=80, depth=None, stream=None):
    """Pretty-print obj to stream, followed by a newline.

    stream defaults to sys.stdout, looked up at call time. indent, width and
    depth are passed through to pprinter unchanged.
    """
    out = stream
    if out is None:
        out = sys.stdout
    pprinter(obj, out, indent, width, depth)
    out.write('\n')

"""
Way down here at the bottom is where we define and register all the behaviour
of everything. Don't look too closely at these implementations -- I ripped them
from pprint.py and modified them to fit here without thinking about them too
much. They may have suffered in the transition.
"""

@saferepr.register(Generic)
def _format_generic(obj, context, indent, allowance, level):
    """Fallback formatter: yield repr(obj) as a single chunk.

    A representation that starts with '<' marks the run as unreadable on
    the context.
    """
    text = repr(obj)
    if text[:1] == '<':
        context.readable = False
    yield text

@saferepr.register(Recursive)
def _format_recursive(markerobj, context, indent, allowance, level):
    """Yield a placeholder for a recursive reference and flag the context."""
    target = markerobj.value
    context.recursive = True
    details = (_type(target).__name__, _id(target))
    yield "<Recursion on %s with id=%s>" % details

@saferepr.register(float)
def _format_float(obj, context, indent, allowance, level):
    """Yield str(obj) as a single chunk (str, not repr, is used for floats)."""
    text = str(obj)
    yield text

@saferepr.register(str)
def _format_str(obj, context, indent, allowance, level):
    """Yield a quoted representation of the string obj as a single chunk.

    When the locale module has not been imported, this is just repr(obj).
    Otherwise alphabetic characters are copied through unchanged and every
    other character is escaped.
    """
    if 'locale' not in sys.modules:
        yield repr(obj)
        return
    # Use double quotes only when that avoids escaping: the string contains
    # a single quote and no double quote.
    if "'" in obj and '"' not in obj:
        closure, quotes = '"', {'"': '\\"'}
    else:
        closure, quotes = "'", {"'": "\\'"}
    pieces = []
    for char in obj:
        if not char.isalpha():
            # The quote character gets its backslash form; anything else
            # uses its repr with the surrounding quotes stripped.
            char = quotes.get(char, repr(char)[1:-1])
        pieces.append(char)
    yield closure + _join(pieces) + closure

@saferepr.register(dict)
def _format_dict(obj, context, indent, allowance, level):
    """Yield the chunks of a one-line dict representation.

    A dict that is already being formatted further up is replaced by the
    Recursive placeholder. Once level exceeds context.maxlevels the contents
    are elided as "...".
    """
    objid = _id(obj)
    if objid in context.locked:
        for chunk in context.formatter(Recursive(obj), context,
                                       indent, allowance, level):
            yield chunk
        return
    context.locked.add(objid)
    yield '{'
    if context.maxlevels and level > context.maxlevels:
        yield "..."
    else:
        formatter = context.formatter
        first = True
        # This should really be sorted, but tests don't expect it.
        for k, v in obj.iteritems():
            if first:
                first = False
            else:
                yield ', '
            # Every item, not only the first, must carry indent, allowance
            # and level down; otherwise nested containers restart at level 0
            # and the depth limit is applied inconsistently.
            for chunk in formatter(k, context, indent, allowance, level):
                yield chunk
            yield ': '
            for chunk in formatter(v, context, indent, allowance, level):
                yield chunk
    yield '}'
    context.locked.remove(objid)

@saferepr.register(list)
@saferepr.register(tuple)
def _format_sequence(obj, context, indent, allowance, level):
    """Yield a one-line list or tuple representation as a single chunk.

    Empty sequences give "[]" or "()", and a one-element tuple keeps its
    trailing comma. Once level exceeds context.maxlevels the contents are
    elided as "...". A sequence that is already being formatted further up
    is replaced by the Recursive placeholder.
    """
    typ = _type(obj)
    if issubclass(typ, list):
        if not obj:
            yield "[]"
            return
        template = "[%s]"
    elif _len(obj) == 1:
        template = "(%s,)"
    else:
        if not obj:
            yield "()"
            return
        template = "(%s)"
    objid = _id(obj)
    if context.maxlevels and level > context.maxlevels:
        yield template % "..."
        return
    if objid in context.locked:
        for chunk in context.formatter(Recursive(obj), context,
                                       indent, allowance, level):
            yield chunk
        return
    context.locked.add(objid)
    formatter = context.formatter
    # The formatter may yield several chunks for one element. Join them
    # into a single component first so that ', ' only ever separates
    # elements and never lands in the middle of one.
    components = [_join(formatter(o, context, indent, allowance, level))
                  for o in obj]
    yield template % _commajoin(components)
    context.locked.remove(objid)

@pprinter.register(LongList)
def _format_longlist(longobj, context, indent, allowance, level):
    """Yield a multi-line list: '[', the LongSequence body, then ']'."""
    yield '['
    body = context.formatter(LongSequence(longobj.value), context,
                             indent, allowance, level)
    for piece in body:
        yield piece
    yield ']'

@pprinter.register(LongTuple)
def _format_longtuple(longobj, context, indent, allowance, level):
    """Yield a multi-line tuple: '(', the LongSequence body, then ')'.

    A one-element tuple gets a ',' chunk before the closing parenthesis.
    """
    items = longobj.value
    yield '('
    body = context.formatter(LongSequence(longobj.value), context,
                             indent, allowance, level)
    for piece in body:
        yield piece
    if _len(items) == 1:
        yield ','
    yield ')'

@pprinter.register(LongSequence)
def _format_longsequence(longobj, context, indent, allowance, level):
    """Yield the items of a sequence one per line, without brackets.

    Items after the first are preceded by ',' and a newline plus the new
    indentation. A sequence that is already being formatted further up is
    replaced by the Recursive placeholder.
    """
    seq = longobj.value
    # Pad after the opening bracket so the first item lines up with the
    # following ones when more than one space of indent is used per level.
    if context.indent_per_level > 1:
        yield (context.indent_per_level - 1) * ' '
    if not _len(seq):
        return
    seqid = _id(seq)
    if seqid in context.locked:
        for chunk in context.formatter(Recursive(seq), context,
                                       indent, allowance, level):
            yield chunk
        return
    context.locked.add(seqid)
    render = context.formatter
    indent += context.indent_per_level
    separator = ',\n' + ' ' * indent
    for position, ent in enumerate(seq):
        if position:
            yield separator
        # One column of allowance is reserved for what follows the item.
        for chunk in render(ent, context, indent, allowance + 1, level):
            yield chunk
    context.locked.remove(seqid)

# Could break out into keys and use sequence somehow.
@pprinter.register(LongDict)
def _format_longdict(longobj, context, indent, allowance, level):
    """Yield the chunks of a multi-line dict, one sorted item per line.

    Each value is formatted with an indent that accounts for the width of
    its own key plus the ': ' separator, and with one extra column of
    allowance, so nested containers wrap relative to where the value
    actually starts.
    """
    obj = longobj.value
    yield '{'
    if context.indent_per_level > 1:
        yield (context.indent_per_level - 1) * ' '
    length = _len(obj)
    if length:
        formatter = context.formatter
        objid = _id(obj)
        if objid in context.locked:
            for chunk in formatter(Recursive(obj), context,
                                   indent, allowance, level):
                yield chunk
            # NOTE(review): this early return skips the closing '}' below,
            # which leaves the output unbalanced -- confirm whether that is
            # intended.
            return
        context.locked.add(objid)
        indent += context.indent_per_level
        indentation = ',\n' + ' ' * indent
        first = True
        for key, ent in sorted(obj.items()):
            if first:
                first = False
            else:
                yield indentation
            # Measure this key as it is emitted so the value's indent
            # matches the column where the value really begins.
            valindent = indent
            for chunk in formatter(key, context, indent, allowance, level):
                valindent += _len(chunk)
                yield chunk
            yield ': '
            valindent += 2
            for chunk in formatter(ent, context, valindent, allowance + 1,
                                   level):
                yield chunk
        context.locked.remove(objid)
    yield '}'

def _perfcheck(object=None):
    import time
    if object is None:
        object = [("string", (1, 2), [3, 4], {5: 6, 7: 8})] * 10000
    p = PrettyPrinter()
    t1 = time.time()
    saferepr(object)
    t2 = time.time()
    pformat(object)
    t3 = time.time()
    print "safe_repr:", t2 - t1
    print "pformat:", t3 - t2
#~ _perfcheck()