HTMLPrinter

Paramjit Oberoi p_s_oberoi at hotmail.com
Sat Mar 27 17:05:57 EST 2004


Every time I have used the HTMLParser class in the standard library, it has
been to modify HTML in some way and write it back out.  It would be very
convenient if the standard library had an HTMLPrinter class, defined as
follows:

-------------------------------------------------
import sys
from xml.sax import saxutils
from HTMLParser import HTMLParser

class HTMLPrinter(HTMLParser):
    """Parse HTML and write an equivalent document to a file-like object.

    Each handler re-serializes the event it receives, so feeding a document
    through an unmodified HTMLPrinter writes the same markup back out.
    Subclasses override individual handlers to change the output and
    delegate to the methods here for everything they leave alone.

    Empty-element tags such as <br/> get no special treatment: the inherited
    handle_startendtag() reports them as a start tag followed by an end tag,
    so subclasses only need to override handle_starttag/handle_endtag.
    """

    # Elements whose content is raw text rather than markup.  Escaping it
    # would corrupt the script or style sheet (e.g. "a < b" -> "a &lt; b").
    RAW_TEXT_ELEMENTS = ('script', 'style')

    # Name of the raw-text element currently open, or None.  The class-level
    # default keeps handle_data() usable even if reset() has not run yet.
    _raw_text_tag = None

    def __init__(self, outfile=None):
        """Create a printer that writes to outfile (sys.stdout when None)."""
        HTMLParser.__init__(self)
        if outfile is None:
            self.outfile = sys.stdout
        else:
            self.outfile = outfile

    def reset(self):
        """Reset the parser and forget any open raw-text element."""
        HTMLParser.reset(self)
        self._raw_text_tag = None

    def handle_data(self, data):
        """Write text content, escaping it unless inside script/style."""
        if self._raw_text_tag is not None:
            self.outfile.write(data)
        else:
            self.outfile.write(saxutils.escape(data))

    def handle_starttag(self, tag, attrs):
        """Write a start tag with its attributes.

        attrs is a list of (name, value) pairs.  A value of None marks an
        attribute given without a value (e.g. <input disabled>); it is
        written as the bare name because quoteattr() cannot quote None.
        """
        self.outfile.write('<%s' % tag)
        for (name, value) in attrs:
            if value is None:
                self.outfile.write(' %s' % name)
            else:
                self.outfile.write(
                    ' %s=%s' % (name, saxutils.quoteattr(value)))
        self.outfile.write('>')
        if tag in self.RAW_TEXT_ELEMENTS:
            self._raw_text_tag = tag

    def handle_endtag(self, tag):
        """Write an end tag and leave raw-text mode if it closes one."""
        self.outfile.write('</%s>' % tag)
        if tag == self._raw_text_tag:
            self._raw_text_tag = None

    def handle_charref(self, name):
        """Write a numeric character reference back as &#name;."""
        self.outfile.write('&#%s;' % name)

    def handle_entityref(self, name):
        """Write a named entity reference back as &name;."""
        self.outfile.write('&%s;' % name)

    # Comments, declarations and processing instructions are written back
    # verbatim inside their delimiters, with no quoting applied.
    # NOTE(review): this assumes the parser passes their text through
    # un-decoded, as the HTMLParser documentation describes -- confirm.

    def handle_comment(self, data):
        """Write a comment back as <!--data-->."""
        self.outfile.write('<!--%s-->' % data)

    def handle_decl(self, decl):
        """Write a declaration (e.g. a doctype) back as <!decl>."""
        self.outfile.write('<!%s>' % decl)

    def handle_pi(self, data):
        """Write a processing instruction back as <?data>."""
        self.outfile.write('<?%s>' % data)
 
-------------------------------------------------

Such a class would make HTML munging much easier.
For instance:

class RemoveBreaks(HTMLPrinter):
    """HTMLPrinter variant that replaces every <br> tag with one space."""

    def handle_starttag(self, tag, attrs):
        """Emit a space in place of <br>; print any other start tag as is."""
        if tag == 'br':
            HTMLPrinter.handle_data(self, ' ')
            return
        HTMLPrinter.handle_starttag(self, tag, attrs)

    def handle_endtag(self, tag):
        """Drop </br>; print any other end tag as is."""
        if tag == 'br':
            return
        HTMLPrinter.handle_endtag(self, tag)

The code becomes much clearer since it focuses on the
munging rather than on all the boilerplate HTML printing.



More information about the Python-list mailing list