Making sgmllib more liberal

Jeff Bowden jlbgnews at houseofdistraction.com
Thu Aug 26 21:01:50 CEST 2004


I've written a simple class derived from sgmllib.SGMLParser to extract
text from html pages.  So far it's worked pretty well except for a few
cases where I get exceptions.  I've managed to work around these
problems by overriding parse_declaration.

Since parse_declaration is preceded by the comment 

    # Internal -- parse declaration (for use by subclasses).

I am thinking my workaround might possibly stop working with future
versions of sgmllib so I'm looking for a more correct alternative. 
Any suggestions?

Here's my code:

# Matches the next '>' character.  SGML2TextParser.parse_declaration searches
# with it to find the end of a declaration after the base-class parser raises.
_endTag = re.compile(r'>')

class SGML2TextParser(sgmllib.SGMLParser):
    """Extract the character data of an SGML/HTML document.

    Text is written to the file-like object given to the constructor.
    Text that follows a start tag named in ``ignoretags`` is dropped
    until that tag is closed or another start tag is seen.
    """

    def __init__(self, f, ignoretags=None):
        """Create a parser that writes extracted text to ``f``.

        f          -- object with a ``write`` method receiving the text.
        ignoretags -- container of tag names whose text is skipped;
                      defaults to ``['script']``.
        """
        sgmllib.SGMLParser.__init__(self)
        self.f = f
        # Build the default per instance: a list literal in the signature
        # would be shared (and mutable) across every parser created.
        if ignoretags is None:
            ignoretags = ['script']
        self.ignoretags = ignoretags
        # Name of the most recently opened tag ('' when none is active).
        self.tag = ''

    def handle_starttag(self, tag, attrs):
        """Record ``tag`` as the current tag."""
        self.tag = tag

    def unknown_starttag(self, tag, attrs):
        """Record start tags that have no start_/do_ handler.

        SGMLParser only calls handle_starttag for tags with a dedicated
        start_<tag>/do_<tag> method; every other start tag arrives here.
        This class defines no such methods, so without this hook self.tag
        would never change and ignoretags would have no effect.
        """
        self.handle_starttag(tag, attrs)

    def unknown_endtag(self, tag):
        """Forget the current tag once it is closed.

        Without this, text after e.g. ``</script>`` would keep being
        dropped until the next start tag appeared.
        """
        if tag == self.tag:
            self.tag = ''

    def handle_data(self, data):
        """Write ``data`` to the output unless the current tag is ignored."""
        if self.tag not in self.ignoretags:
            self.f.write(data)

    def handle_charref(self, name):
        """Emit the character for a numeric reference such as ``&#65;``.

        Both decimal ("65") and hexadecimal ("x41") forms are accepted;
        references that cannot be converted are silently dropped.
        """
        try:
            if name[:1] in ('x', 'X'):
                n = int(name[1:], 16)
            else:
                n = int(name)
            # Kept inside the try so a ValueError raised while writing is
            # still swallowed (best-effort output).
            self.handle_data(unichr(n))
        except (ValueError, OverflowError):
            # Not a number, or outside the range unichr() supports.
            pass

    # DANGER: overriding internal function
    def parse_declaration(self, i):
        """Parse the declaration at index ``i``, tolerating malformed input.

        Returns whatever the base class returns when it succeeds.  If the
        base class raises, the declaration is skipped instead: the result
        is the index just past the next '>' in the buffered data, or -1
        when no '>' is buffered.
        """
        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
        except Exception:
            # Deliberately broad: any parse failure means "skip this
            # declaration".  Unlike a bare except, this no longer traps
            # SystemExit (nor KeyboardInterrupt on Python >= 2.5).
            match = _endTag.search(self.rawdata, i)
            if match is None:
                return -1
            return match.end(0)

def extractText(html_text):
    """Return the text extracted from the markup in ``html_text``.

    The markup is run through an SGML2TextParser created with its default
    settings, and everything the parser writes is returned as one string.
    """
    s = StringIO.StringIO()
    x = SGML2TextParser(s)
    x.feed(html_text)
    # feed() can leave an incomplete trailing construct buffered; close()
    # forces the parser to process whatever is left so no text is lost.
    x.close()
    return s.getvalue()



More information about the Python-list mailing list