totally confused on sgmllib

iddwb iddwb at imap1.asu.edu
Thu Apr 26 18:14:46 EDT 2001


Now I'm feeling really stupid about how I understood this language.  The
attachment does almost what I want.  The problem is that all the other html
tags do not get written back to the result as I thought

..self.result = self.result + fulltag

would do.  Only the tag enclosing the body part of the html is written to
the result.  The purpose of this script was to extract only the body
portion of an html file -- retaining all the tags within.  Well, it's doing
partly that -- but I can't figure out why it's ignoring all the valid tags..

Any advice?

#!/usr/local/bin/python
# from Python recipes

import sys, string, sgmllib

class htmlExtractor(sgmllib.SGMLParser):
    
    # htmlentitydefs has html tags defined
    from htmlentitydefs import entitydefs
    # what tags do I want to retain
    valid_tags = ('body', 'a', 'i', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ol', 'ul' )
    
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.result = ""
        self.TagList = []
        
    def handle_data(self, data):
        if data:
            self.result = self.result + data
            
    def handle_charref(self, data):
        self.result = "%s&#%s;" % (self.result, name)
        
    def handle_entityref(self, name):
        if self.entityedefs.has_key(name):
            x = ';'
        else:
            x = ''
        self.result = "%s&%s%s" % (self.result, name, x)
        
    def unknown_starttag(self, tag, attrs):
        fulltag = ""
        if tag in self.valid_tags:
            for i, j in attrs:
                fulltag=fulltag+i+"="+j+" "
            fulltag="<"+tag+" "+fulltag+">"
            self.TagList.append(fulltag)
        self.result = self.result + fulltag

    def unknown_endtag(self, tag):
        fulltag = ""
        if tag in self.valid_tags:
            fulltag = "</" + tag + ">"
            self.TagList.append(fulltag)
            self.result = "%s</%s>" % (self.result, tag)
            
    def cleanup(self):
        """ why? where would I have lost a close tag? """
        
def getBody(bf):
    """Run the string *bf* through an ``htmlExtractor`` and return its result.

    The list of tags the extractor kept is printed to standard output
    before the filtered text is returned.
    """
    extractor = htmlExtractor()
    extractor.feed(bf)
    # close() makes the parser process any input it is still buffering.
    extractor.close()
    kept_tags = extractor.TagList
    print(kept_tags)
    return extractor.result
            

if __name__=='__main__':
    # Read the sample document and run it through the body extractor.
    fn = 'test.html'
    rawbuff = ''
    mybody = ''
    try:
        fd = open(fn, 'r')
    except IOError:
        # Report the file that was actually requested.  The original
        # referenced an undefined name here and never applied the format.
        print("Error in main, could not open %s" % fn)
        sys.exit(1)
    try:
        rawbuff = fd.read()
    finally:
        # Release the handle even if the read fails.
        fd.close()
    mybody = getBody(rawbuff)
#    print mybody

David Bear
College of Public Programs/ASU




More information about the Python-list mailing list