totally confused on sgmllib
iddwb
iddwb at imap1.asu.edu
Thu Apr 26 18:14:46 EDT 2001
Now I'm feeling really stupid about how I understood this language. The
attachment does almost what I want. The problem is that all the other html
tags do not get written back to the result as I thought
..self.result = self.result + fulltag
would do. Only the tag enclosing the body part of the html is written to
the result. The purpose of this script was to extract only the body
portion of an html file -- retaining all the tags within. Well, it's doing
partly that -- but I can't figure out why it's ignoring all the valid tags..
Any advice?
#!/usr/local/bin/python
# from python recipes
import sys, string, sgmllib
class htmlExtractor(sgmllib.SGMLParser):
# htmlentitydefs has html tags defined
from htmlentitydefs import entitydefs
# what tags do I want to retain
valid_tags = ('body', 'a', 'i', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ol', 'ul' )
def __init__(self):
sgmllib.SGMLParser.__init__(self)
self.result = ""
self.TagList = []
def handle_data(self, data):
if data:
self.result = self.result + data
def handle_charref(self, data):
self.result = "%s&#%s;" % (self.result, name)
def handle_entityref(self, name):
if self.entityedefs.has_key(name):
x = ';'
else:
x = ''
self.result = "%s&%s%s" % (self.result, name, x)
def unknown_starttag(self, tag, attrs):
fulltag = ""
if tag in self.valid_tags:
for i, j in attrs:
fulltag=fulltag+i+"="+j+" "
fulltag="<"+tag+" "+fulltag+">"
self.TagList.append(fulltag)
self.result = self.result + fulltag
def unknown_endtag(self, tag):
fulltag = ""
if tag in self.valid_tags:
fulltag = "</" + tag + ">"
self.TagList.append(fulltag)
self.result = "%s</%s>" % (self.result, tag)
def cleanup(self):
""" why? where would I have lost a close tag? """
def getBody(bf):
    """Run the HTML text *bf* through htmlExtractor and return its result.

    The list of tags the extractor kept is printed to standard output
    before the accumulated result string is returned.
    """
    extractor = htmlExtractor()
    extractor.feed(bf)
    extractor.close()
    print(extractor.TagList)
    return extractor.result
if __name__=='__main__':
    # Script entry point: read test.html and run it through getBody().
    fn = 'test.html'
    try:
        fd = open(fn, 'r')
    except IOError:
        # Report the path that could not be opened and exit non-zero.
        print("Error in main, could not open %s" % fn)
        sys.exit(1)
    try:
        rawbuff = fd.read()
    finally:
        # Close the file even if the read fails.
        fd.close()
    mybody = getBody(rawbuff)
    # print(mybody)  # disabled by the author; uncomment to dump the result
David Bear
College of Public Programs/ASU
More information about the Python-list
mailing list