[Tutor] Re: How do I get text from an HTML document.
Prahlad Vaidyanathan
slime@vsnl.net
Fri, 16 Aug 2002 09:09:37 +0530
Hi,
On Wed, 14 Aug 2002 SA spewed into the ether:
> Hi Everyone-
>
> I have HTML docs that have text between the comment tags:
> <!--Story-->
> Some text here
> <!--Story-->
>
> What would be the simplest way to get this text. The text will also have
> some common html tags mixed in like <p>. So I want to strip all the html
> tags from the text I get also.
[-- snippity --]
This is a modified version of a script I found on the net
sometime back (I think on the Python Cookbook site). It defines
a subclass of the sgmllib.SGMLParser class, and overrides its
unknown_starttag() and unknown_endtag() methods to strip
out unwanted tags. Here goes:
"""
import sgmllib
class StrippingParser(sgmllib.SGMLParser):
# These are the HTML tags that we will leave intact
valid_tags = ('b', 'i', 'p')
from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
def __init__(self):
sgmllib.SGMLParser.__init__(self)
self.result = []
self.endTagList = []
def handle_data(self, data):
if data:
self.result.append(data)
def handle_charref(self, name):
self.result.append("&#%s;" % name)
def handle_entityref(self, name):
x = '' # this breaks unstandard entities that end with ';'
if self.entitydefs.has_key(name):
x = ';'
self.result.append("&%s%s" % (name, x))
def unknown_starttag(self, tag, attrs):
""" Delete all tags except for legal ones """
if tag in self.valid_tags:
self.result.append('<%s' % tag)
for k, v in attrs:
if k[0:2].lower() != 'on' and v[0:10].lower() != 'javascript':
self.result.append(' %s="%s"' % (k, v))
endTag = '</%s>' % tag
self.endTagList.insert(0,endTag)
self.result.append('>')
def unknown_endtag(self, tag):
if tag in self.valid_tags:
self.result.append("</%s>" % tag)
remTag = '</%s>' % tag
self.endTagList.remove(remTag)
def cleanup(self):
""" Append missing closing tags """
for i in self.endTagList :
self.result.append(i)
self.result = "".join(self.result)
def strip(s):
    """Return string s with illegal HTML tags stripped.

    Runs s through a fresh StrippingParser and returns the parser's
    result after cleanup().
    """
    sanitizer = StrippingParser()
    sanitizer.feed(s)
    sanitizer.close()
    # cleanup() appends any closing tags still owed and joins the output.
    sanitizer.cleanup()
    return sanitizer.result
if __name__ == "__main__":
    # Command-line use: strip the HTML file named by the first argument
    # and print the result to standard output.
    import sys

    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE" % sys.argv[0])

    path = sys.argv[1]
    fd = open(path, 'r')
    try:
        # try/finally so the file is closed even if the read fails.
        html = fd.read()
    finally:
        fd.close()
    print(strip(html))
"""
HTH,
pv.
--
Prahlad Vaidyanathan <http://www.symonds.net/~prahladv/>
Children are like cats, they can tell when you don't like them. That's
when they come over and violate your body space.