Stripping HTML tags from a string
Fredrik Lundh
fredrik at pythonware.com
Wed May 2 15:04:38 EDT 2001
Colin Meeks wrote:
> I know I've seen this somewhere before, but can't find it now I want it.
> Does anybody know how to strip all HTML tags from a string? I imagine I
> would use a regular expression.
...or you could use the SGML/HTML parser:
import sgmllib, string
class Stripper(sgmllib.SGMLParser):
def __init__(self):
self.data = []
sgmllib.SGMLParser.__init__(self)
def unknown_starttag(self, tag, attrib):
self.data.append(" ")
def unknown_endtag(self, tag):
self.data.append(" ")
def handle_data(self, data):
self.data.append(data)
def gettext(self):
text = string.join(self.data, "")
return string.join(string.split(text)) # normalize whitespace
def strip(text):
    """Return text with all markup tags removed.

    Runs the whole string through a fresh Stripper and returns what
    its gettext() method reports once parsing has been closed.
    """
    parser = Stripper()
    parser.feed(text)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.gettext()
>>> print strip("""
<P>Hello<P><FONT FACE="Arial">This is really cool</FONT> isn't it<BR>The End
""")
Hello This is really cool isn't it The End
Cheers /F
<!-- (the eff-bot guide to) the standard python library:
http://www.pythonware.com/people/fredrik/librarybook.htm
-->
More information about the Python-list
mailing list