Stripping HTML tags from a string

Fredrik Lundh fredrik at pythonware.com
Wed May 2 15:04:38 EDT 2001


Colin Meeks wrote:
> I know I've seen this somewhere before, but can't find it now that I want it.
> Does anybody know how to strip all HTML tags from a string? I imagine I
> would use a regular expression.

...or you could use the SGML/HTML parser:

import sgmllib, string

class Stripper(sgmllib.SGMLParser):
    def __init__(self):
        self.data = []
        sgmllib.SGMLParser.__init__(self)
    def unknown_starttag(self, tag, attrib):
        self.data.append(" ")
    def unknown_endtag(self, tag):
        self.data.append(" ")
    def handle_data(self, data):
        self.data.append(data)
    def gettext(self):
        text = string.join(self.data, "")
        return string.join(string.split(text)) # normalize whitespace

def strip(text):
    """Return *text* with its markup removed, as produced by ``Stripper``.

    A fresh ``Stripper`` is fed the whole string and closed, and the
    result of its ``gettext()`` is returned.
    """
    parser = Stripper()
    parser.feed(text)
    # close() is called before the collected text is read back.
    parser.close()
    return parser.gettext()

>>> print strip("""
<P>Hello<P><FONT FACE="Arial">This is really cool</FONT> isn't it<BR>The End
""")
Hello This is really cool isn't it The End

Cheers /F

<!-- (the eff-bot guide to) the standard python library:
http://www.pythonware.com/people/fredrik/librarybook.htm
-->





More information about the Python-list mailing list