Is there any simple way to get all HTML tags and remove attribute

Gerhard Häring gerhard.haering at gmx.de
Tue Jul 2 14:03:32 EDT 2002


* sanjay <sanjay2kind at yahoo.com> [2002-07-02 10:02 -0700]:
> Hi,
> 
> I am trying to get all HTML tag and remove color attribute using
> SGMLparser.  Can anyone provide simple code ..

This is what I use:
http://cgi.algonet.se/htbin/cgiwrap/ug/show.py?script=sgmlecho.py

Basically just use the attrs_to_remove attribute of SgmlEcho.

Copied here for archival, in case this URL ever goes offline:

from sgmllib import SGMLParser
import string

# Parse through an SGML file and copy it to self.output.
# Can be made to remove certain attributes from all tags by
# setting 'attrs_to_remove' to a list of attribute names
# (matched case-insensitively).
# Suitable base class for all kinds of HTML file tweakers.
class SgmlEcho(SGMLParser):
    # Initialize default values:
    #   echo            -- when false, write() discards its data
    #   attrs_to_remove -- attribute names dropped from every start tag
    #   output_list     -- output fragments collected so far, in order
    def __init__(self):
        SGMLParser.__init__(self)
        self.echo = 1
        self.attrs_to_remove = []
        self.output_list = []

    # Return the collected output camouflaged as a member: 'output' is
    # the concatenation of output_list.  Only called for names that
    # normal attribute lookup did not find; anything else is an error.
    def __getattr__(self, name):
        if name == 'output':
            return ''.join(self.output_list)
        raise AttributeError(name)

    # Reset output (discard everything collected so far)
    def reset_output(self):
        self.output_list = []

    # Save all output, unless turned off via self.echo
    def write(self, data):
        if self.echo:
            self.output_list.append(data)

    # Trap all start tags, check attributes.
    # 'attrs' is a sequence of (name, value) pairs.  Tag and attribute
    # names are written in upper case; attributes listed in
    # attrs_to_remove are skipped.
    def unknown_starttag(self, tag, attrs):
        # Compare in lower case on both sides so that an entry such as
        # "COLOR" matches regardless of the case the name arrives in.
        unwanted = [entry.lower() for entry in self.attrs_to_remove]
        self.write('<' + tag.upper())
        for name, value in attrs:
            if name.lower() in unwanted:
                continue
            # The value is re-emitted inside double quotes, so an embedded
            # double quote must be escaped or it would end the attribute
            # early.  Everything else is passed through unchanged.
            quoted = value.replace('"', '&quot;')
            self.write(' ' + name.upper() + '="' + quoted + '"')
        self.write('>')

    # Trap all end tags
    def unknown_endtag(self, tag):
        self.write('</' + tag.upper() + '>')

    # Take care of character references.
    # 'name' is the reference without its '&#' prefix and ';' terminator;
    # the terminator is restored so that a following digit cannot be
    # absorbed into the reference (e.g. '&#65;5' must not become '&#655').
    def handle_charref(self, name):
        self.handle_data('&#' + name + ';')

    # Take care of entity references (re-emitted as '&name;')
    def handle_entityref(self, name):
        self.handle_data('&' + name + ';')

    # Take care of data
    def handle_data(self, data):
        self.write(data)

    # Take care of comments
    def handle_comment(self, data):
        self.write('<!--' + data + '-->')

if __name__ == '__main__':
    # Command-line demo: read the file named by the first argument, drop
    # the listed attributes from every tag and write the result to stdout.
    import sys
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s FILE\n" % sys.argv[0])
        sys.exit(2)
    p = SgmlEcho()
    p.attrs_to_remove = ["add_date", "last_visit", "last_modified"]
    fp = open(sys.argv[1])
    try:
        p.feed(fp.read())
    finally:
        # Release the file even if reading or parsing fails.
        fp.close()
    # close() makes the parser process whatever it was still buffering, so
    # it has to run before the collected output is read.
    p.close()
    sys.stdout.write(p.output)

Gerhard
-- 
mail:   gerhard <at> bigfoot <dot> de       registered Linux user #64239
web:    http://www.cs.fhm.edu/~ifw00065/    OpenPGP public key id AD24C930
public key fingerprint: 3FCC 8700 3012 0A9E B0C9  3667 814B 9CAA AD24 C930
reduce(lambda x,y:x+y,map(lambda x:chr(ord(x)^42),tuple('zS^BED\nX_FOY\x0b')))





More information about the Python-list mailing list