Is there any simple way to get all HTML tags and remove attribute
Gerhard Häring
gerhard.haering at gmx.de
Tue Jul 2 14:03:32 EDT 2002
* sanjay <sanjay2kind at yahoo.com> [2002-07-02 10:02 -0700]:
> Hi,
>
> I am trying to get all HTML tag and remove color attribute using
> SGMLparser. Can anyone provide simple code ..
This is what I use:
http://cgi.algonet.se/htbin/cgiwrap/ug/show.py?script=sgmlecho.py
Basically just use the attrs_to_remove attribute of SgmlEcho.
Copied here for archival, in case this URL ever goes offline:
from sgmllib import SGMLParser
import string
# Parse through an SGML file and copy it to self.output
# Can be made to remove certain attributes from all tags by
# setting 'attrs_to_remove' to a list of attribute names.
# Suitable base class for all kinds of HTML file tweakers.
class SgmlEcho(SGMLParser):
    """Echo a parsed SGML/HTML document into an in-memory buffer.

    Every start tag, end tag, character/entity reference, text run and
    comment handled by this class is re-serialized and appended to
    ``output_list``; the joined text is available as ``output``.

    Attributes:
        echo: When true, ``write`` stores data; when false, data is
            silently discarded.
        attrs_to_remove: Attribute names dropped from every start tag.
            Names are compared exactly as the parser reports them
            (sgmllib reports attribute names in lower case, so list
            lower-case names here).
        output_list: The collected output fragments, in order.
        output: Read-only, computed on access: the fragments joined
            into one string.
    """

    def __init__(self):
        """Initialize the parser with echoing on and nothing collected."""
        # Explicit base-class call rather than super(): SGMLParser is an
        # old-style class on Python 2, where super() does not work.
        SGMLParser.__init__(self)
        self.echo = 1
        self.attrs_to_remove = []
        self.output_list = []

    def __getattr__(self, name):
        """Expose the collected output as the pseudo-attribute ``output``.

        Only called for names not found through normal lookup.

        Raises:
            AttributeError: For any missing name other than ``output``.
        """
        if name == 'output':
            return ''.join(self.output_list)
        raise AttributeError(name)

    def reset_output(self):
        """Discard everything collected so far."""
        self.output_list = []

    def write(self, data):
        """Append ``data`` to the output unless echoing is turned off."""
        if self.echo:
            self.output_list.append(data)

    def _quote_attr_value(self, value):
        """Return ``value`` wrapped in quotes its content cannot close early."""
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            return "'" + value + "'"
        # Both quote characters occur: keep double quotes and escape
        # the embedded ones.
        return '"' + value.replace('"', '&quot;') + '"'

    def unknown_starttag(self, tag, attrs):
        """Echo a start tag in upper case, omitting unwanted attributes.

        Args:
            tag: The tag name.
            attrs: Sequence of ``(name, value)`` pairs.
        """
        self.write('<' + tag.upper())
        for name, value in attrs:
            if name not in self.attrs_to_remove:
                self.write(' ' + name.upper() + '='
                           + self._quote_attr_value(value))
        self.write('>')

    def unknown_endtag(self, tag):
        """Echo an end tag in upper case."""
        self.write('</' + tag.upper() + '>')

    def handle_charref(self, name):
        """Echo a character reference such as ``&#233;`` unchanged."""
        # The terminating ';' is required to reproduce a well-formed
        # reference, matching what handle_entityref does.
        self.handle_data('&#' + name + ';')

    def handle_entityref(self, name):
        """Echo an entity reference such as ``&amp;`` unchanged."""
        self.handle_data('&' + name + ';')

    def handle_data(self, data):
        """Echo plain character data."""
        self.write(data)

    def handle_comment(self, data):
        """Echo a comment, restoring its ``<!--`` / ``-->`` delimiters."""
        self.write('<!--' + data + '-->')
# Command-line use: echo the file named by the first argument to stdout
# with Netscape-bookmark bookkeeping attributes stripped.
if __name__ == '__main__':
    import sys
    p = SgmlEcho()
    p.attrs_to_remove = ["add_date", "last_visit", "last_modified"]
    fp = open(sys.argv[1])
    try:
        p.feed(fp.read())
    finally:
        # Close the file even if parsing raises.
        fp.close()
    # close() forces the parser to process any data it is still
    # buffering, so it must run before the output is read; otherwise
    # the tail of the document can be missing from what is written.
    p.close()
    sys.stdout.write(p.output)
Gerhard
--
mail: gerhard <at> bigfoot <dot> de registered Linux user #64239
web: http://www.cs.fhm.edu/~ifw00065/ OpenPGP public key id AD24C930
public key fingerprint: 3FCC 8700 3012 0A9E B0C9 3667 814B 9CAA AD24 C930
reduce(lambda x,y:x+y,map(lambda x:chr(ord(x)^42),tuple('zS^BED\nX_FOY\x0b')))
More information about the Python-list
mailing list