HTML parser documentation
Alex
alex at somewhere.round.here
Mon Feb 28 16:29:52 EST 2000
> I want to write program to extract info out of web sites, process the
> info, and then republish it on my web page.
Hi. Here is something I have used to do that sort of thing. Check out
the SGMLParser class in the sgmllib module:
http://www.python.org/doc/current/lib/module-sgmllib.html
Alex.
from sgmllib import SGMLParser
class Tag:
    """Base wrapper around the attribute list of one parsed tag.

    ``attributes`` is stored exactly as given.  The lookup methods treat it
    as an iterable of pairs whose first item is the attribute name and whose
    second item is the attribute value.
    """

    def __init__(self, attributes):
        self.attributes = attributes

    def select_attribute(self, attribute_name):
        """Return the value of the one attribute named *attribute_name*.

        Raises AssertionError when the attribute is missing or occurs more
        than once.
        """
        matches = self.attribute_instances(attribute_name)
        # Raised explicitly rather than through ``assert`` so the check is
        # not stripped under ``python -O``; the exception type is kept so
        # existing handlers continue to work.
        if len(matches) != 1:
            raise AssertionError(
                'expected exactly one %r attribute, found %d'
                % (attribute_name, len(matches)))
        return matches[0][1]

    def attribute_instances(self, attribute_name):
        """Return a list of every pair whose name equals *attribute_name*.

        The result is always a real list (empty when nothing matches), so
        callers can rely on ``len()``, indexing and truth testing.
        """
        return [pair for pair in self.attributes if pair[0] == attribute_name]
class Form(Tag):
    """A ``<form>`` tag together with the inputs collected for it.

    ``action`` and ``method`` hold the value of the attribute of the same
    name, or None when the tag does not carry it.  ``input_entries`` starts
    out as an empty dict for callers to fill in.
    """

    def __init__(self, attributes):
        Tag.__init__(self, attributes)
        for key in ('action', 'method'):
            found = None
            if self.attribute_instances(key):
                found = self.select_attribute(key)
            setattr(self, key, found)
        self.input_entries = {}

    def __str__(self):
        """Return a two-line summary of the form's action and method."""
        lines = [
            'Action: %s\n' % self.action,
            'method: %s\n' % self.method,
        ]
        return ''.join(lines)
class Input(Tag):
    """An ``<input>`` tag.

    ``name`` is set only when the tag carries a ``name`` attribute, so
    ``hasattr(obj, 'name')`` tells named inputs from unnamed ones.
    ``value`` is always set and defaults to the empty string when the tag
    has no ``value`` attribute.
    """

    def __init__(self, attributes):
        # Tag.__init__ already stores ``attributes`` on the instance, so it
        # is not assigned a second time here.
        Tag.__init__(self, attributes)

        # select_attribute() enforces that the attribute occurs exactly once.
        if self.attribute_instances('name'):
            self.name = self.select_attribute('name')

        if self.attribute_instances('value'):
            self.value = self.select_attribute('value')
        else:
            self.value = ''
class HTMLForm_reader(SGMLParser):
    """SGMLParser subclass that collects the forms found in the fed markup.

    ``forms`` holds one Form per ``<form>`` start tag, in document order.
    Each Form's ``input_entries`` maps the name of every named ``<input>``
    seen while that form was open to its value; a repeated name overwrites
    the earlier entry.
    """

    def __init__(self):
        self.forms = []
        # Non-zero while the parser is between <form> and </form>.
        self.currently_in_form_p = 0
        SGMLParser.__init__(self)

    def start_form(self, attrs):
        """Record a new Form and mark it as the one currently open."""
        self.forms.append(Form(attrs))
        self.currently_in_form_p = 1

    def end_form(self):
        """Mark that no form is currently open."""
        self.currently_in_form_p = 0

    def start_input(self, attrs):
        """Store a named input on the currently open form.

        Inputs that appear outside any form are skipped instead of aborting
        the parse: there is no open form to attach them to, and recording
        them on ``forms[-1]`` would credit them to an already closed form.
        """
        if not self.currently_in_form_p:
            return
        c = Input(attrs)
        # Unnamed inputs have no key to be stored under.
        if hasattr(c, 'name'):
            self.forms[-1].input_entries[c.name] = c.value
if __name__ == '__main__':
    # Demo: fetch one page and print a summary of every form found on it.
    from urllib import urlopen

    response = urlopen('http://www.python.org/search')
    try:
        page = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    h = HTMLForm_reader()
    h.feed(page)
    h.close()  # flush any markup the parser is still buffering
    for form in h.forms:
        # Parenthesised single argument: same output under the Python 2
        # print statement, and valid as a Python 3 function call.
        print(form)
More information about the Python-list
mailing list