HTML parser documentation

Alex alex at somewhere.round.here
Mon Feb 28 16:29:52 EST 2000


> I want to write a program to extract info out of web sites, process the
> info, and then republish it on my web page.

Hi.  Here is something I have used to do that sort of thing.  Check out
the SGMLParser class in the sgmllib module:

http://www.python.org/doc/current/lib/module-sgmllib.html

Alex.

from sgmllib import SGMLParser

class Tag:
    """Base wrapper around the attribute list of one parsed tag.

    `attributes` is a sequence of pairs; the first item of each pair is
    the attribute name and the second item is its value.
    """

    def __init__(self, attributes):
        """Remember the attribute pairs of the tag."""
        self.attributes = attributes

    def select_attribute(self, attribute_name):
        """Return the value of the one attribute called `attribute_name`.

        An assertion fails unless the tag carries that attribute exactly
        once.
        """
        matches = self.attribute_instances(attribute_name)
        assert len(matches) == 1, (
            'expected exactly one %r attribute, found %d'
            % (attribute_name, len(matches)))
        return matches[0][1]

    def attribute_instances(self, attribute_name):
        """Return a list of every attribute pair named `attribute_name`.

        The list is empty (and therefore false) when the tag has no such
        attribute.
        """
        # A list comprehension always yields a real list, so callers can
        # rely on len(), indexing and truth-testing even where filter()
        # returns a lazy iterator.
        return [pair for pair in self.attributes
                if pair[0] == attribute_name]

        
class Form(Tag):
    """A form tag: its action and method plus the inputs found inside it."""

    def __init__(self, attributes):
        """Pull `action` and `method` out of the attribute pairs.

        Each of the two becomes an instance attribute holding the
        attribute's value, or None when the tag does not carry it.
        `input_entries` starts out as an empty dictionary.
        """
        Tag.__init__(self, attributes)
        for wanted in ('action', 'method'):
            value = None
            if self.attribute_instances(wanted):
                value = self.select_attribute(wanted)
            setattr(self, wanted, value)
        self.input_entries = {}

    def __str__(self):
        """Return a two-line summary showing the action and the method."""
        lines = [
            'Action: %s\n' % self.action,
            'method: %s\n' % self.method,
        ]
        return ''.join(lines)
    

class Input(Tag):
    """An input tag, exposing its name (if any) and its value."""

    def __init__(self, attributes):
        """Extract `name` and `value` from the attribute pairs.

        `name` is only set when the tag has a name attribute, so callers
        can test for it with hasattr().  `value` falls back to the empty
        string when the tag has no value attribute.
        """
        # Tag.__init__ stores the attribute pairs on self.attributes.
        Tag.__init__(self, attributes)
        if self.attribute_instances('name'):
            self.name = self.select_attribute('name')
        if self.attribute_instances('value'):
            self.value = self.select_attribute('value')
        else:
            self.value = ''


class HTMLForm_reader(SGMLParser):
    """Parser that collects the forms of a page and their named inputs.

    After feeding a document, `forms` holds one Form per form start tag
    seen, in order; each Form's `input_entries` maps input names to
    their values.
    """

    def __init__(self):
        """Start with no forms collected and no form currently open."""
        self.forms = []
        self.currently_in_form_p = 0
        SGMLParser.__init__(self)

    def start_form(self, attrs):
        """Record a new Form and note that we are now inside it."""
        opened = Form(attrs)
        self.forms.append(opened)
        self.currently_in_form_p = 1

    def end_form(self):
        """Note that the current form has been closed."""
        self.currently_in_form_p = 0

    def start_input(self, attrs):
        """Store a named input's value on the most recent form.

        Inputs are asserted to appear inside a form; inputs without a
        name attribute are not recorded.
        """
        assert self.currently_in_form_p
        entry = Input(attrs)
        if not hasattr(entry, 'name'):
            return
        current_form = self.forms[-1]
        current_form.input_entries[entry.name] = entry.value

if __name__ == '__main__':
    # Demo: download a page and print a summary of every form found on it.
    from urllib import urlopen

    response = urlopen('http://www.python.org/search')
    try:
        page = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    h = HTMLForm_reader()
    h.feed(page)
    h.close()
    for form in h.forms:
        # Parenthesised single argument: prints the same text as the
        # bare print statement.
        print(form)



More information about the Python-list mailing list