[Tutor] A hacky indexer for Useless Python?

Danny Yoo dyoo@hkn.eecs.berkeley.edu
Sun, 6 Oct 2002 15:30:52 -0700 (PDT)


Hi Rob,

One of your challenges mentioned writing a small program to make it easier
to search Useless Python.  I've started doing a little HTML parsing of the
Useless Python pages, and have come up with the following program: it reads
through all 16 pages of Useless, and tries to extract the program entries
from each page.

It's really rough code, but I hope it's useful for Useless.  Just don't
change Useless Python's format, ok?  *grin*


######
"""A small script to help index Useless Python.

Danny Yoo (dyoo@hkn.eecs.berkeley.edu)
"""

import sgmllib
import urllib
import re


def main(page_count=16):
    """Download the Useless Python listing pages and print every entry.

    page_count -- how many listing pages to fetch.  Pages are numbered
                  1 through page_count.  The default of 16 is the number
                  this script used to hard-code.

    Each page is fetched with urllib.urlopen and handed to
    parse_entries(); the collected entries are then printed one by one,
    each followed by a blank line.
    """
    code_pages = [("http://www.uselesspython.com/"
                   + "uselesspython%s.html" % index)
                  for index in range(1, page_count + 1)]
    entries = []
    for page in code_pages:
        # The single-argument, parenthesised print form produces the same
        # output under Python 2's print statement.
        print("processing page %s" % page)
        response = urllib.urlopen(page)
        try:
            entries.extend(parse_entries(response))
        finally:
            # Always release the connection, even if parsing fails, so we
            # do not leave one open handle behind per page.
            response.close()
    for e in entries:
        print(e)
        print("")



def parse_entries(source_file):
    """Parse one listing page and return the entries found on it.

    source_file -- an object with a read() method returning the page's
                   HTML as a string (main() passes the object returned
                   by urllib.urlopen).

    Returns whatever UselessParser.getEntries() reports after the whole
    page has been fed to the parser.
    """
    parser = UselessParser()
    # The page is read and fed in one piece; perhaps we want to feed only
    # a bit at a time instead.
    parser.feed(source_file.read())
    # Tell sgmllib the input is complete so anything it is still holding
    # in its buffer is processed rather than silently dropped.
    parser.close()
    return parser.getEntries()



class UselessParser(sgmllib.SGMLParser):
    """Pull three-cell table rows out of an HTML page.

    The class defines no tag-specific handlers, so all tag handling
    lives in unknown_starttag() / unknown_endtag().  While a <td> is
    open, its character data and the raw text of any tags nested inside
    it are accumulated.  When a </tr> arrives and the current row holds
    exactly three cells, the row becomes a UselessEntry.

    Known limits: nested tables and implicitly closed <td>/<tr> tags are
    not handled.
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self._entries = []   # UselessEntry objects collected so far
        self._tags = []      # names of the currently open tags, innermost last
        self._newrow = []    # finished cell strings of the row being read
        self._newtd = []     # text and markup fragments of the cell being read

    def handle_data(self, data):
        """Record character data, but only while a <td> is open."""
        if 'td' in self._tags:
            self._newtd.append(data)

    def unknown_starttag(self, tag, attrs):
        """Track an opening tag; start a fresh row or cell where needed.

        attrs is ignored: for tags nested in a cell the original start-tag
        text is kept verbatim via get_starttag_text() instead.
        """
        if tag == 'tr':
            self._newrow = []
        if tag == 'td':
            self._newtd = []
        # Checked before pushing, so the <td> itself is not copied into
        # its own cell text -- only tags that open inside it are.
        if 'td' in self._tags:
            self._newtd.append(self.get_starttag_text())
        self._tags.append(tag)

    def unknown_endtag(self, tag):
        """Track a closing tag; finish the current cell or row if it ends one."""
        if tag in self._tags:
            # Close everything down to the innermost open tag of this name
            # (this also discards tags that were never closed explicitly,
            # such as <br>).  Stopping at the innermost match matters: an
            # inner tag that shares its name with an outer open tag must
            # not drag the enclosing <td> off the stack with it.
            while self._tags.pop() != tag:
                pass
        # Row and cell endings are honoured even when the tag was not on
        # the stack, so sloppy markup still produces entries.
        if tag == 'tr' and self._hasGoodRow():
            self._addEntry()
            # Clear the row so a repeated </tr> cannot emit it twice.
            self._newrow = []
        if tag == 'td':
            self._newrow.append(''.join(self._newtd))
        # Still inside a cell after popping: keep the closing markup as
        # part of the cell text.
        if 'td' in self._tags:
            self._newtd.append("</%s>" % tag)

    def getEntries(self):
        """Return the list of entries collected so far."""
        return self._entries

    def _hasGoodRow(self):
        """Return true when the current row has exactly three cells."""
        return len(self._newrow) == 3

    def _addEntry(self):
        """Turn the current three-cell row into a UselessEntry and store it."""
        self._entries.append(UselessEntry(self._newrow[0],
                                          self._newrow[1],
                                          self._newrow[2]))




class UselessEntry:
    """Plain record holding the url, submitter and description of one entry."""

    def __init__(self, url, submitter, description):
        self.url = url
        self.submitter = submitter
        self.description = description

    def __str__(self):
        # Three labelled lines.  Note that "description:" is not followed
        # by a space, unlike the other two labels.
        template = "source: %s\nsubmitter: %s\ndescription:%s"
        fields = (self.url, self.submitter, self.description)
        return template % fields



# Run the indexer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
######


Best wishes to you!