[Tutor] A hacky indexer for Useless Python?
Danny Yoo
dyoo@hkn.eecs.berkeley.edu
Sun, 6 Oct 2002 15:30:52 -0700 (PDT)
Hi Rob,
One of your challenges mentioned writing a small program to make it easier
to search Useless Python. I've started doing a little HTML parsing of the
Useless Python pages, and came up with the following program: it reads
through all 16 pages of Useless, and tries to extract the program entries
from each page.
It's really rough code, but I hope it's useful for Useless. Just don't
change Useless Python's format, ok? *grin*
######
"""A small script to help index Useless Python.
Danny Yoo (dyoo@hkn.eecs.berkeley.edu)
"""
import sgmllib
import urllib
import re
def main(page_count=16):
    """Fetch the Useless Python listing pages and print every entry found.

    page_count is how many numbered listing pages to read
    (uselesspython1.html through uselesspython<page_count>.html).  It
    defaults to 16, the number of pages this script was written against.
    """
    ## fixme: the page count is still supplied by hand; it would be better
    ## to discover it from the site itself.
    code_pages = ["http://www.uselesspython.com/uselesspython%s.html" % index
                  for index in range(1, page_count + 1)]
    entries = []
    for page in code_pages:
        print("processing page %s" % page)
        response = urllib.urlopen(page)
        try:
            entries.extend(parse_entries(response))
        finally:
            # Release the connection even if parsing raises.
            response.close()
    for e in entries:
        print(e)
        print("")
def parse_entries(source_file):
    """Parse one listing page and return the entries found on it.

    source_file is any object whose read() method returns the page's HTML.
    Returns whatever UselessParser.getEntries() reports after the whole
    page has been fed through the parser.
    """
    parser = UselessParser()
    ## perhaps we want to feed only a bit at a time
    parser.feed(source_file.read())
    # feed() can hold back trailing input it has not finished scanning;
    # close() forces the parser to process whatever is still buffered.
    parser.close()
    return parser.getEntries()
class UselessParser(sgmllib.SGMLParser):
    """Collect table rows from a Useless Python listing page.

    The parser keeps a stack of open tag names.  While a 'td' is open it
    records the cell's content (text plus any nested markup, echoed back
    verbatim).  When a 'tr' closes with exactly three recorded cells, the
    row is stored as a UselessEntry(url, submitter, description).
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self._entries = []  # UselessEntry objects collected so far
        self._tags = []     # stack of tag names currently open
        self._newrow = []   # cell strings of the row being read
        self._newtd = []    # fragments of the cell being read

    def handle_data(self, data):
        """Record text, but only while some table cell is open."""
        if 'td' in self._tags:
            self._newtd.append(data)

    def unknown_starttag(self, tag, attrs):
        """Track an opening tag; start a fresh row or cell as needed."""
        if tag == 'tr':
            self._newrow = []
        if tag == 'td':
            self._newtd = []
        # Checked before pushing, so this only fires for markup nested
        # inside an already-open cell: keep that markup as part of the cell.
        if 'td' in self._tags:
            self._newtd.append(self.get_starttag_text())
        self._tags.append(tag)

    def unknown_endtag(self, tag):
        """Track a closing tag; finish the current cell or row as needed.

        An end tag that matches no open tag does not touch the tag stack
        and does not finish a cell or row.
        """
        if tag in self._tags:
            # ugly hack; I need to use XMLParser.  Pop until the tag is
            # gone, which also discards anything left unclosed inside it.
            while tag in self._tags:
                self._tags.pop()
            if tag == 'tr' and self._hasGoodRow():
                self._addEntry()
            if tag == 'td':
                self._newrow.append(''.join(self._newtd))
        # Still inside a cell after popping: echo the end tag into the
        # cell, mirroring what unknown_starttag does for start tags.
        if 'td' in self._tags:
            self._newtd.append("</%s>" % tag)

    def getEntries(self):
        """Return the list of entries collected so far."""
        return self._entries

    def _hasGoodRow(self):
        """Return true when the current row holds exactly three cells."""
        return len(self._newrow) == 3

    def _addEntry(self):
        """Store the current three-cell row as a UselessEntry."""
        self._entries.append(UselessEntry(self._newrow[0],
                                          self._newrow[1],
                                          self._newrow[2]))
class UselessEntry:
    """One listing row: a source link, its submitter, and a description."""

    def __init__(self, url, submitter, description):
        """Store the three cell values exactly as given."""
        self.url, self.submitter, self.description = \
            url, submitter, description

    def __str__(self):
        """Return the three fields as a labelled, three-line string."""
        return "source: %s\nsubmitter: %s\ndescription:%s" % \
            (self.url, self.submitter, self.description)

    def __repr__(self):
        """Return a constructor-style representation for debugging."""
        return "%s(%r, %r, %r)" % (self.__class__.__name__, self.url,
                                   self.submitter, self.description)
if __name__ == '__main__':
    # Run the indexer only when executed as a script, not when imported.
    main()
######
Best of wishes to you!