Parsing an HTML file for links?

Wed May 5 17:50:33 EDT 1999

>>>>> "Zigron" == Zigron  <zigron at jps.net> writes:

    >     What I want to do is go through a HTML file, and spit
    > out a dictionary based on the links, and title of the
    > file. I want a dictionary, I guess, of like,
    > {"text-between-anchor-tags":["Destination1","DestinationN.."]}

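This does what you're looking for.  The dictionary keys are the
text between the anchor start and end tags - <a href=...> and </a>.
Each entry contains the href of that anchor; if the same text appears
in more than one anchor, only the last href is kept.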

Mike

#!/usr/bin/env python
import UserDict
import formatter
import htmllib
import sys
import urllib

class IndexDict(htmllib.HTMLParser, UserDict.UserDict):
    """Parses an HTML file and creates a dictionary of links."""
    def __init__(self, formatter, verbose=0):
        htmllib.HTMLParser.__init__(self, formatter, verbose)
        UserDict.UserDict.__init__(self)

    def anchor_bgn(self, href, name, type): 
        #print 'anchor at ', href,
        self.href = href
        self.save_bgn()

    def anchor_end(self):
        text = self.save_end()
        self.data[text] = self.href

class test:
    url = sys.argv[1]
    index_formatter = formatter.AbstractFormatter(formatter.NullWriter())
    index = IndexDict(index_formatter)
    print 'Reading ', url
    file = urllib.urlopen(url)
    data = file.read()
    file.close()
    index.feed(data)
    index.close()

    keys = index.keys()
    keys.sort()
    for k in keys:
        print k, index[k]

# Script entry point: instantiate the driver class when this file is
# executed directly.
if __name__ == '__main__':
    app = test()