Parsing a HTML file for links?
M.A.Miller
miller at uinpluxa.npl.uiuc.edu
Wed May 5 17:50:33 EDT 1999
>>>>> "Zigron" == Zigron <zigron at jps.net> writes:
> What I want to do is go through a HTML file, and spit
> out a dictionary based on the links, and title of the
> file. I want a dictionary, I guess, of like,
> {"text-between-anchor-tags":["Destination1","DestinationN.."]}
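This does what you're looking for. The dictionary keys are the
text between the anchor start and end tags - <a href=...> and </a>.
Each entry holds the href of that anchor; if the same text appears
in more than one anchor, only the last href is kept.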
Mike
#!/usr/bin/env python
import UserDict
import formatter
import htmllib
import sys
import urllib
class IndexDict(htmllib.HTMLParser, UserDict.UserDict):
"""Parses an HTML file and creates a dictionary of links."""
def __init__(self, formatter, verbose=0):
htmllib.HTMLParser.__init__(self, formatter, verbose)
UserDict.UserDict.__init__(self)
def anchor_bgn(self, href, name, type):
#print 'anchor at ', href,
self.href = href
self.save_bgn()
def anchor_end(self):
text = self.save_end()
self.data[text] = self.href
class test:
    """Command-line driver: fetch a URL and print its link index.

    Creating an instance downloads the document at `url`, parses it
    with IndexDict, and prints one "text href" line per link, sorted
    by link text.  The work is done in __init__ rather than in the
    class body so that merely importing this module has no side
    effects.

    Attributes set on the instance:
    url   -- the URL that was read
    data  -- the raw document text
    index -- the populated IndexDict
    """

    def __init__(self, url=None):
        """Read `url`, build the index, and print it.

        url -- document to index; when omitted, the first
               command-line argument (sys.argv[1]) is used.

        Exits with a usage message if no URL is available.
        """
        if url is None:
            if len(sys.argv) < 2:
                sys.exit('usage: %s URL' % sys.argv[0])
            url = sys.argv[1]
        self.url = url

        index_formatter = formatter.AbstractFormatter(formatter.NullWriter())
        self.index = IndexDict(index_formatter)

        # A single parenthesised, pre-formatted string prints the same
        # text whether print is a statement or a function.
        print('Reading  %s' % url)
        stream = urllib.urlopen(url)
        try:
            self.data = stream.read()
        finally:
            # Release the connection even if the read fails.
            stream.close()

        self.index.feed(self.data)
        self.index.close()

        keys = self.index.keys()
        keys.sort()
        for k in keys:
            print('%s %s' % (k, self.index[k]))
# Instantiate the test driver only when this file is executed as a
# script; the guard is skipped when the module is imported.
if __name__ == '__main__':
    app = test()
More information about the Python-list
mailing list