HTMLParser : Record title tags. More info?
Oleg Broytmann
phd at phd.pp.ru
Fri Oct 19 12:18:47 EDT 2001
On Sat, Oct 20, 2001 at 01:47:48AM +0800, Paul Lim wrote:
> I am a newbie in Python. I hope the guru can advise me.
Welcome!
> I am trying to record the title of links in an HTML document. If the
> opening tag has a title value, I need to record this value. If the title
> does not exist, I need to record the text between the opening <A> and
> closing </A>. Does anyone have any idea how we should do it?
Attached is the example program.
Oleg.
--
Oleg Broytmann http://phd.pp.ru/ phd at phd.pp.ru
Programmers don't die, they just GOSUB without RETURN.
-------------- next part --------------
#! /usr/local/bin/python -O
"""
Excerpt from Parser for Netscape Navigator's bookmarks.html
Written by BroytMann, Jun 1997 - Jun 2001. Copyright (C) 1997-2001 PhiloSoft Design
"""
import sys, string
from sgmllib import SGMLParser
class ATagParser(SGMLParser):
    """Print a title for every ``<A>...</A>`` element fed to the parser.

    For each anchor the value of its ``title`` attribute is printed.  When
    the opening tag carries no ``title`` attribute, the text collected
    between the opening and the closing tag is printed instead.  Entity and
    character references inside that text are kept in their source form
    rather than being decoded.

    Attributes:
        accumulator: text collected since the last ``<A>`` start tag (or
            since the parser was created).
        title: ``title`` attribute of the anchor currently open, or None.
    """

    # Imported at class level on purpose: this binds the table as the class
    # attribute ``entitydefs``, replacing the one inherited from sgmllib.
    from htmlentitydefs import entitydefs

    def __init__(self):
        """Create a parser with an empty text buffer and no pending title."""
        SGMLParser.__init__(self)
        self.accumulator = ""
        self.title = None

    def handle_data(self, data):
        """Append plain character data to the text buffer."""
        if data:
            self.accumulator += data

    def handle_charref(self, name):
        """Append a numeric character reference in its ``&#NNN;`` form.

        ``name`` is the reference without the leading ``&#`` and the
        trailing ``;``, so both have to be put back to rebuild it.
        """
        self.accumulator += "&#%s;" % name

    def handle_entityref(self, name):
        """Append a named entity reference to the text buffer.

        Names found in ``entitydefs`` are closed with a semicolon; unknown
        names are emitted as ``&name`` without one.
        """
        if name in self.entitydefs:
            terminator = ';'
        else:
            terminator = ''
        self.accumulator += "&%s%s" % (name, terminator)

    # Start bookmark
    def start_a(self, attrs):
        """Begin a new anchor: clear the buffer and pick up its title.

        ``attrs`` is a sequence of ``(name, value)`` pairs for the tag.
        """
        self.accumulator = ''
        # Forget any title left over from an earlier <A> that was never
        # closed, so it cannot be reported for this anchor.
        self.title = None
        for attrname, value in attrs:
            if attrname == 'title':
                self.title = value.strip()

    def end_a(self):
        """Finish the current anchor and print its title.

        Falls back to the text collected since the start tag when the tag
        had no ``title`` attribute.
        """
        text = self.accumulator
        self.accumulator = ''
        if self.title is None:
            self.title = text
        # Two spaces after the colon reproduce the output of the original
        # ``print "TITLE: ", self.title`` statement.
        print("TITLE:  %s" % (self.title,))
        self.title = None  # reset for next <A> tag
if __name__ == '__main__':
    # Script entry point: parse the HTML file named by the first
    # command-line argument and print one "TITLE:" line per <A> element.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("Usage: %s file.html" % sys.argv[0])

    parser = ATagParser()
    infile = open(sys.argv[1], 'r')
    try:
        # Feed the parser one line at a time so the whole document never
        # has to be held in memory at once.
        while 1:
            line = infile.readline()
            if not line:
                break
            parser.feed(line)
        # Let the parser process any input it is still buffering.
        parser.close()
    finally:
        # Close the file even when parsing raises.
        infile.close()
More information about the Python-list
mailing list