Extracting data from HTML
Geoff Gerrietts
geoff at gerrietts.net
Fri May 31 15:54:09 EDT 2002
Quoting Hazel (lailian98 at hotmail.com):
> how do i write a program that
> will extract info from an HTML and print
> of a list of TV programmes, its Time, and Duration
> using urllib?
You might look into htmllib -- it has some basic parser classes
that can help you parse the HTML.
You might check out http://www.python9.org/p9-zadka.ppt, which goes
over some of that.
And at the end of this message, I've affixed some (very sloppy, not
very good) Python code that I pounded out the other day to (more or
less) strip markup from a page, so you can see how I went about
prototyping a solution to a (somewhat) similar problem.
--
Geoff Gerrietts <geoff at gerrietts dot net> http://www.gerrietts.net/
"Politics, as a practice, whatever its professions, has always been the
systematic organization of hatreds." --Henry Adams
#!/usr/local/bin/python -i
import htmllib, formatter
class DataStorage:
    """Accumulate text in place and hand it back repeated ``weight`` times.

    Helper class for the parser.  It effectively implements a string that
    changes in place: ``storage + text`` appends to the same object rather
    than building a new one.
    """

    def __init__(self, weight=2):
        """Create an empty buffer.

        weight -- number of copies of the buffered text that purge() returns.
        """
        self.data = ""
        self.count = 0  # set here only; nothing in this class updates it
        self.weight = weight

    def __add__(self, other):
        """Append ``str(other)`` to the buffer and return this same object.

        The routine just appends; the text is cleaned up later.
        """
        self.data = self.data + str(other)
        return self

    def purge(self):
        """Empty the buffer and return its text repeated ``weight`` times.

        The copies are separated by single spaces.
        """
        copies = [self.data] * self.weight
        self.data = ""
        # str.join replaces string.join(): the ``string`` module is not among
        # the visible imports, so the old call raised NameError.
        return " ".join(copies)
class HTMLMunger(htmllib.HTMLParser):
    """HTML parser that strips markup and repeats weighted text.

    Text inside <h1>-<h6>, <i> and <b> is collected in its own DataStorage
    and, when the element closes, handed to handle_data() repeated
    HEADING_WT / EMPH_WT times.  extract() returns the page title repeated
    TITLE_WT times followed by the text gathered in ``plaindata``.

    NOTE(review): this relies on htmllib.HTMLParser appending character
    data to ``self.savedata`` whenever that attribute is not None, and on
    it setting ``self.title`` -- confirm against the htmllib source.
    """

    TITLE_WT = 5      # copies of the title emitted by extract()
    HEADING_WT = 3    # copies of text found inside <h1>..<h6>
    EMPH_WT = 2       # copies of text found inside <i> and <b>

    def __init__(self):
        """Set up the parser with a null formatter and an empty text buffer."""
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.plaindata = DataStorage()
        self.storagestack = []

    def start_body(self, attrs):
        """Make ``plaindata`` the current save buffer once <body> opens."""
        self.savedata = self.plaindata

    def push_storage(self, stor):
        """Remember the current save buffer and switch to ``stor``."""
        self.storagestack.append(self.savedata)
        self.savedata = stor

    def pop_storage(self):
        """Restore the previous save buffer and feed it the weighted text."""
        dat = self.savedata.purge()
        self.savedata = self.storagestack.pop()
        # Goes through handle_data() so the repeated text reaches whatever
        # buffer (or formatter) was active before the element opened.
        self.handle_data(dat)

    def start_h1(self, attrs):
        """Begin collecting heading text in a HEADING_WT-weighted buffer."""
        self.push_storage(DataStorage(self.HEADING_WT))

    start_h2 = start_h3 = start_h4 = start_h5 = start_h6 = start_h1

    def end_h1(self):
        """Finish a heading: flush its weighted text to the outer buffer."""
        self.pop_storage()

    end_h2 = end_h3 = end_h4 = end_h5 = end_h6 = end_h1

    def start_i(self, attrs):
        """Begin collecting emphasised text in an EMPH_WT-weighted buffer."""
        self.push_storage(DataStorage(self.EMPH_WT))

    start_b = start_i

    def end_i(self):
        """Finish emphasised text: flush it to the outer buffer."""
        self.pop_storage()

    end_b = end_i

    def anchor_end(self):
        """Close an anchor without emitting anything for it."""
        # prevent the link number from showing up
        self.anchor = None

    def extract(self):
        """Return the title repeated TITLE_WT times, then the body text.

        The pieces are joined with single spaces.
        """
        # A page without <title> would otherwise put None into the join.
        title = self.title or ""
        # Join the buffered text, not the DataStorage object itself: joining
        # the instance raised TypeError.  str.join also avoids the ``string``
        # module, which is not among the visible imports.
        pieces = [title] * self.TITLE_WT + [self.plaindata.data]
        return " ".join(pieces)
class TextMunger:
    """Pass-through handler: returns exactly the text it was fed.

    Offers the same feed()/extract() pair that DocFetcher.get_url calls on
    its handlers, so plain text needs no special casing there.
    """

    def __init__(self):
        """Start with an empty buffer."""
        self.data = ''

    def feed(self, data):
        """Append ``data`` to the buffered text."""
        self.data = self.data + data

    def extract(self):
        """Return everything fed so far, unchanged."""
        return self.data
class DocFetcherException(Exception):
    """Raised by DocFetcher.get_url when no handler fits the content type."""
class DocFetcher:
    """Fetch a URL and reduce it to text with a per-content-type handler."""

    # Maps a media type to a class providing feed(data) and extract().
    handlers = {
        'text/html': HTMLMunger,
        'text/plain': TextMunger
    }

    def get_url(self, url):
        """Return the extracted text of the document at ``url``.

        Raises DocFetcherException when no handler is registered for the
        document's content type (including when the header is absent).
        """
        # Local import: the file otherwise imports urllib only inside its
        # ``__main__`` section, so the name is missing when used as a module.
        import urllib

        url_obj = urllib.urlopen(url)
        try:
            header = url_obj.info().get('Content-Type') or ''
            # Servers commonly send "text/html; charset=..."; match on the
            # bare media type so such responses still find their handler.
            ct = header.split(';')[0].strip().lower()
            h = self.handlers.get(ct)
            if not h:
                raise DocFetcherException(
                    "no handler for [%s] type [%s]" % (url, header))
            dp = h()
            dp.feed(url_obj.read())
        finally:
            # Release the connection whether or not a handler was found.
            url_obj.close()
        return dp.extract()
if __name__ == '__main__':
    # Demo: fetch one page, parse it, and report how much text was kept.
    # Everything stays at module level on purpose: the shebang runs the
    # script with "python -i", so ``pm`` and ``dat`` remain available at the
    # interactive prompt afterwards.
    import urllib
    pm = HTMLMunger()
    print("Retrieving")
    page = urllib.urlopen("http://www.yahoo.com/")
    try:
        dat = page.read()
    finally:
        page.close()
    print("Parsing")
    pm.feed(dat)
    # "%s %d" reproduces the output of the two-item print statement.
    print("%s %d" % ("Plain data: ", len(pm.plaindata.data)))
    # The former "Emph. data" / "Head. data" lines read pm.emphdata and
    # pm.headerdata, attributes HTMLMunger never defines (AttributeError).
    # Emphasised and heading text is fed back into plaindata instead.
    # Hint for the interactive session: the expression to inspect next.
    print("pm.plaindata.data")
More information about the Python-list mailing list