html parsing (htmllib)
Jørn Helge B. Dahl
jorn.h.b.dahl at stud.jbi.hioslo.no
Fri Jan 28 07:14:29 EST 2000
* Patrick Tufts <zippy at cs.brandeis.edu>
> I want to grab the text from a set of web pages. htmllib seems like
> what I want, except that I want to assign the parsed text to a variable
> instead of printing it to stdout or a file.
>
> What do I need to do?
>
> And better yet, has someone already done it?
As a matter of fact, I have. Beware of ugly code, but it should give you
an idea. (I don't remember which module I snatched it from; it may have
been sgmllib.py or formatter.py.)
## -- code begins --
# Remember this:
import string
from htmllib import *
from formatter import *
class DumbVariableWriter(NullWriter):
    """Formatter writer that collects its output in a list.

    Every fragment of rendered text is appended to ``variable`` instead of
    being written to a file; ``''.join(variable)`` yields the final text.
    Flowing text is word-wrapped at ``maxcol`` columns.
    """

    def __init__(self, variable, maxcol=72):
        """Create a writer.

        variable -- object with an ``append`` method (typically a list)
                    that receives each output fragment
        maxcol   -- column at which flowing text wraps; also the width of
                    the rule drawn by ``send_hor_rule``
        """
        self.variable = variable
        self.maxcol = maxcol
        # Explicit base-class call (not super()) so this keeps working
        # where NullWriter is an old-style class.
        NullWriter.__init__(self)
        self.reset()

    def reset(self):
        """Mark the output position as the start of a fresh line."""
        self.col = 0          # current output column
        self.atbreak = False  # true when a word separator is pending

    def send_paragraph(self, blankline):
        """Emit a newline followed by ``blankline`` additional newlines."""
        self.variable.append('\n' + '\n' * blankline)
        self.reset()

    def send_line_break(self):
        """Emit a single newline."""
        self.variable.append('\n')
        self.reset()

    def send_hor_rule(self, *args, **kw):
        """Emit a rule of ``maxcol`` dashes on a line of its own.

        Any positional or keyword arguments are accepted and ignored.
        """
        self.variable.append('\n')
        self.variable.append('-' * self.maxcol)
        self.variable.append('\n')
        self.reset()

    def send_literal_data(self, data):
        """Emit ``data`` unchanged and update the column counter.

        The column is measured from the last newline in ``data`` (if any),
        with tabs expanded, so later flowing text wraps correctly.
        """
        self.variable.append(data)
        last_newline = data.rfind('\n')
        if last_newline >= 0:
            # Only the text after the final newline counts towards the column.
            self.col = 0
            data = data[last_newline + 1:]
        data = data.expandtabs()
        self.col = self.col + len(data)
        self.atbreak = False

    def send_flowing_data(self, data):
        """Emit ``data`` word by word, wrapping lines at ``maxcol``.

        Runs of whitespace collapse to a single space, or to a newline when
        the next word would reach ``maxcol``.  Empty input is ignored.
        """
        if not data:
            return
        # A separator is due if the previous chunk ended in whitespace or
        # this one starts with it.  isspace() matches what split() treats
        # as whitespace, which ``string.whitespace`` does not for Unicode.
        atbreak = self.atbreak or data[0].isspace()
        col = self.col
        maxcol = self.maxcol
        append = self.variable.append
        for word in data.split():
            if atbreak:
                if col + len(word) >= maxcol:
                    append('\n')
                    col = 0
                else:
                    append(' ')
                    col = col + 1
            append(word)
            col = col + len(word)
            atbreak = True
        self.col = col
        # Remember trailing whitespace so the next chunk is separated.
        self.atbreak = data[-1].isspace()
## -- end code --
Usage:
# List that collects the text fragments emitted by the writer
list_of_output = []
parser = HTMLParser(AbstractFormatter(DumbVariableWriter(list_of_output)))
# Read the page inside a ``with`` block so the file is closed promptly
with open('test.html') as html_file:
    parser.feed(html_file.read())
# Force the parser to process any input it is still buffering
parser.close()
# Smash it all together in one string
result = ''.join(list_of_output)
--
This goes to eleven
More information about the Python-list
mailing list