html parsing (htmllib)

Fri Jan 28 07:14:29 EST 2000

* Patrick Tufts <zippy at cs.brandeis.edu>

> I want to grab the text from a set of web pages.  htmllib seems like
> what I want, except that I want to assign the parsed text to a variable
> instead of printing it to stdout or a file.
> 
> What do I need to do?
> 
> And better yet, has someone already done it?

As a matter of fact, I have. Beware of ugly code, but it gives you an
idea. (I don't remember which module I snatched it from; maybe
sgmllib.py or formatter.py.)

## -- code begins --

# Remember this:
import string

from htmllib import *
from formatter import *

class DumbVariableWriter(NullWriter):
    """Formatter writer that collects its output in a list.

    Every piece of text the formatter sends is appended to ``variable``
    (any object with an ``append`` method, typically a list) instead of
    being written to a file.  Join the pieces with ``''.join(variable)``
    to obtain the final text.  Flowing text is word-wrapped at ``maxcol``
    columns.
    """

    def __init__(self, variable, maxcol=72):
        """Store the output list and wrap column, then reset the state.

        variable -- object whose ``append`` method receives output pieces
        maxcol   -- column at which flowing text wraps; also the width of
                    horizontal rules
        """
        self.variable = variable
        self.maxcol = maxcol
        NullWriter.__init__(self)
        self.reset()

    def reset(self):
        """Reset the column counter and the pending-break flag."""
        self.col = 0
        self.atbreak = 0

    def send_paragraph(self, blankline):
        """Append one newline plus ``blankline`` further newlines."""
        self.variable.append('\n' + '\n' * blankline)
        self.col = 0
        self.atbreak = 0

    def send_line_break(self):
        """Append a single newline and return to column 0."""
        self.variable.append('\n')
        self.col = 0
        self.atbreak = 0

    def send_hor_rule(self, *args, **kw):
        """Append a rule of ``maxcol`` dashes on a line of its own.

        Any positional or keyword arguments are accepted and ignored.
        """
        self.variable.append('\n')
        self.variable.append('-' * self.maxcol)
        self.variable.append('\n')
        self.col = 0
        self.atbreak = 0

    def send_literal_data(self, data):
        """Append ``data`` unchanged and update the column counter."""
        self.variable.append(data)
        # Only the text after the last newline contributes to the column.
        i = data.rfind('\n')
        if i >= 0:
            self.col = 0
            data = data[i + 1:]
        # Tabs are expanded only to measure the width; the stored text
        # keeps its original tab characters.
        data = data.expandtabs()
        self.col = self.col + len(data)
        self.atbreak = 0

    def send_flowing_data(self, data):
        """Append ``data`` word by word, wrapping at ``maxcol``.

        Runs of whitespace collapse to a single space or, when the next
        word would reach ``maxcol``, to a newline.  Empty ``data`` is
        ignored.
        """
        if not data:
            return
        # str methods replace the old ``string`` module functions, which
        # no longer exist in Python 3.  isspace() matches exactly what
        # split() treats as a separator.
        atbreak = self.atbreak or data[0].isspace()
        col = self.col
        maxcol = self.maxcol
        append = self.variable.append
        for word in data.split():
            if atbreak:
                if col + len(word) >= maxcol:
                    append('\n')
                    col = 0
                else:
                    append(' ')
                    col = col + 1
            append(word)
            col = col + len(word)
            atbreak = 1
        self.col = col
        # Remember trailing whitespace so the next chunk starts with a
        # separator.
        self.atbreak = data[-1].isspace()

## -- end code --

Usage:

# List that collects the pieces of output text
list_of_output = []
parser = HTMLParser(AbstractFormatter(DumbVariableWriter(list_of_output)))
# Read the page inside a with-block so the file is closed afterwards
with open('test.html') as html_file:
    parser.feed(html_file.read())
# close() makes the parser process any data it is still buffering
parser.close()

# Smash it all together in one string
result = ''.join(list_of_output)
-- 
This goes to eleven