[XML-SIG] Seeking HTML => (DOM) => Python Object Recipes

Jeff Rush <jrush@summit-research.com>
Thu, 17 Jun 99 07:18:53 -0500


I'm just starting to get into XML and just joined this list.  I'm
working on a Python agent-program that visits bank web pages
and fetches checkbook registers, parsing the HTML via the
python-xml-0.5.1 package into a DOM tree.  When finished, it will
then emit XML conforming to some DTD into a
digitally-signed/encrypted email message.

What I'm looking for is better extraction of HTML tables.  Has
anyone written a good class for that?  I've got a crude one,
but am hoping others have done extensive parsing of pages
using XML and developed a toolkit.

-Jeff Rush

----- cut here -----

class ExtractTable(xml.dom.walker.Walker):
    """Walk an HTML table subtree and collect its cells row by row.

    The walk is started from the constructor.  Afterwards ``self.rows`` is
    a list of rows, each row being a list of cell dictionaries with the
    keys:

      'type'   -- 'header' for a TH cell, 'data' for a TD cell
      'value'  -- the text accumulated while inside the cell
      'link'   -- HREF attribute of the last A element closed inside the
                  cell (present only when the cell contains an A element)

    Nested tables are not handled specially: an inner TR restarts the
    current row.
    """

    # Replacement text for "other" nodes, keyed by node name.
    # NOTE(review): 'nbsp' is presumably the entity reference for a
    # non-breaking space -- confirm against the parser's output.
    _other_node_text = {'nbsp': ' '}

    def __init__(self, tablenode, trim=0, headings=1, allrows=1):
        """Extract the table rooted at `tablenode`.

        tablenode -- DOM node at which the walk starts
        trim      -- if true, strip surrounding whitespace from every text
                     node before it is added to the cell value
        headings  -- if true, TH cells are collected; otherwise ignored
        allrows   -- if true, rows without any collected cell are kept
        """
        self.rows = []
        self.row = []
        # Cell dictionary currently being filled, or None when the walk is
        # not inside a collected TD/TH.
        self.cell = None
        self.text = ""
        self.nowhitespace = trim
        self.keepheadings = headings
        self.allrows = allrows
        self.walk(tablenode)

    def startElement(self, node):
        """Open a new row on TR, or a new cell on TD / (kept) TH."""
        name = node.get_nodeName()
        if name == 'TR':
            self.row = []
            self.cell = None
        elif name == 'TD' or (name == 'TH' and self.keepheadings):
            self.text = ""
            self.cell = {}
            self.row.append(self.cell)

    def endElement(self, node):
        """Finish the current cell or row, or record a link on the cell."""
        name = node.get_nodeName()
        if name == 'TH' and self.keepheadings:
            self._closeCell('header')

        elif name == 'TD':
            self._closeCell('data')

        elif name == 'A':
            # Only attach the link to a cell that is actually open.  A link
            # inside an ignored TH (headings=0) or outside any cell used to
            # hit self.row[-1], raising IndexError on an empty row or
            # tagging an unrelated earlier cell.
            if self.cell is not None:
                self.cell['link'] = node.getAttribute('HREF')

        elif name == 'TR':
            self.cell = None
            if self.allrows or len(self.row) > 0:
                self.rows.append(self.row)

    def _closeCell(self, kind):
        """Store the accumulated text in the open cell and close it."""
        if self.cell is not None:
            self.cell.update({'type': kind, 'value': self.text})
            self.cell = None

    def doText(self, node):
        """Append the node's text to the text collected for the cell."""
        data = node.get_data()
        # Drop leading line breaks only; other leading whitespace is kept
        # unless trimming was requested.
        while len(data) and data[0] in ('\r', '\n'):
            data = data[1:]

        # Trimming is applied per text node, not to the final cell value.
        if self.nowhitespace:
            data = string.strip(data)

        self.text = self.text + data

    def doComment(self, node):
        """Ignore comments."""
        pass

    def doOtherNode(self, node):
        """Append replacement text for a recognised non-text node."""
        replacement = self._other_node_text.get(node.get_nodeName(), None)
        if replacement is not None:
            self.text = self.text + replacement

def ExtractLinks(topnode):
    """Return the HREF values of all A elements below `topnode`.

    Elements are visited in the order getElementsByTagName('A') yields
    them; anchors whose HREF attribute is empty or missing are skipped.
    """
    found = []
    for anchor in topnode.getElementsByTagName('A'):
        href = anchor.getAttribute('HREF')
        if not href:
            continue
        found.append(href)
    return found

----- cut here -----