[XML-SIG] Seeking HTML => (DOM) => Python Object Recipes
Jeff Rush
"Jeff Rush" <jrush@summit-research.com>
Thu, 17 Jun 99 07:18:53 -0500
I'm just starting to get into XML and just joined this list. I'm
working on a Python agent-program that visits bank web pages
and fetches checkbook registers, parsing the HTML via the
python-xml-0.5.1 stuff into a DOM tree. When finished, it will
then write the result, as XML conforming to some DTD, into a
digitally-signed/encrypted email message.
What I'm looking for is better extraction of HTML tables. Has
anyone written a good class for that? I've got a crude one,
but am hoping others have done extensive parsing of pages
using XML and developed a toolkit.
-Jeff Rush
----- cut here -----
class ExtractTable(xml.dom.walker.Walker):
    """Flatten an HTML table subtree into a list of rows of cell dicts.

    Constructing an instance immediately walks ``tablenode`` (through the
    ``walk`` method, presumably inherited from ``Walker``) and leaves the
    result in ``self.rows``: one list per TR element, each holding one dict
    per recorded cell with the keys

    * ``'type'``  -- ``'header'`` for TH cells, ``'data'`` for TD cells
    * ``'value'`` -- the text gathered between the cell's start and end
    * ``'link'``  -- ``getAttribute('HREF')`` of the last A element closed
      inside the cell (only present when the cell contains an A element)

    Arguments:
        tablenode -- DOM node to walk; presumably the TABLE element.
        trim      -- when true, strip surrounding whitespace from every
                     text node before appending it to the cell value.
        headings  -- when true, TH cells are recorded; otherwise they are
                     skipped.
        allrows   -- when true, rows without any recorded cell are kept as
                     empty lists; otherwise they are dropped.

    Element names are compared against upper-case 'TR'/'TH'/'TD'/'A', so
    the parser is assumed to report upper-case tag names (TODO confirm).
    Nested tables are not supported: every TR, at any depth, starts a
    fresh row.
    """

    # Replacement text for nodes without a dedicated handler, keyed by node
    # name (presumably entity references such as &nbsp; -- TODO confirm).
    _ENTITY_TEXT = {'nbsp': ' '}

    def __init__(self, tablenode, trim=0, headings=1, allrows=1):
        self.rows = []
        self.row = []
        self.text = ""
        # Dict of the cell currently being filled, or None between cells.
        self._cell = None
        self.nowhitespace = trim
        self.keepheadings = headings
        self.allrows = allrows
        self.walk(tablenode)

    def startElement(self, node):
        """Open a new row on TR, or a new cell on TD / recorded TH."""
        name = node.get_nodeName()
        if name == 'TR':
            self.row = []
        elif name == 'TD' or (name == 'TH' and self.keepheadings):
            self.text = ""
            self._cell = {}
            self.row.append(self._cell)

    def endElement(self, node):
        """Finish the current cell or row, or record a link on the cell."""
        name = node.get_nodeName()
        if name == 'TH' and self.keepheadings:
            self._close_cell('header')
        elif name == 'TD':
            self._close_cell('data')
        elif name == 'A':
            # Only attach the link while a cell is open.  Indexing
            # self.row[-1] unconditionally raised IndexError for anchors
            # outside any cell (e.g. in a CAPTION) and mis-attributed
            # anchors found in skipped TH cells to the previous cell.
            if self._cell is not None:
                self._cell['link'] = node.getAttribute('HREF')
        elif name == 'TR':
            if self.allrows or self.row:
                self.rows.append(self.row)

    def _close_cell(self, kind):
        """Store type and accumulated text on the open cell and close it."""
        cell = self._cell
        if cell is not None:
            cell['type'] = kind
            cell['value'] = self.text
            self._cell = None

    def doText(self, node):
        """Append a text node's data to the text of the current cell."""
        # Leading CR/LF characters are always dropped, even when trim is off.
        text = node.get_data().lstrip('\r\n')
        if self.nowhitespace:
            text = text.strip()
        self.text = self.text + text

    def doComment(self, node):
        """Ignore comments."""
        pass

    def doOtherNode(self, node):
        """Append the replacement text for a known node name, if any."""
        replacement = self._ENTITY_TEXT.get(node.get_nodeName())
        if replacement is not None:
            self.text = self.text + replacement
def ExtractLinks(topnode):
    """Return the HREF value of every A element found under ``topnode``.

    Anchors whose HREF value is falsy (for example an empty string) are
    skipped.  The remaining values are returned as a list, in the order
    ``getElementsByTagName('A')`` yields the anchors.
    """
    hrefs = (anchor.getAttribute('HREF')
             for anchor in topnode.getElementsByTagName('A'))
    return [href for href in hrefs if href]
----- cut here -----