HTML table parser
M.A.Miller
miller at uinpluxa.npl.uiuc.edu
Wed May 5 18:03:34 EDT 1999
Thanks for all the example htmllib code that people sent my way.
After some reading and looking at examples, I found htmllib to be
perfect for what I wanted. Along the way I came up with an html
table parser that others may find useful, so here it is. It is
intended to be inherited along with htmllib.HTMLParser to create
a parser for data tables.
Mike
--
Michael A. Miller mmiller at jlab.org
Department of Physics, University of Illinois, Urbana-Champaign
# Boolean constants built from comparisons, so they do not depend on
# builtin True/False being available.
FALSE = (1 != 1)
TRUE = not FALSE
class HTMLTableRow:
    """One table row: a list of heading cells and a list of data cells.

    The first row of a table normally holds only column headings and no
    data.  Every later row holds one (or more) row headings plus its data.
    """

    def __init__(self):
        # Two separate, initially empty lists per instance.
        self.headings, self.data = [], []
class HTMLTableData:
    """Storage class for table data parsed by TableParser.

    ``self.data`` is a list of row objects (created by ``new_row`` as
    ``HTMLTableRow`` instances), each with a ``headings`` list and a
    ``data`` list.  Cell values are expected to be strings: the text
    renderings truncate them with ``[:width]``.
    """

    def __init__(self):
        self.data = []

    def new_row(self):
        """Append a fresh, empty row; later cells are added to it."""
        self.data.append(HTMLTableRow())

    def rows(self):
        """Return the number of rows, heading-only rows included."""
        return len(self.data)

    def columns(self):
        """Return the number of data cells in the first row that has any.

        Heading-only rows (typically the first row of a table) are
        skipped, so a table that starts with a column-heading row does
        not report zero columns.  Returns 0 when no row holds data.
        """
        for row in self.data:
            if row.data:
                return len(row.data)
        return 0

    def last_row(self):
        """Return the most recently added row.

        Raises IndexError if no row has been started yet.
        """
        return self.data[-1]

    def add_data(self, data):
        """Append a data cell to the last row.

        Raises IndexError if no row has been started yet.
        """
        # Nothing else to maintain: columns() derives the column count
        # from the stored rows, so no counter is updated here.
        self.data[-1].data.append(data)

    def add_heading(self, data):
        """Append a heading cell to the last row.

        Raises IndexError if no row has been started yet.
        """
        self.data[-1].headings.append(data)

    def __repr__(self):
        """Return a compact text rendering of the table.

        Heading-only rows show all of their headings.  Rows with data
        show the first row heading (if there is one) followed by the data
        cells.  Each cell is truncated to 30 characters and right-aligned
        in a 30-character field.
        """
        width = 30
        cell = ' %%%ds ' % width
        parts = ['>>> Table <<<\n']
        for row in self.data:
            if row.data:
                # Only the first row heading is shown next to the data.
                cells = list(row.headings[:1])
                cells.extend(row.data)
            else:
                cells = row.headings
            for value in cells:
                # One-element tuple keeps % from unpacking the operand.
                parts.append(cell % (value[:width],))
            parts.append('\n')
        return ''.join(parts)

    __str__ = __repr__

    def full_repr(self):
        """Return a verbose text rendering of the table.

        For every row, one line lists all headings (tagged ``(h)``) and
        the next line lists all data cells (tagged ``(d)``).  Each cell
        is truncated to 40 characters.
        """
        width = 40
        heading_cell = ' (h)-%%%ds ' % width
        data_cell = ' (d)-%%%ds ' % width
        parts = ['>>> Full Table <<<\n']
        for row in self.data:
            for heading in row.headings:
                parts.append(heading_cell % (heading[:width],))
            parts.append('\n')
            for data in row.data:
                parts.append(data_cell % (data[:width],))
            parts.append('\n')
        return ''.join(parts)
class TableParser:
    """Mix-in that collects HTML tables into ``self.tables``.

    Every <table> start tag appends a new HTMLTableData to the tables
    list.  <tr>, <td> and <th> tags are applied to the last table in
    that list, so parsing fails if a <tr>, <td> or <th> tag shows up
    before any <table> tag.

    The handlers call ``self.save_bgn()`` and ``self.save_end()``, which
    are not defined here and must come from the class this one is
    combined with.

    Table structure is like this:

        <tr> <th> <th> ... <th>     <--- column headings
        <tr> <th> <td> ... <td>
        <tr> <th> <td> ... <td>
        <tr> <th> <td> ... <td>
              ^
              row headings.
    """

    def __init__(self):
        self.tables = []
        self.finished = FALSE

    def start_table(self, attrs):
        """Open a new table; it becomes the current one."""
        self.finished = FALSE
        self.tables.append(HTMLTableData())

    def end_table(self):
        """Flag that the table has been closed."""
        self.finished = TRUE

    def start_tr(self, attrs):
        """Start a new row in the current table."""
        table = self.current_table()
        table.new_row()

    def end_tr(self):
        """Nothing to do when a row closes."""
        return None

    def start_th(self, attrs):
        """Begin capturing the text of a heading cell."""
        self.save_bgn()

    def end_th(self):
        """Store the captured text as a heading of the current row."""
        # Fetch the captured text before looking up the table.
        text = self.save_end()
        table = self.current_table()
        table.add_heading(text)

    def start_td(self, attrs):
        """Begin capturing the text of a data cell."""
        self.save_bgn()

    def end_td(self):
        """Store the captured text as data of the current row."""
        # Fetch the captured text before looking up the table.
        text = self.save_end()
        table = self.current_table()
        table.add_data(text)

    def current_table(self):
        """Return the most recently opened table.

        Raises IndexError if no <table> tag has been seen yet.
        """
        return self.tables[-1]
More information about the Python-list
mailing list