HTML table parser

M.A.Miller miller at uinpluxa.npl.uiuc.edu
Wed May 5 18:03:34 EDT 1999


Thanks for all the example htmllib code that people sent my way.
After some reading and looking at examples, I found htmllib to be
perfect for what I wanted.  Along the way I came up with an html
table parser that others may find useful, so here it is.  It is
intended to be inherited along with htmllib.HTMLParser to create
a parser for data tables.

Mike

-- 
Michael A. Miller                                mmiller at jlab.org
  Department of Physics, University of Illinois, Urbana-Champaign



# Boolean constants, derived from comparisons so that they work with
# whatever truth values the running interpreter produces.
FALSE = (1 != 1)
TRUE = not FALSE

class HTMLTableRow:
    """One table row, holding heading cells and data cells separately.

    The first row of a table normally carries only column headings and
    no data.  Later rows carry one (or more) row headings followed by
    their data cells.
    """
    def __init__(self):
        # Each instance gets its own pair of fresh, independent lists.
        self.headings, self.data = [], []


class HTMLTableData:
    """Storage class for table data parsed by TableParser.

    Rows are kept in ``self.data`` in the order they were started.  Every
    row is expected to provide ``headings`` and ``data`` lists (as
    HTMLTableRow does), and every cell is expected to be sliceable
    (normally a string) so it can be truncated when rendered.
    """
    def __init__(self):
        self.data = []

    def new_row(self):
        """Start a new, empty row; subsequent cells are added to it."""
        self.data.append(HTMLTableRow())

    def rows(self):
        """Return the number of rows stored so far."""
        return len(self.data)

    def columns(self):
        """Return the number of data cells in the first row.

        Returns 0 when there are no rows.  Only the first row is
        inspected, so a table whose first row holds nothing but column
        headings reports 0 here.
        """
        if not self.data:
            return 0
        return len(self.data[0].data)

    def last_row(self):
        """Return the most recently started row.

        Raises IndexError if no row has been started yet.
        """
        return self.data[-1]

    def add_data(self, data):
        """Append a data cell to the last row.

        Raises IndexError if no row has been started yet.
        """
        # No running column counter is kept: columns() derives the count
        # from the first row on demand, so there is nothing else to update.
        self.data[-1].data.append(data)

    def add_heading(self, data):
        """Append a heading cell to the last row.

        Raises IndexError if no row has been started yet.
        """
        self.data[-1].headings.append(data)

    def _format_cells(self, cell_format, cells, width):
        """Render cells with cell_format, truncating each to width."""
        return ''.join([cell_format % cell[:width] for cell in cells])

    def __repr__(self):
        """Return a compact text rendering, one line per row.

        A row without data is printed as a line of all its headings.
        Any other row is printed as its first heading (if it has one)
        followed by its data cells.  Cells are truncated to 30
        characters and right-aligned in 30-character columns.
        """
        width = 30
        cell_format = ' %%%ds ' % width
        parts = ['>>> Table <<<\n']
        for row in self.data:
            if row.data:
                # Only the first row heading is shown next to the data.
                cells = list(row.headings[:1]) + list(row.data)
            else:
                cells = row.headings
            parts.append(self._format_cells(cell_format, cells, width))
            parts.append('\n')
        return ''.join(parts)

    __str__ = __repr__

    def full_repr(self):
        """Return a verbose rendering with two lines per row.

        The first line holds every heading of the row, each tagged
        ``(h)-``; the second holds every data cell, each tagged ``(d)-``.
        A line is emitted even when the row has no cells of that kind.
        Cells are truncated to 40 characters.
        """
        width = 40
        heading_format = ' (h)-%%%ds ' % width
        data_format = ' (d)-%%%ds ' % width
        parts = ['>>> Full Table <<<\n']
        for row in self.data:
            parts.append(self._format_cells(heading_format, row.headings, width))
            parts.append('\n')
            parts.append(self._format_cells(data_format, row.data, width))
            parts.append('\n')
        return ''.join(parts)


class TableParser:
    """Tag handlers that collect HTML tables into HTMLTableData objects.

    Every <table> start tag appends a fresh HTMLTableData to
    self.tables.  <tr>, <td> and <th> tags always act on the last table
    in that list, so a <tr>, <td> or <th> tag that appears before any
    <table> tag fails with an IndexError.

    self.finished is set to FALSE on construction and on each <table>
    start tag, and to TRUE on each </table> end tag.

    save_bgn() and save_end() are called on self but are not defined
    here; they must be provided by the class this one is combined with.

    The expected table layout is:

    <tr> <th>  <th> ... <th>    <--- column headings
    <tr> <th>  <td> ... <td>
    <tr> <th>  <td> ... <td>
    <tr> <th>  <td> ... <td>

          ^
          row headings.
    """
    def __init__(self):
        self.tables = []
        self.finished = FALSE

    def current_table(self):
        """Return the most recently started table (IndexError if none)."""
        return self.tables[-1]

    # --- <table> ---------------------------------------------------------

    def start_table(self, attrs):
        """Begin a new table; it becomes the current table."""
        self.finished = FALSE
        self.tables.append(HTMLTableData())

    def end_table(self):
        """Record that the closing </table> tag has been seen."""
        self.finished = TRUE

    # --- <tr> ------------------------------------------------------------

    def start_tr(self, attrs):
        """Open a new row in the current table."""
        table = self.current_table()
        table.new_row()

    def end_tr(self):
        """Nothing to do when a row ends."""
        return None

    # --- <th> ------------------------------------------------------------

    def start_th(self, attrs):
        """Start capturing the text of a heading cell."""
        self.save_bgn()

    def end_th(self):
        """Store the captured text as a heading of the current row."""
        heading_text = self.save_end()
        table = self.current_table()
        table.add_heading(heading_text)

    # --- <td> ------------------------------------------------------------

    def start_td(self, attrs):
        """Start capturing the text of a data cell."""
        self.save_bgn()

    def end_td(self):
        """Store the captured text as a data cell of the current row."""
        cell_text = self.save_end()
        table = self.current_table()
        table.add_data(cell_text)




More information about the Python-list mailing list