vars between classes

Darrell Gallion darrell at dorb.com
Sun May 28 02:28:26 EDT 2000


Marc Tardif wrote:
> Indeed this is related to a concrete programming problem. I'm trying to
> write a small class to parse html tables into lists. Of course, I have
> searched Parnassus for such code, but what was available didn't support
> embedded tables. I'm therefore writing my own class to recursively go
> through embedded tables and I think I'm almost there.

This would be a good job for one of the standard parsers.
But I like to write parsers so your problem sounded fun.
This needs a decent regression test, but I've run out of time.
I thought a test that generates random tables and checks the results would be cool.

import re

class TagParser:
    """
    Given a list of tags use getTags(buf) to parse buf.
    The result is a dictionary tree.
    Each element found has an entry in the tree.
    The dictionary key is the path of tag names leading to the element,
    starting at 'Root'; each name is suffixed with a running count for
    that tag type (e.g. 'Root/Table0/tr1').
    Each key has a list for its data. The list holds the key of each
    child tag.
    The first element of each list (except for 'Root') is a
    tuple(start, end) which outlines the element's contents: the offsets
    in buf from the end of the opening tag to the start of the closing tag.
    An element that is never closed keeps the span of its opening tag.

    Tags are matched case-insensitively and only in the exact forms
    <tag> and </tag>; tags carrying attributes are not recognised.
    A closing tag closes the most recently opened element, whatever its
    tag type.
    """
    def __init__(self, tagList):
        """
        tagList is the list of tag names (without angle brackets) that
        getTags() looks for.
        """
        self._tagList = tagList

    def getTokens(self, buf, patList):
        """
        patList is a list of tags to search for.
        Return a list of (groupIndex, (start, end)) in order of appearance.
        groupIndex // 2 is the offset of the tag in patList; an even
        groupIndex is an opening tag, an odd one a closing tag.
        (start, end) are the offsets of the tag itself in buf.
        """
        if not patList:
            # Nothing to look for. An empty alternation would match the
            # empty string at every offset and yield no usable token.
            return []

        alternatives = []
        for p in patList:
            alternatives.append("(<%s>)|(</%s>)" % (p, p))
        pat = "(?i)" + "|".join(alternatives)

        tokens = []
        for mo in self.findallExt(pat, buf, 0, len(buf)):
            # Exactly one group took part in the match; its position
            # tells which tag matched and whether it opens or closes.
            for groupIdx, text in enumerate(mo.groups()):
                if text:
                    tokens.append((groupIdx, (mo.start(), mo.end())))
                    break
        return tokens

    def findallExt(self, pat, buf, start, end):
        """
        Search buf[start:end] for pat. Return a list of the
        non-overlapping matchObjects, in order of appearance.
        """
        # finditer also advances past zero-width matches, where a manual
        # search loop restarting at mo.end() would never terminate.
        return list(re.compile(pat).finditer(buf, start, end))

    def getTags(self, buf):
        """
        Parse buf and return the dictionary tree described in the class
        docstring. The tree is also kept in self._tags.
        The text of an element can be recovered with buf[start:end].
        """
        tokens = self.getTokens(buf, self._tagList)
        self._tags = {'Root': []}
        parentKey = "Root"
        parentStack = []
        tagCnt = [0] * len(self._tagList)

        for groupIdx, span in tokens:
            # Floor division keeps tagType an int on every Python version,
            # so it stays usable as a list index.
            tagType = groupIdx // 2
            if groupIdx % 2 == 0:
                # Opening tag: keep track of parents as we go down.
                parentStack.append(parentKey)
                childKey = parentKey + "/%s%s" % (self._tagList[tagType],
                                                  tagCnt[tagType])
                self._tags[parentKey].append(childKey)
                # Holds the span of the opening tag until the element closes.
                self._tags[childKey] = [span]
                parentKey = childKey
                tagCnt[tagType] += 1
            else:
                if not parentStack:
                    # Stray closing tag with nothing open: ignore it rather
                    # than corrupt the 'Root' entry or pop an empty stack.
                    continue
                openSpan = self._tags[parentKey][0]
                # Contents run from the end of the opening tag to the
                # start of this closing tag.
                self._tags[parentKey][0] = (openSpan[1], span[0])
                # Restore previous parent
                parentKey = parentStack.pop()

        return self._tags

    def testSimp(self):
        """
        Parse the file "table.html" from the current directory and
        pretty-print the resulting tree.
        """
        with open("table.html") as f:
            buf = f.read()
        self.getTags(buf)
        import pprint
        pprint.pprint(self._tags)

    def regress(self):
        """
        Call, in alphabetical order, every method defined directly on
        this instance's class whose name starts with 'test'.
        """
        funcs = sorted(name for name in self.__class__.__dict__
                       if name.startswith('test'))

        for f in funcs:
            getattr(self, f)()

# Build a parser for table, row and cell tags, then run every test* method
# of TagParser (testSimp reads "table.html" from the current directory and
# pretty-prints the resulting tag tree).
parser = TagParser(["Table","tr","td"])
parser.regress()

************* output
{'Root': ['Root/Table0'],
 'Root/Table0': [(7, 270),
                 'Root/Table0/tr0',
                 'Root/Table0/tr1',
                 'Root/Table0/tr4'],
 'Root/Table0/tr0': [(13, 51),
                     'Root/Table0/tr0/td0',
                     'Root/Table0/tr0/td1',
                     'Root/Table0/tr0/td2'],
 'Root/Table0/tr0/td0': [(19, 21)],
 'Root/Table0/tr0/td1': [(30, 32)],
 'Root/Table0/tr0/td2': [(41, 43)],
 'Root/Table0/tr1': [(63, 214), 'Root/Table0/tr1/Table1'],
 'Root/Table0/tr1/Table1': [(74, 201),
                            'Root/Table0/tr1/Table1/tr2',
                            'Root/Table0/tr1/Table1/tr3'],
 'Root/Table0/tr1/Table1/tr2': [(83, 130),
                                'Root/Table0/tr1/Table1/tr2/td3',
                                'Root/Table0/tr1/Table1/tr2/td4',
                                'Root/Table0/tr1/Table1/tr2/td5'],
 'Root/Table0/tr1/Table1/tr2/td3': [(92, 95)],
 'Root/Table0/tr1/Table1/tr2/td4': [(104, 107)],
 'Root/Table0/tr1/Table1/tr2/td5': [(116, 119)],
 'Root/Table0/tr1/Table1/tr3': [(144, 191),
                                'Root/Table0/tr1/Table1/tr3/td6',
                                'Root/Table0/tr1/Table1/tr3/td7',
                                'Root/Table0/tr1/Table1/tr3/td8'],
 'Root/Table0/tr1/Table1/tr3/td6': [(153, 156)],
 'Root/Table0/tr1/Table1/tr3/td7': [(165, 168)],
 'Root/Table0/tr1/Table1/tr3/td8': [(177, 180)],
 'Root/Table0/tr4': [(225, 263),
                     'Root/Table0/tr4/td9',
                     'Root/Table0/tr4/td10',
                     'Root/Table0/tr4/td11'],
 'Root/Table0/tr4/td10': [(242, 244)],
 'Root/Table0/tr4/td11': [(253, 255)],
 'Root/Table0/tr4/td9': [(231, 233)]}

************ input
<table>
 <tr> <td>a1</td><td>a2</td><td>a2</td> </tr>
  <tr> <table>
        <tr><td>aa1</td><td>aa2</td><td>aa2</td></tr>
        <tr><td>bb1</td><td>bb2</td><td>bb2</td></tr>
       </table>
</tr> <tr> <td>b1</td><td>b2</td><td>b2</td></tr>
</table>








More information about the Python-list mailing list