vars between classes
Darrell Gallion
darrell at dorb.com
Sun May 28 02:28:26 EDT 2000
Marc Tardif wrote:
> Indeed this is related to a concrete programming problem. I'm trying to
> write a small class to parse html tables into lists. Of course, I have
> searched Parnassus for such code, but what was available didn't support
> embedded tables. I'm therefore writing my own class to recursively go
> through embedded tables and I think I'm almost there.
This would be a good job for one of the standard parsers.
But I like to write parsers so your problem sounded fun.
This needs a decent regression test, but I've run out of time.
I thought a test that generates random tables and checks the parsed result against them would be cool.
import re
class TagParser:
    """
    Parse a buffer for a fixed list of (possibly nested) tags.

    Given a list of tags, use getTags(buf) to parse buf.
    The result is a flat dictionary describing a tree.
    Each element found has an entry in the dictionary.
    The key is the path from "Root" down to the element; every path
    component is the tag name (spelled as in the tag list) followed by a
    running count for that tag type, e.g. "Root/Table0/tr1/Table1".
    Each key maps to a list holding its data.  For every element the first
    item of the list is a tuple (start, end) of offsets into buf which
    outlines the element's contents; the remaining items are the keys of
    the child elements, in document order.  The "Root" list holds child
    keys only.

    Tags are matched case-insensitively.  Tag names are interpolated into
    the regular expression as-is.
    """

    def __init__(self, tagList):
        """Remember tagList, the tag names to search for (e.g. ["Table", "tr"])."""
        self._tagList = tagList

    def getTokens(self, buf, patList):
        """
        Find every opening and closing tag from patList in buf.

        patList is a list of tags to search for.
        Return a list of (groupIndex, (start, end)) tuples in document
        order.  groupIndex // 2 is the offset of the tag in patList; an
        even groupIndex is an opening tag, an odd one a closing tag.
        (start, end) are the offsets of the tag text itself in buf.
        """
        if not patList:
            # Nothing to search for; the empty alternation would not even
            # be a valid pattern.
            return []
        alternatives = []
        for tag in patList:
            # Two capture groups per tag, in this order, so that the group
            # index encodes both the tag type and open/close.  The opening
            # form also accepts attributes ("<td align=left>").
            alternatives.append(r"(<%s(?:\s[^>]*)?>)" % tag)
            alternatives.append(r"(</%s>)" % tag)
        pat = "(?i)" + "|".join(alternatives)

        tokens = []
        for match in self.findallExt(pat, buf, 0, len(buf)):
            for groupIndex, text in enumerate(match.groups()):
                if text:
                    tokens.append((groupIndex, (match.start(), match.end())))
                    break
        return tokens

    def findallExt(self, pat, buf, start, end):
        """
        Search buf[start:end] for pat.  Return a list of matchObjects
        for all non-overlapping matches, in order.
        """
        # finditer advances past zero-length matches, which a manual
        # "search again from mo.end()" loop would spin on forever.
        return list(re.compile(pat).finditer(buf, start, end))

    def _openTag(self, token, tagType, parentKey, tags, tagCnt):
        """
        A new tag is opening.
        Build its key, register it as a child of parentKey and add a new
        entry to tags.  Return the new key.
        """
        childKey = parentKey + "/%s%s" % (self._tagList[tagType], tagCnt)
        tags[parentKey].append(childKey)
        # Until the closing tag is seen the entry holds the (start, end)
        # of the opening tag itself; only the end is used later.
        tags[childKey] = [token[-1]]
        return childKey

    def getTags(self, buf):
        """
        Parse buf and return the dictionary tree (also kept in self._tags).

        A closing tag closes the element that is currently open, whatever
        its type.  A closing tag seen while no element is open is ignored.
        An element that is never closed keeps the offsets of its opening
        tag as its first list item.
        """
        tokens = self.getTokens(buf, self._tagList)
        self._tags = {'Root': []}
        parentKey = "Root"
        parentStack = []
        tagCnt = [0] * len(self._tagList)

        for token in tokens:
            groupIndex, span = token
            # Floor division: the result is used as a list index.
            tagType = groupIndex // 2
            if groupIndex % 2 == 0:
                # Keep track of parents as we go down
                parentStack.append(parentKey)
                parentKey = self._openTag(token, tagType, parentKey,
                                          self._tags, tagCnt[tagType])
                tagCnt[tagType] = tagCnt[tagType] + 1
            else:
                if not parentStack:
                    # Stray closing tag at top level: there is nothing to
                    # close, and "Root" has no (start, end) entry.
                    continue
                openSpan = self._tags[parentKey][0]
                # Contents run from the end of the opening tag to the
                # start of the closing tag.
                self._tags[parentKey][0] = (openSpan[1], span[0])
                # Restore previous parent
                parentKey = parentStack.pop()
        return self._tags

    def testSimp(self):
        """Parse the file "table.html" and pretty-print the resulting tree."""
        with open("table.html") as f:
            self.getTags(f.read())
        import pprint
        pprint.pprint(self._tags)

    def regress(self):
        """
        Call, in sorted name order, every method defined directly on this
        instance's class whose name starts with "test".
        """
        names = sorted(name for name in self.__class__.__dict__
                       if name.startswith('test'))
        for name in names:
            getattr(self, name)()
# Build a parser for the three table-related tags and run every test*
# method defined on the class.
parser = TagParser(tagList=["Table", "tr", "td"])
parser.regress()
************* output
{'Root': ['Root/Table0'],
'Root/Table0': [(7, 270),
'Root/Table0/tr0',
'Root/Table0/tr1',
'Root/Table0/tr4'],
'Root/Table0/tr0': [(13, 51),
'Root/Table0/tr0/td0',
'Root/Table0/tr0/td1',
'Root/Table0/tr0/td2'],
'Root/Table0/tr0/td0': [(19, 21)],
'Root/Table0/tr0/td1': [(30, 32)],
'Root/Table0/tr0/td2': [(41, 43)],
'Root/Table0/tr1': [(63, 214), 'Root/Table0/tr1/Table1'],
'Root/Table0/tr1/Table1': [(74, 201),
'Root/Table0/tr1/Table1/tr2',
'Root/Table0/tr1/Table1/tr3'],
'Root/Table0/tr1/Table1/tr2': [(83, 130),
'Root/Table0/tr1/Table1/tr2/td3',
'Root/Table0/tr1/Table1/tr2/td4',
'Root/Table0/tr1/Table1/tr2/td5'],
'Root/Table0/tr1/Table1/tr2/td3': [(92, 95)],
'Root/Table0/tr1/Table1/tr2/td4': [(104, 107)],
'Root/Table0/tr1/Table1/tr2/td5': [(116, 119)],
'Root/Table0/tr1/Table1/tr3': [(144, 191),
'Root/Table0/tr1/Table1/tr3/td6',
'Root/Table0/tr1/Table1/tr3/td7',
'Root/Table0/tr1/Table1/tr3/td8'],
'Root/Table0/tr1/Table1/tr3/td6': [(153, 156)],
'Root/Table0/tr1/Table1/tr3/td7': [(165, 168)],
'Root/Table0/tr1/Table1/tr3/td8': [(177, 180)],
'Root/Table0/tr4': [(225, 263),
'Root/Table0/tr4/td9',
'Root/Table0/tr4/td10',
'Root/Table0/tr4/td11'],
'Root/Table0/tr4/td10': [(242, 244)],
'Root/Table0/tr4/td11': [(253, 255)],
'Root/Table0/tr4/td9': [(231, 233)]}
************ input
<table>
<tr> <td>a1</td><td>a2</td><td>a2</td> </tr>
<tr> <table>
<tr><td>aa1</td><td>aa2</td><td>aa2</td></tr>
<tr><td>bb1</td><td>bb2</td><td>bb2</td></tr>
</table>
</tr> <tr> <td>b1</td><td>b2</td><td>b2</td></tr>
</table>
More information about the Python-list
mailing list