[XML-SIG] XML parser benchmark
Lars Marius Garshol
larsga@ifi.uio.no
08 May 1999 11:59:47 +0200
A quick investigation of the benchmark code shows that roughly 40 % of
its time is spent doing Unicode string counting, using the re module.
Here are some interesting numbers, using the big.xml document:
Original: 108 seconds
Using len to count: 62 seconds
My optimized version: 92 seconds
If we replace my naive pure-Python character counting with one of the
Unicode C modules we might well have a version of statsexpat.py that
is faster than the XML::Parser application.
In case anyone wants to attack this, here's my version:
import sys,os,time,pyexpat
class Elinfo:
def __init__(self, name):
self.name = name
self.count = 0
self.minlev = 0
self.charcount = 0
self.empty = 1
self.ptab = {}
self.ktab = {}
self.atab = {}
class Docinfo:
def __init__(self):
self.root = None
self.eltab = {}
self.elstack = []
self.seenorder = 0
def strt_handle(self, name, attrs):
intern(name)
try:
inf = self.eltab[name]
except KeyError:
inf = Elinfo(name)
inf.seen = self.seenorder
self.seenorder = self.seenorder + 1
self.eltab[name] = inf
inf.count = inf.count + 1
if self.elstack!=[]:
parent = self.elstack[-1]
try:
inf.ptab[parent] = inf.ptab[parent] + 1
except KeyError:
inf.ptab[parent] = 1
pinf = self.eltab[parent]
pinf.empty = 0
try:
pinf.ktab[name] = pinf.ktab[name] + 1
except KeyError:
pinf.ktab[name] = 1
else:
self.root = name
#Attribute handling
for i in range(0, len(attrs), 2):
try:
inf.atab[attrs[i]] = inf.atab[attrs[i]] + 1
except KeyError:
inf.atab[attrs[i]] = 1
self.elstack.append(name)
def end_handle(self, name):
del self.elstack[-1]
def char_handle(self, data):
elname = self.elstack[-1]
inf = self.eltab[elname]
inf.empty = 0
cnt = len(data)
max = len(data)
ix=0
while ix<max:
if data[ix]<"\300":
ix=ix+1
elif data[ix] < "\340":
cnt = cnt - 1
ix=ix+2
elif data[ix] < "\360":
cnt = cnt - 2
ix=ix+3
else:
cnt = cnt - 3
ix=ix+4
inf.charcount = inf.charcount + cnt
def set_minlev(self, name, level):
inf = self.eltab[name]
if inf.minlev == 0 or inf.minlev > level:
newlev = level + 1
inf.minlev = level
for kid in inf.ktab.keys():
self.set_minlev(kid, newlev)
def showtab(label, tab, dosum):
if not len(tab):
return
print '\n ', label + ':'
sum = 0
names = tab.keys()
names.sort()
for name in names:
cnt = tab[name]
sum = sum + cnt
print ' %-16s %5d' % (name, cnt)
if dosum and len(names) > 1:
print ' ====='
print ' %5d' % sum
def elcmp(a, b):
cmpmin = a.minlev - b.minlev
if cmpmin:
return cmpmin
return a.seen - b.seen
doc = Docinfo()
parser = pyexpat.ParserCreate()
parser.StartElementHandler = doc.strt_handle
parser.EndElementHandler = doc.end_handle
parser.CharacterDataHandler = doc.char_handle
docstream = open(sys.argv[1])
start=time.clock()
while 1:
buff = docstream.read(32000)
if not len(buff):
break
status = parser.Parse(buff, 0)
if status == 0:
print parser.ErrorCode, ' at line ', parser.ErrorLineNumber,\
', column ', parser.ErrorColumnNumber, ', byte ',\
parser.ErrorByteIndex
sys.exit(-1)
status = parser.Parse('', 1)
pt=time.clock()-start
if status == 0:
print parser.ErrorCode, ' at line ', parser.ErrorLineNumber,\
', column ', parser.ErrorColumnNumber, ', byte ',\
parser.ErrorByteIndex
sys.exit(-1)
print "TIME: "+`pt`
sys.exit()
doc.set_minlev(doc.root, 0)
sortinf = doc.eltab.values()
sortinf.sort(elcmp)
for elinf in sortinf:
print '\n================'
print elinf.name + ':', elinf.count
if elinf.charcount:
print 'Had', elinf.charcount, 'bytes of character data'
if elinf.empty:
print 'Always empty'
showtab('Parents', elinf.ptab, 0)
showtab('Children', elinf.ktab, 1)
showtab('Attributes', elinf.atab, 0)
--Lars M.